In [1]:
import pandas as pd
import numpy as np

In [2]:
from autogluon.tabular import TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


# Autogluon 기본 사용법
* 1. 데이터 로딩
* 2. 타겟변수 지정
* 3. TabularPredictor 설정, (타겟변수, 모델 성능 지표) - 모델설정
* 4. 훈련(데이터, 제한시간설정, 분석사전설정 지정) - .fit()
* 5. 데이터에서 일부 데이터를 테스트 데이터로 추출 = .sample()
* 6. 분석이 끝난 모델로 테스트 데이터에서 추론 .predict()
* 7. 평가

# 1. 데이터 로딩

In [3]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ADsP/main/salary2.csv")
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   education       48842 non-null  object
 3   education-num   48842 non-null  int64 
 4   marital-status  48842 non-null  object
 5   occupation      46033 non-null  object
 6   relationship    48842 non-null  object
 7   race            48842 non-null  object
 8   sex             48842 non-null  object
 9   capital-gain    48842 non-null  int64 
 10  capital-loss    48842 non-null  int64 
 11  hours-per-week  48842 non-null  int64 
 12  native-country  47985 non-null  object
 13  class           48842 non-null  object
dtypes: int64(5), object(9)
memory usage: 5.2+ MB


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train_data, test_data = train_test_split(data, stratify=data['class'], test_size=0.4, random_state=10)

In [7]:
train_data

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
12689,49,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,35,United-States,<=50K
30011,45,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,44,United-States,>50K
28454,55,Local-gov,HS-grad,9,Married-civ-spouse,Prof-specialty,Other-relative,White,Female,0,2246,40,United-States,>50K
17394,69,Private,HS-grad,9,Married-civ-spouse,Sales,Wife,White,Female,0,0,40,United-States,<=50K
31597,21,Private,Some-college,10,Never-married,Machine-op-inspct,Own-child,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20424,31,Private,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
28686,18,,HS-grad,9,Never-married,,Own-child,White,Female,0,0,40,United-States,<=50K
30381,28,Private,Some-college,10,Never-married,Transport-moving,Not-in-family,White,Male,0,0,50,United-States,<=50K
48693,61,Private,Some-college,10,Divorced,Other-service,Not-in-family,Black,Male,0,0,40,United-States,<=50K


In [8]:
test_data 

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
22332,56,,HS-grad,9,Divorced,,Not-in-family,White,Male,0,0,10,United-States,<=50K
33515,57,Private,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,42,United-States,<=50K
39475,37,State-gov,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,45,United-States,<=50K
11140,19,Private,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,0,0,20,United-States,<=50K
39998,41,State-gov,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43439,21,Private,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,45,United-States,<=50K
3527,28,Self-emp-inc,Bachelors,13,Never-married,Exec-managerial,Own-child,White,Male,0,0,60,United-States,<=50K
46661,27,Private,HS-grad,9,Separated,Machine-op-inspct,Not-in-family,White,Male,0,0,43,United-States,<=50K
28427,38,Private,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,<=50K


# 2. 타겟변수 지정

In [9]:
target_column = "class"

# 3. 제한시간, 검정지표(평가지표, accuracy, rmse, roc_auc) 지정  

In [10]:
# 초 단위로 제한시간 지정  300초는 5분, 
time_limit = 300
# 성능지표(accuracy, roc_auc, root_mean_squared_error, r2, f1, recall, precision, roc_auc, mean_squared_error) 
metric = 'accuracy'

# 4. 모델정의 TablularPredictor()

In [11]:
model = TabularPredictor(label=target_column, eval_metric=metric)

No path specified. Models will be saved in: "AutogluonModels/ag-20241022_002137"


# 5. 모델 훈련 

In [12]:
model.fit(train_data, time_limit=time_limit, presets='medium_quality')

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Mar 29 23:14:13 UTC 2024
CPU Count:          8
Memory Avail:       13.10 GB / 14.61 GB (89.6%)
Disk Space Avail:   0.95 GB / 223.03 GB (0.4%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels/ag-20241022_002137"
Train Data Rows:    29305
Train Data Columns: 13
Label Column:       class
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [' <=50K', ' >50K']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Probl

TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20241022_002137")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fb4d1bb08b0>

# 6. 생성된 모델에 테스트 데이터 넣어 예측하기

In [13]:
pred = model.predict(test_data)

# 7. 모델 성능 평가하기

In [14]:
result = model.evaluate(test_data)
result_df = pd.DataFrame([result], index=[0])
# 여러 모델 성능 비교
leader_board = model.leaderboard(test_data)
# 중요 변수 출력
feature_importance = model.feature_importance(test_data)
best_model_name = model.model_best
# best모델 로딩
best_model = model._trainer.load_model(best_model_name)
best_model_params = best_model.params

Computing feature importance via permutation shuffling for 13 features using 5000 rows with 5 shuffle sets...
	296.26s	= Expected runtime (59.25s per shuffle set)
	68.7s	= Actual runtime (Completed 5 of 5 shuffle sets)


# 8 결과 출력

In [15]:
print("="*20, "result_df", "="*20)
display(result_df)
print()
print("="*20, "leader_board", "="*20)
display(leader_board)
print()
print("="*20, "feature_importance", "="*20)
display(feature_importance)
print()
print("="*20, "best_model_name, params", "="*20)
print("best_model_name: ", best_model_name, "\nparams: ", best_model_params)
print()



Unnamed: 0,accuracy,balanced_accuracy,mcc,roc_auc,f1,precision,recall
0,0.858013,0.754925,0.579951,0.893546,0.652555,0.787247,0.557219





Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.858013,0.8612,accuracy,6.045064,0.686261,9.436073,0.013119,0.001346,0.271034,2,True,7
1,RandomForestGini,0.855044,0.8504,accuracy,1.589078,0.156996,3.004608,1.589078,0.156996,3.004608,1,True,3
2,RandomForestEntr,0.854891,0.8512,accuracy,1.590624,0.193089,3.360249,1.590624,0.193089,3.360249,1,True,4
3,ExtraTreesEntr,0.84967,0.8432,accuracy,1.862982,0.158352,2.614357,1.862982,0.158352,2.614357,1,True,6
4,ExtraTreesGini,0.849209,0.8428,accuracy,1.822742,0.156025,2.695044,1.822742,0.156025,2.695044,1,True,5
5,KNeighborsUnif,0.829708,0.8284,accuracy,0.517208,0.094739,0.059859,0.517208,0.094739,0.059859,1,True,1
6,KNeighborsDist,0.827251,0.8256,accuracy,0.512292,0.084066,0.045279,0.512292,0.084066,0.045279,1,True,2





Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
capital-gain,0.05828,0.003738,2e-06,5,0.065976,0.050584
education-num,0.02756,0.003817,4.3e-05,5,0.035419,0.019701
capital-loss,0.02576,0.003109,2.5e-05,5,0.032162,0.019358
age,0.019,0.002205,2.1e-05,5,0.023539,0.014461
relationship,0.01072,0.001616,6e-05,5,0.014048,0.007392
hours-per-week,0.01032,0.0031,0.00087,5,0.016704,0.003936
marital-status,0.00616,0.001212,0.000171,5,0.008655,0.003665
sex,0.00548,0.002043,0.001942,5,0.009686,0.001274
occupation,0.00512,0.001559,0.000916,5,0.008331,0.001909
workclass,0.00148,0.001628,0.055968,5,0.004833,-0.001873



best_model_name:  WeightedEnsemble_L2 
params:  {'use_orig_features': False, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}



# 함수화하고 분석 간단히 하기

In [16]:
def automl(data, target, time=300, metric=None):
     
    if metric in ("accuracy", "roc_auc", "recall", "precision", "f1"):
        train_data, test_data = train_test_split(data, stratify=data[target], test_size=0.4, random_state=10)
    else:
        train_data, test_data = train_test_split(data, test_size=0.4, random_state=10)
        
    model = TabularPredictor(label=target, eval_metric=metric)
    model.fit(train_data, time_limit=time, presets='medium_quality')
    pred = model.predict(test_data)

    result = model.evaluate(test_data)
    result_df = pd.DataFrame([result], index=[0])
    # 여러 모델 성능 비교
    leader_board = model.leaderboard(test_data)
    # 중요 변수 출력
    feature_importance = model.feature_importance(test_data)
    best_model_name = model.model_best
    # best모델 로딩
    best_model = model._trainer.load_model(best_model_name)
    best_model_params = best_model.params

    print("="*20, "result_df", "="*20)
    display(result_df)
    print()
    print("="*20, "leader_board", "="*20)
    display(leader_board)
    print()
    print("="*20, "feature_importance", "="*20)
    display(feature_importance)
    print()
    print("="*20, "best_model_name, params", "="*20)
    print("best_model_name: ", best_model_name, "\nparams: ", best_model_params)
    print()
    return best_model, result_df, leader_board, feature_importance

In [17]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ablearn/main/Taitanic_train.csv")
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [18]:
automl(data, 'Survived')

No path specified. Models will be saved in: "AutogluonModels/ag-20241022_002332"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Mar 29 23:14:13 UTC 2024
CPU Count:          8
Memory Avail:       12.22 GB / 14.61 GB (83.6%)
Disk Space Avail:   0.05 GB / 223.03 GB (0.0%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels/ag-20241022_002332"
Train Data Rows:    534
Train Data Columns: 11
Label Column:       Survived
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type

	Ensemble Weights: {'RandomForestEntr': 1.0}
	0.8224	 = Validation score   (accuracy)
	0.1s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 6.86s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1653.0 rows/s (107 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20241022_002332")
Computing feature importance via permutation shuffling for 11 features using 357 rows with 5 shuffle sets...
	13.97s	= Expected runtime (2.79s per shuffle set)
	1.8s	= Actual runtime (Completed 5 of 5 shuffle sets)




Unnamed: 0,accuracy,balanced_accuracy,mcc,roc_auc,f1,precision,recall
0,0.806723,0.782154,0.574086,0.857806,0.720648,0.747899,0.695312





Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr,0.806723,0.82243,accuracy,0.183741,0.063972,1.143837,0.183741,0.063972,1.143837,1,True,4
1,WeightedEnsemble_L2,0.806723,0.82243,accuracy,0.189841,0.064731,1.239845,0.0061,0.000759,0.096008,2,True,7
2,RandomForestGini,0.803922,0.813084,accuracy,0.18261,0.108687,1.18812,0.18261,0.108687,1.18812,1,True,3
3,ExtraTreesEntr,0.80112,0.813084,accuracy,0.197948,0.077438,0.90415,0.197948,0.077438,0.90415,1,True,6
4,ExtraTreesGini,0.784314,0.785047,accuracy,0.188187,0.067574,1.03313,0.188187,0.067574,1.03313,1,True,5
5,KNeighborsDist,0.669468,0.551402,accuracy,0.007793,0.003088,0.010517,0.007793,0.003088,0.010517,1,True,2
6,KNeighborsUnif,0.666667,0.579439,accuracy,0.01069,0.051159,0.015734,0.01069,0.051159,0.015734,1,True,1





Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
Name,0.104762,0.014501,4.3e-05,5,0.13462,0.074904
Sex,0.054902,0.014769,0.000572,5,0.085312,0.024492
Pclass,0.009524,0.003758,0.002391,5,0.017262,0.001786
Fare,0.006162,0.008262,0.085339,5,0.023174,-0.010849
SibSp,0.002801,0.005942,0.175651,5,0.015036,-0.009434
Parch,0.00112,0.002505,0.18695,5,0.006279,-0.004038
Embarked,0.0,0.004429,0.5,5,0.009119,-0.009119
Ticket,-0.00056,0.002344,0.689346,5,0.004265,-0.005386
Age,-0.00112,0.00899,0.602853,5,0.01739,-0.019631
PassengerId,-0.003361,0.004155,0.927648,5,0.005193,-0.011916



best_model_name:  WeightedEnsemble_L2 
params:  {'use_orig_features': False, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}



(<autogluon.core.models.ensemble.weighted_ensemble_model.WeightedEnsembleModel at 0x7fb4d1d7d090>,
    accuracy  balanced_accuracy       mcc   roc_auc        f1  precision  \
 0  0.806723           0.782154  0.574086  0.857806  0.720648   0.747899   
 
      recall  
 0  0.695312  ,
                  model  score_test  score_val eval_metric  pred_time_test  \
 0     RandomForestEntr    0.806723   0.822430    accuracy        0.183741   
 1  WeightedEnsemble_L2    0.806723   0.822430    accuracy        0.189841   
 2     RandomForestGini    0.803922   0.813084    accuracy        0.182610   
 3       ExtraTreesEntr    0.801120   0.813084    accuracy        0.197948   
 4       ExtraTreesGini    0.784314   0.785047    accuracy        0.188187   
 5       KNeighborsDist    0.669468   0.551402    accuracy        0.007793   
 6       KNeighborsUnif    0.666667   0.579439    accuracy        0.010690   
 
    pred_time_val  fit_time  pred_time_test_marginal  pred_time_val_marginal  \
 0       0

In [19]:
!pip install gradio

[0m

In [20]:
import gradio as gr
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split

# automl 함수 정의
def automl(data, target, time=300, metric=None):
    if metric in ("accuracy", "roc_auc", "recall", "precision", "f1"):
        train_data, test_data = train_test_split(data, stratify=data[target], test_size=0.4, random_state=10)
    else:
        train_data, test_data = train_test_split(data, test_size=0.4, random_state=10)
        
    model = TabularPredictor(label=target, eval_metric=metric)
    model.fit(train_data, time_limit=time, presets='medium_quality')
    pred = model.predict(test_data)

    result = model.evaluate(test_data)
    result_df = pd.DataFrame([result], index=[0])
    # 여러 모델 성능 비교
    leader_board = model.leaderboard(test_data)
    # 중요 변수 출력
    feature_importance = model.feature_importance(test_data)
    best_model_name = model.model_best
    # best모델 로딩
    best_model = model._trainer.load_model(best_model_name)
    best_model_params = best_model.params

    return result_df, leader_board, feature_importance

# 데이터 미리보기 (파일 업로드 후 head(3))
def preview_data(file):
    data = pd.read_csv(file.name)
    return data.head(3)

# Gradio 인터페이스 정의
def gradio_automl(file, target, time, metric):
    # CSV 파일을 pandas 데이터프레임으로 변환
    data = pd.read_csv(file.name)

    # automl 함수 호출
    result_df, leader_board, feature_importance = automl(data, target, time, metric)
    
    return result_df, leader_board, feature_importance

# Gradio 인터페이스 생성
with gr.Blocks() as demo:
    file_input = gr.File(label="CSV 데이터 파일")
    data_preview = gr.Dataframe(label="데이터 미리보기 (head 3)")
    target_input = gr.Textbox(label="타겟 변수 이름")
    time_input = gr.Number(label="분석 시간 (초)", value=300)
    metric_input = gr.Dropdown(choices=["accuracy", "roc_auc", "recall", "precision", "f1", None], label="성능 지표")
    
    # 분석 결과 출력 (3개의 데이터프레임)
    result_output = gr.Dataframe(label="Result Dataframe")
    leaderboard_output = gr.Dataframe(label="Leader Board")
    feature_importance_output = gr.Dataframe(label="Feature Importance")

    # 파일 업로드 시 데이터 미리보기 업데이트
    file_input.change(fn=preview_data, inputs=file_input, outputs=data_preview)

    # 분석 실행
    submit_button = gr.Button("분석 실행")
    submit_button.click(fn=gradio_automl, inputs=[file_input, target_input, time_input, metric_input], outputs=[result_output, leaderboard_output, feature_importance_output])

# Gradio 앱 실행 (공개 링크 제공)
demo.launch(inline=False, share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://0c12e103b145c43f34.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


