In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor

import pandas as pd

In [2]:
data = pd.read_csv('extended_stoma_dataset.csv')

data['end_year'] = data.dates.apply(lambda a: int(a.split('-')[1]))
data['birth_year'] = data['date_of_birth'].apply(lambda a: int('20'+a.split('.')[-1] ))
data['right_age'] = data[['end_year', 'birth_year']].apply(lambda a: a[0]-a[1], axis=1)
data['kpu'] = data[['num_caries', 'num_filling_caries',
                    'num_filling_no_caries',
                    'num_removed_caused_by_caries', 'num_removed_other_reasons']].sum(axis=1)

train = data.drop(data[data['dates'] == '2017-2018'].index, axis=0)
test = data[data['dates'] == '2017-2018']

NUM_FEATURES = [
       'prosthesis crown veneer', 'sealed fissure', 'uncut tooth', 'fluorosis',
               'sum_hygiene_index', 'right_age', ]
LAST_FEATURES = [ 'not_registered', 'num_caries',
       'num_filling_caries', 'num_filling_no_caries', 'num_healthy',
       'num_removed_caused_by_caries', 'num_removed_other_reasons',] #признаки, участвующие в расчете целевой метрики, берем их только за последний год
TARGET = ['kpu']
CAT_FEATURES = ['nationality', 'place_of_birth',
               'previous_place_of_living', 'bite_type', ]

x_mean = train[NUM_FEATURES+['id']].groupby('id').mean()
x_cat = train[CAT_FEATURES+['id']].groupby('id').agg(lambda x:x.value_counts().index[0])
last_names = [c+' last' for c in NUM_FEATURES]
x_last = pd.DataFrame(columns=[c+' last' for c in NUM_FEATURES])
x_last[last_names] = train[NUM_FEATURES+['id']].groupby('id').last()

last_cor_target_names = [c+' previous year' for c in LAST_FEATURES]
x_prev_year = pd.DataFrame(columns=last_cor_target_names)
x_prev_year = train[LAST_FEATURES+['id']].groupby('id').agg(lambda x: x.iloc[:len(x)-1].iloc[-1])

x_past_target = train[TARGET+['id']].groupby('id').agg(lambda x: x.iloc[:len(x)-1].iloc[-1])
x_past_target.columns = ['kpu_last']
x_past_target['kpu'] = train[TARGET+['id']].groupby('id').agg(lambda x: x.iloc[-1])

x_mean_test = data[NUM_FEATURES+['id']].groupby('id').mean()
x_cat_test = data[CAT_FEATURES+['id']].groupby('id').agg(lambda x:x.value_counts().index[0])
last_names = [c+' last' for c in NUM_FEATURES]
x_last_test = pd.DataFrame(columns=[c+' last' for c in NUM_FEATURES])
x_last_test[last_names] = data[NUM_FEATURES+['id']].groupby('id').last()

last_cor_target_names = [c+' previous year' for c in LAST_FEATURES]
x_prev_year_test = pd.DataFrame(columns=last_cor_target_names)
x_prev_year_test = data[LAST_FEATURES+['id']].groupby('id').agg(lambda x: x.iloc[:len(x)-1].iloc[-1])

x_past_target_test = data[TARGET+['id']].groupby('id').agg(lambda x: x.iloc[:len(x)-1].iloc[-1])
x_past_target_test.columns = ['kpu_last']
x_past_target_test['kpu'] = data[TARGET+['id']].groupby('id').agg(lambda x: x.iloc[-1])

x_train = pd.concat([x_mean, x_cat, x_last,x_prev_year_test, x_past_target], axis=1)
y_train = x_past_target['kpu']

x_test = pd.concat([x_mean_test, x_cat_test, x_last_test, x_prev_year_test, x_past_target_test['kpu_last']], axis=1)
x_test_with_kpu = pd.concat([x_mean_test, x_cat_test, x_last_test, x_prev_year_test, x_past_target_test], axis=1)
# y_test = test['kpu']
y_test = test[['id', 'kpu']].groupby('id').last()

In [3]:
save_path = 'agModels-predictClass'  # specifies folder to store trained models

predictor = TabularPredictor(label='kpu', path=save_path).fit(x_train)

Beginning AutoGluon training ...
AutoGluon will save models to "agModels-predictClass/"
AutoGluon Version:  0.4.2
Python Version:     3.8.10
Operating System:   Linux
Train Data Rows:    191
Train Data Columns: 24
Label Column: kpu
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	First 10 (of 16) unique label values:  [7, 6, 9, 2, 1, 8, 3, 4, 12, 5]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Fraction of data from classes with at least 2 examples that will be kept for training models: 0.9790575916230366
Train Data Class Count: 12
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    10974.09 MB
	Train Data (Original)  Memory Usage: 0.1 MB (0.0

In [4]:
y_true=y_test.squeeze()

In [5]:
y_test

Unnamed: 0_level_0,kpu
id,Unnamed: 1_level_1
1101.xlsx,9
1102.xlsx,6
1103.xlsx,6
1104.xlsx,9
1105.xlsx,3
...,...
7412.xlsx,6
7413.xlsx,5
7414.xlsx,6
7415.xlsx,3


In [6]:
y_true

id
1101.xlsx    9
1102.xlsx    6
1103.xlsx    6
1104.xlsx    9
1105.xlsx    3
            ..
7412.xlsx    6
7413.xlsx    5
7414.xlsx    6
7415.xlsx    3
7416.xlsx    4
Name: kpu, Length: 191, dtype: int64

In [7]:
predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load previously-trained predictor from file

y_pred = predictor.predict(x_test)

In [8]:
print("Predictions:  \n", y_pred)

Predictions:  
 id
1101.xlsx    7
1102.xlsx    6
1103.xlsx    6
1104.xlsx    9
1105.xlsx    2
            ..
7412.xlsx    6
7413.xlsx    4
7414.xlsx    2
7415.xlsx    0
7416.xlsx    4
Name: kpu, Length: 191, dtype: int64


In [9]:
perf = predictor.evaluate_predictions(y_true=y_true, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.6596858638743456
Evaluations on test data:
{
    "accuracy": 0.6596858638743456,
    "balanced_accuracy": 0.4947778362320405,
    "mcc": 0.6225067965529583
}


In [10]:
print(predictor.leaderboard(x_test_with_kpu, silent=True))

                  model  score_test  score_val  pred_time_test  pred_time_val  \
0         LightGBMLarge    0.659686   0.973684        0.009459       0.004690   
1               XGBoost    0.659686   0.947368        0.012483       0.005871   
2              LightGBM    0.659686   0.973684        0.017980       0.004917   
3   WeightedEnsemble_L2    0.659686   0.973684        0.022041       0.005327   
4      RandomForestGini    0.649215   0.736842        0.169326       0.095991   
5      RandomForestEntr    0.643979   0.710526        0.147065       0.098349   
6        ExtraTreesEntr    0.623037   0.605263        0.150057       0.094277   
7        ExtraTreesGini    0.617801   0.605263        0.165543       0.100993   
8              CatBoost    0.602094   0.921053        0.008906       0.005574   
9            LightGBMXT    0.602094   0.763158        0.020226       0.006083   
10       KNeighborsDist    0.549738   0.578947        0.014961       0.013037   
11      NeuralNetFastAI    0

In [11]:
leaderboard_hpo = predictor.leaderboard(silent=True)

best_model_name = leaderboard_hpo[leaderboard_hpo['stack_level'] == 1]['model'].iloc[0]

predictor_info = predictor.info()
best_model_info = predictor_info['model_info'][best_model_name]

In [12]:
print(best_model_info)

{'name': 'LightGBMLarge', 'model_type': 'LGBModel', 'problem_type': 'multiclass', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 0.9054126739501953, 'num_classes': 12, 'quantile_levels': None, 'predict_time': 0.0046901702880859375, 'val_score': 0.9736842105263158, 'hyperparameters': {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3}, 'hyperparameters_fit': {'num_boost_round': 13}, 'hyperparameters_nondefault': ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf'], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': ['bool', 'int', 'float', 'category'], 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None}, 'num_features': 20, 'features': ['sealed fissure', 'uncut tooth', 'fluorosis', '

In [13]:
print(f'Best Model Hyperparameters ({best_model_name}):')

Best Model Hyperparameters (LightGBMLarge):


In [14]:
print(best_model_info['hyperparameters'])

{'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3}
