Submission with F1-Macro of 0.50507

In [None]:
import pycaret
from pycaret.classification import *

pycaret.__version__

In [None]:
import pandas as pd
import datetime
import warnings

# Notebook-wide settings: silence warnings and make wide frames readable.
warnings.filterwarnings(action='ignore')

# Show every column, cap printed rows at 100, and print floats in fixed-point
# notation instead of scientific notation.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:f}'.format

In [None]:
# Experiment-wide constants.
TARGET_VAR = 'TARGET'
N_FOLDS = 5

# Load the train and test frames from their parquet files.
data_train = pd.read_parquet('train_v2.gzip')
data_test = pd.read_parquet('test_v2.gzip')

# Keep the test IDs aside as a one-column frame; they are joined back onto the
# predictions when the submission file is written.
data_test_id = data_test[['ID']]

# Cast these columns to the pandas category dtype in both frames.
category_dtypes = dict.fromkeys(
    ['CO_TIPO_SEXO', 'ANIO_BANCARIZACION', 'MES_BANCARIZACION'],
    'category',
)
data_train = data_train.astype(category_dtypes)
data_test = data_test.astype(category_dtypes)

In [None]:
# Inspect the column dtypes of the training frame.
data_train.dtypes

In [None]:
# Select the candidate model columns: int64/float64/category dtypes only, minus
# every column whose name contains one of the excluded substrings. The same
# selection is applied to train and test through one helper so the two cannot
# drift apart.
categorical_feats = ['CO_TIPO_SEXO', 'ANIO_BANCARIZACION', 'MES_BANCARIZACION']

# Matching is by plain substring, so e.g. '_1' also excludes names containing
# '_10' or '_12'.  NOTE(review): confirm that this is intended.
exclude_substr_list = ['PROXY_', '_1', '_MA3', '_MA6', '_min', '_median', '_last']


def _select_model_columns(frame, excluded_substrings):
    """Return the int64/float64/category columns of *frame* whose names
    contain none of *excluded_substrings*."""
    typed = frame.select_dtypes(include=['int64', 'float64', 'category'])
    kept = [
        name for name in typed.columns
        if not any(sub in name for sub in excluded_substrings)
    ]
    return typed[kept]


# NOTE: dropping 'EDAD' and 'CANT_EMP_NEG_max' was tried earlier and is
# currently disabled.
data = _select_model_columns(data_train, exclude_substr_list)
data_test = _select_model_columns(data_test, exclude_substr_list)

print(data.columns)

In [None]:
# Min-max normalise every selected column (MinMaxScaler's default range is [0, 1]).
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit_transform returns a bare ndarray, so the frame must be rebuilt with the
# original row index. Without `index=` it would get a fresh RangeIndex, and the
# index-aligned target assignment below would produce NaN or misplaced labels
# whenever data_train is not indexed 0..n-1.
data_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

# Restore the unscaled labels; if the target was among the selected columns it
# has just been scaled along with the features.
data_scaled[TARGET_VAR] = data_train[TARGET_VAR]

In [None]:
# Class balance of the target, as relative frequencies.
data_scaled[TARGET_VAR].value_counts(normalize=True)

In [None]:
# NOTE(review): this cell repeats the previous one verbatim; it can probably be removed.
data_scaled[TARGET_VAR].value_counts(normalize=True)

In [None]:
# Initialise the PyCaret classification experiment on the scaled training frame.
# NOTE(review): the columns in categorical_feats were min-max scaled to floats
# before this call - confirm that treating them as categorical is still intended.
s = setup(
    data_scaled,
    target=TARGET_VAR,
    categorical_features=categorical_feats,
    fold=N_FOLDS,
    session_id=123,
    use_gpu=True,
)

In [None]:
# Register macro-averaged F1 as a custom PyCaret metric.
from sklearn.metrics import f1_score, make_scorer


def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of *y_pred* against *y_true*."""
    return f1_score(y_true, y_pred, average='macro')


try:
    add_metric('f1_macro', 'F1 Macro', f1_macro, greater_is_better=True)
except ValueError as err:
    # add_metric rejects an id that is already registered, which happens when
    # this cell is re-run in the same session. Report it instead of silently
    # swallowing every possible error.
    print(f'F1 Macro metric not added: {err}')

In [None]:
# Display the metrics table of the current experiment (should list 'F1 Macro' if the registration above succeeded).
get_metrics()

In [None]:
# Display the estimators known to PyCaret; the ids shown here are the ones
# used in the `exclude` list passed to compare_models.
all_models = models()
all_models

In [None]:
# Cross-validate the remaining estimators and keep the one ranked first by
# the custom 'F1 Macro' metric.
excluded_models = [
    'knn', 'svm', 'rbfsvm', 'dummy', 'lda', 'lr',
    'nb', 'ridge', 'qda', 'gpc', 'et',
]
best = compare_models(
    exclude=excluded_models,
    fold=N_FOLDS,
    sort='F1 Macro',
)

In [None]:
# tuned_model = tune_model(best, optimize='F1 Macro')

In [None]:
# Confusion-matrix plot for the selected model.
plot_model(best, plot='confusion_matrix')

In [None]:
# plot_model(best, plot='learning')

In [None]:
# ROC/AUC plot for the selected model.
plot_model(best, plot='auc')

In [None]:
# Best-effort feature plot: a failure here must not stop the notebook
# (presumably not every estimator supports this plot), but the reason is
# reported instead of being swallowed. `except Exception` also lets
# KeyboardInterrupt through, which the previous bare `except` did not.
try:
    plot_model(best, plot='feature_all')
except Exception as err:
    print(f"Skipping 'feature_all' plot: {err}")

In [None]:
# Score the selected model with no explicit data argument (PyCaret then uses its hold-out split).
holdout_pred = predict_model(best)

In [None]:
# Predict on the full scaled training frame.
# NOTE(review): this is the same frame that was passed to setup(), so it
# presumably contains rows the model was fitted on - metrics computed from
# these predictions will look better than the hold-out ones.
predictions_train = predict_model(best, data=data_scaled)

In [None]:
# Row-normalised confusion matrix and per-class report for the predictions
# made on the training frame.
from sklearn.metrics import confusion_matrix, classification_report

actual_labels = predictions_train[TARGET_VAR]
predicted_labels = predictions_train['prediction_label']
class_names = ['0', '1']

# normalize='true' divides each row by the number of samples of that true class.
cm = pd.DataFrame(
    confusion_matrix(actual_labels, predicted_labels, normalize='true'),
    index=class_names,
    columns=class_names,
)
display(cm)

print(classification_report(actual_labels, predicted_labels))

In [None]:
# The model was trained on min-max scaled features, so the test frame has to go
# through the *same fitted* scaler before prediction; feeding it unscaled puts
# every feature on a different scale than the one the model learned.
# reindex() lines the test columns up with the columns the scaler was fitted on
# (a column missing from the test frame becomes NaN, which MinMaxScaler passes
# through). The original row index is kept so the predictions can later be
# concatenated with data_test_id.
scaled_columns = list(data.columns)
data_test_scaled = pd.DataFrame(
    scaler.transform(data_test.reindex(columns=scaled_columns)),
    columns=scaled_columns,
    index=data_test.index,
)

# The target column is not a feature; drop it if present.
predictions = predict_model(
    best,
    data=data_test_scaled.drop(columns=[TARGET_VAR], errors='ignore'),
)

In [None]:
# Relative frequency of each predicted class on the test set.
predictions['prediction_label'].value_counts(normalize=True)

In [None]:
# Timestamp used in the submission file name. The time fields are separated
# with '-' rather than ':' because ':' is not allowed in Windows file names.
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

In [None]:
# Build the submission: test IDs side by side with the predicted label
# (aligned on the row index), with the label column renamed to the target name.
# 'prediction_score' could be added to the selected columns if it is needed.
submission = pd.concat([data_test_id, predictions[['prediction_label']]], axis=1)
submission = submission.rename(columns={'prediction_label': TARGET_VAR})
submission.to_csv(f'./submission_{now}.csv', index=False)