In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score

In [48]:
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)

In [49]:
X = train.drop(['EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6'], axis=1)
y = train[['EC1', 'EC2']]

In [50]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
params_EC1 ={'depth': 6,
             'l2_leaf_reg': 3,
             'border_count': 254,
             'verbose': False,
             'random_strength': 0.4,
             'task_type': 'CPU',
             'n_estimators': 230,
             'random_state': 8585,
             'eta': 0.01}

params_EC2 = {'depth': 8,
              'l2_leaf_reg': 9,
              'border_count': 254,
              'verbose': False,
              'random_strength': 0.6,
              'task_type': 'CPU',
              'n_estimators': 40,
              'random_state': 7555,
              'eta': 0.05}

In [64]:
model_EC1 = CatBoostClassifier(**params_EC1)
model_EC1.fit(X_train, y_train['EC1'])

model_EC2 = CatBoostClassifier(**params_EC2)
model_EC2.fit(X_train, y_train['EC2'])

<catboost.core.CatBoostClassifier at 0x7f8808128370>

In [20]:
# param_grid = {
#     'l2_leaf_reg': [10, 20, 30, 40]
# }

# grid_search_EC1 = GridSearchCV(model_EC1, param_grid, cv=3)
# grid_search_EC1.fit(X_train, y_train['EC1'])

In [21]:
# grid_search_EC2 = GridSearchCV(model_EC2, param_grid, cv=3)
# grid_search_EC2.fit(X_train, y_train['EC2'])

In [61]:
val_predictions_EC1 = model_EC1.predict(X_val)
cm_EC1 = confusion_matrix(y_val['EC1'], val_predictions_EC1)
print('Confusion matrix for EC1:')
print(cm_EC1)
f1_EC1 = f1_score(y_val['EC1'], val_predictions_EC1)
print('F1 score for EC1:', f1_EC1)
roc_auc_EC1 = roc_auc_score(y_val['EC1'], val_predictions_EC1)
print('ROC AUC for EC1:', roc_auc_EC1)

Confusion matrix for EC1:
[[ 349  627]
 [ 258 1734]]
F1 score for EC1: 0.7966919365954513
ROC AUC for EC1: 0.6140319474619791


In [77]:
val_prob_predictions_EC2 = model_EC2.predict_proba(X_val)
threshold = 0.72
val_predictions_EC2_adjusted = (val_prob_predictions_EC2[:, 1] > threshold).astype(int)

# Calculate the confusion matrix for EC2 with the adjusted predictions
cm_EC2_adjusted = confusion_matrix(y_val['EC2'], val_predictions_EC2_adjusted)
print('Adjusted confusion matrix for EC2:')
print(cm_EC2_adjusted)

# Calculate the F1 score for EC2 with the adjusted predictions
f1_EC2_adjusted = f1_score(y_val['EC2'], val_predictions_EC2_adjusted)
print('Adjusted F1 score for EC2:', f1_EC2_adjusted)

# Calculate the ROC AUC for EC2 with the adjusted predictions
roc_auc_EC2_adjusted = roc_auc_score(y_val['EC2'], val_predictions_EC2_adjusted)
print('Adjusted ROC AUC for EC2:', roc_auc_EC2_adjusted)

Adjusted confusion matrix for EC2:
[[  77  491]
 [ 196 2204]]
Adjusted F1 score for EC2: 0.865161923454367
Adjusted ROC AUC for EC2: 0.5269483568075117


In [72]:
test_prob_predictions_EC1 = model_EC1.predict_proba(test)
test_prob_predictions_EC2 = model_EC2.predict_proba(test)

In [73]:
test_prob_EC1 = test_prob_predictions_EC1[:, 1]
test_prob_EC2 = test_prob_predictions_EC2[:, 1]

In [74]:
submission = pd.DataFrame({'EC1': test_prob_EC1, 'EC2': test_prob_EC2})
submission.head()

Unnamed: 0,EC1,EC2
0,0.498002,0.788217
1,0.820426,0.78424
2,0.762792,0.737002
3,0.687319,0.788652
4,0.772684,0.729114


In [75]:
submission['id'] = test.index
submission = submission[['id', 'EC1', 'EC2']]
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,EC1,EC2
0,14838,0.498002,0.788217
1,14839,0.820426,0.78424
2,14840,0.762792,0.737002
3,14841,0.687319,0.788652
4,14842,0.772684,0.729114
