In [1]:
import os

import numpy as np
import joblib

from utils.eval import evaluate
from utils.dataloader import DataLoader
from utils.metrics import calculate_custom_error


MODEL_EXPORT_PATH = 'models'
RESULTS_EXPORT_PATH = 'results'

# Data Loader

In [39]:

### Load training and test data
train_dataloader = DataLoader('data/training_data.csv', 
                            imputation_strategy='zero', 
                            split=True, test_size=0.2, 
                            type='train',
                            features='all',
                            drop_nan=0.1)
X_train, X_val, y_train, y_val = train_dataloader()
test_dataloader = DataLoader('data/test_data_no_target.csv', 
                            imputation_strategy='zero', 
                            split=False, 
                            type='test', 
                            features='all',
                            drop_nan=0.1)
X_test = test_dataloader()    

In [40]:
X_train1 = train_dataloader.filter_features(X_train, features='average')
X_val1 = train_dataloader.filter_features(X_val, features='average')
X_test1 = test_dataloader.filter_features(X_test, features='average')

In [41]:
X_train2 = train_dataloader.filter_features(X_train, features='1-year')
X_val2 = train_dataloader.filter_features(X_val, features='1-year')
X_test2 = test_dataloader.filter_features(X_test, features='1-year')

In [42]:
for column in X_train1.columns:
    if 'Group' not in column:
        X_train1[column] = X_train1[column] / X_train2['d' + column]
        X_val1[column] = X_val1[column] / X_val2['d' + column]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train1[column] = X_train1[column] / X_train2['d' + column]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val1[column] = X_val1[column] / X_val2['d' + column]


In [44]:
X_train = X_train1
X_val = X_val1

In [45]:
y_train = y_train + 1
y_val = y_val + 1

In [46]:
print('Training data shape:', X_train.shape, y_train.shape)
print('Validation data shape:', X_val.shape, y_val.shape)
print('Test data shape:', X_test.shape)

Training data shape: (6400, 66) (6400,)
Validation data shape: (1600, 66) (1600,)
Test data shape: (2000, 121)


# Model

In [47]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline

In [48]:
from sklearn.utils.class_weight import compute_sample_weight

# Initialize XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softmax', num_class=3, n_jobs=-1, verbosity=2, n_estimators=200, max_depth=5, learning_rate=0.01,)

# Define Grid Search Parameters
# param_grid = {
#     'xgb__n_estimators': [200],
#     'xgb__max_depth': [5, 10],
#     'xgb__learning_rate': [0.01]
# }

# # Perform feature selection and create multiple models
# pipeline = Pipeline([
#     ('xgb', xgb_model)
# ])

# grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=10, error_score='raise')
# import pdb; pdb.set_trace()
xgb_model.fit(X_train, y_train, sample_weight=compute_sample_weight("balanced", y_train))

In [49]:
# best_model = grid_search.best_estimator_
y_pred_val = xgb_model.predict(X_val)
evaluate(y_val, y_pred_val)

Accuracy: 0.385625
Precision: 0.4081809354928146
Recall: 0.385625
F1 Score: 0.3945678063040162
Confusion Matrix:
 [[243 136 240]
 [ 87  54  86]
 [289 145 320]]
Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.39      0.39       619
           1       0.16      0.24      0.19       227
           2       0.50      0.42      0.46       754

    accuracy                           0.39      1600
   macro avg       0.35      0.35      0.35      1600
weighted avg       0.41      0.39      0.39      1600

Custom Error: 0.945


: 

In [116]:
### Save model
# joblib.dump(xgb_model, f'{MODEL_EXPORT_PATH}/xgboost_model_average_features1.pkl')
# joblib.dump(xgb_model, f'{MODEL_EXPORT_PATH}/xgboost_model_1-year_features1.pkl')
joblib.dump(xgb_model, f'{MODEL_EXPORT_PATH}/xgboost_model_all_features1.pkl')

['models/xgboost_model_all_features1.pkl']

In [117]:
### Load model 
model1 = joblib.load(f'{MODEL_EXPORT_PATH}/xgboost_model_average_features1.pkl')
model2 = joblib.load(f'{MODEL_EXPORT_PATH}/xgboost_model_1-year_features1.pkl')
model3 = joblib.load(f'{MODEL_EXPORT_PATH}/xgboost_model_all_features1.pkl')

In [120]:
X_val1 = train_dataloader.filter_features(X_val, 'average')
X_val2 = train_dataloader.filter_features(X_val, '1-year')
weights = np.array([0.1, 0.1, 0.8])

y_pred_val1 = model1.predict_proba(X_val1)
y_pred_val2 = model2.predict_proba(X_val2)
y_pred_val3 = model3.predict_proba(X_val)

y_pred_val = (y_pred_val1 * weights[0] + y_pred_val2 * weights[1] + y_pred_val3 * weights[2])
# y_pred_val = (y_pred_val1 + y_pred_val2 + y_pred_val3) / 3
y_pred_val = np.argmax(y_pred_val, axis=1)

evaluate(y_val, y_pred_val)

Accuracy: 0.48625
Precision: 0.4826807436627463
Recall: 0.48625
F1 Score: 0.43200361203126475
Confusion Matrix:
 [[201   0 418]
 [ 59   1 167]
 [177   1 576]]
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.32      0.38       619
           1       0.50      0.00      0.01       227
           2       0.50      0.76      0.60       754

    accuracy                           0.49      1600
   macro avg       0.49      0.36      0.33      1600
weighted avg       0.48      0.49      0.43      1600

Custom Error: 0.885625


In [159]:
X_test1 = test_dataloader.filter_features(X_test, 'average')
X_test2 = test_dataloader.filter_features(X_test, '1-year')
X_test3 = test_dataloader.filter_features(X_test, 'all')

y_pred_test1 = model1.predict_proba(X_test1)
y_pred_test2 = model2.predict_proba(X_test2)
y_pred_test3 = model3.predict_proba(X_test3)

y_pred_test = (y_pred_test1 * weights[0] + y_pred_test2 * weights[1] + y_pred_test3 * weights[2])
# y_pred_test = (y_pred_test1 + y_pred_test2 + y_pred_test3) / 3
y_pred_test = np.argmax(y_pred_test, axis=1) - 1

np.savetxt(f'{RESULTS_EXPORT_PATH}/xgboost_ensemble.txt', y_pred_test, fmt='%d', newline='\n')