# Loading and preprocessing data

In [1]:
from pathlib import Path
import pandas as pd

data_dir = Path('./data')

X = pd.read_csv(data_dir / "x_train.txt", sep=' ', header=None)
y = pd.read_csv(data_dir / 'y_train.txt', sep=' ', header=None).values.ravel()

X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-2.619773,-2.619533,-1.19935,-1.083335,-1.00091,-0.366967,-2.164037,-1.210001,-0.658311,-1.489539,...,10.849925,10.343346,10.717519,7.709295,5.894554,12.416573,6.765269,16.243907,7.209524,8.082021
1,-1.415579,-1.782544,-2.88027,-1.958863,1.159968,0.27303,-1.628728,-0.175813,-0.916857,-0.570166,...,11.489417,5.195818,3.494627,5.529154,10.517576,15.697333,11.324938,12.18767,12.283861,5.032285
2,-2.745092,-1.382945,-1.626015,-1.28256,-0.663146,0.052349,-2.403322,-0.765073,-0.394354,-0.806624,...,13.934934,9.267515,4.705604,6.642557,14.658934,8.130767,7.194487,11.939354,11.65362,5.942778
3,0.618998,0.455364,-0.115081,0.64904,-0.862207,2.308504,0.526114,-1.094852,1.088656,-0.48121,...,12.021328,3.852231,11.059702,7.527268,7.25312,9.791136,6.089743,10.752796,5.778888,10.366363
4,-0.070694,-0.550509,-0.565556,-0.693065,-0.573089,-0.395862,0.00317,-0.981609,-0.505775,-0.75843,...,7.537788,11.229665,11.318915,6.622256,12.557882,5.52036,5.397359,13.152269,10.684779,9.816471


In [2]:
from src.utils import drop_highly_correlated_columns, calculate_score
from src.feature_selection_methods import *

from sklearn.preprocessing import MinMaxScaler

to_drop = drop_highly_correlated_columns(X, threshold=0.8)
X_dropped = X.drop(columns=to_drop)
X_dropped.head(3)

Unnamed: 0,0,2,10,11,12,13,14,15,16,17,...,490,491,492,493,494,495,496,497,498,499
0,-2.619773,-1.19935,-0.358917,-0.012789,-1.374178,0.426893,-0.669405,-0.436644,1.699974,0.183398,...,10.849925,10.343346,10.717519,7.709295,5.894554,12.416573,6.765269,16.243907,7.209524,8.082021
1,-1.415579,-2.88027,0.925066,1.206087,-1.515144,-0.951352,0.46528,0.223653,1.068241,-0.2106,...,11.489417,5.195818,3.494627,5.529154,10.517576,15.697333,11.324938,12.18767,12.283861,5.032285
2,-2.745092,-1.626015,-0.497576,-0.556523,-0.962981,-0.428344,1.289329,-0.408108,1.109381,-1.023611,...,13.934934,9.267515,4.705604,6.642557,14.658934,8.130767,7.194487,11.939354,11.65362,5.942778


In [3]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_dropped)
X = pd.DataFrame(X_scaled, columns=X_dropped.columns)
X.shape

(5000, 492)

In [4]:
X.head(3)

Unnamed: 0,0,2,10,11,12,13,14,15,16,17,...,490,491,492,493,494,495,496,497,498,499
0,0.318998,0.452487,0.461214,0.463765,0.30508,0.549409,0.365262,0.444646,0.668241,0.552953,...,0.300022,0.269133,0.271933,0.217244,0.130859,0.380061,0.155309,0.503364,0.187288,0.236775
1,0.401994,0.332835,0.638022,0.635842,0.284538,0.355005,0.522587,0.542695,0.595727,0.498273,...,0.320193,0.116914,0.074722,0.144473,0.250728,0.485963,0.278602,0.367001,0.349922,0.140597
2,0.310361,0.422116,0.44212,0.387002,0.365002,0.428776,0.636842,0.448883,0.600449,0.385441,...,0.397331,0.237319,0.107786,0.181637,0.358109,0.241718,0.166915,0.358653,0.329722,0.169311


# Hyperparameter Tuning for Random Forest

In [5]:
param_grid = {
    'n_estimators': [90, 100, 110, 125, 140, 150, 160, 175, 200, 210, 225, 250],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced_subsample', 'balanced', None],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 10],
    'min_samples_leaf': [1, 2, 4],
}

In [6]:
X_selected_indices = rf_feature_importance_selection(X, y, 6, True)
X_selected = X.iloc[:, X_selected_indices]

In [7]:
X_selected.head(2)

Unnamed: 0,102,105,103,100,101,104
0,0.52448,0.641478,0.478201,0.352511,0.59011,0.432938
1,0.639303,0.336821,0.752457,0.274799,0.563215,0.459157


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from time import time

custom_scorer = make_scorer(calculate_score, greater_is_better=True)

start_time = time()
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring=custom_scorer, verbose=1)
grid_search.fit(X_selected, y)
fit_time = time() - start_time
print(f'\nGrid search took: {fit_time:.2f}\n\n')
grid_search.best_params_, grid_search.best_score_

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits

Grid search took: 13678.88




({'class_weight': 'balanced',
  'max_features': 'sqrt',
  'min_samples_leaf': 1,
  'min_samples_split': 10,
  'n_estimators': 100},
 7323.727454909819)

In [8]:
7323.727454909819 - 200*6

6123.727454909819

Our best score after performing grid search is 6123.73.

#### saving best model and indices

In [None]:
import pickle
import os
import numpy as np

best_rf = grid_search.best_estimator_

save_path = 'tuning_results'
if not os.path.exists(save_path):
    os.mkdir(save_path)

with open(f'{save_path}/best_rf_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf, model_file)

np.save(f'{save_path}/selected_feature_indices.npy', X_selected_indices)

#### loading best model and indices

In [20]:
with open(f'{save_path}/best_rf_model.pkl', 'rb') as model_file:
    best_rf_loaded = pickle.load(model_file)

X_selected_indices_loaded = np.load(f'{save_path}/selected_feature_indices.npy')
X_selected_reproduced = X[:, X_selected_indices_loaded]

# Hyperparameter Tuning for SVC

In [9]:
param_grid_svc = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

In [6]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from time import time

custom_scorer = make_scorer(calculate_score, greater_is_better=True)

start_time = time()
grid_search_svc = GridSearchCV(SVC(random_state=42), param_grid_svc, cv=5, scoring=custom_scorer, verbose=2)
grid_search_svc.fit(X_selected, y)
fit_time = time() - start_time

print(f'\nGrid search took: {fit_time:.2f} seconds\n')
print('Best parameters:', grid_search_svc.best_params_)
print('Best score:', grid_search_svc.best_score_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   1.3s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   1.2s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   1.3s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   1.2s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   1.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.5s
[CV] END .....................C=0.1, gamma=scal

In [10]:
7283.58 - 6*200

6083.58

For the SVM it is 6083.58, thus we decided to stay with the $\texttt{RandomForest}$.

#### saving best model and indices

In [9]:
import pickle
import os
import numpy as np

best_svc = grid_search_svc.best_estimator_

save_path = 'tuning_results'
if not os.path.exists(save_path):
    os.mkdir(save_path)

with open(f'{save_path}/best_svc_model.pkl', 'wb') as model_file:
    pickle.dump(best_svc, model_file)

#### loading best model and indices

In [10]:
Load the best model
with open(f'{save_path}/best_svc_model.pkl', 'rb') as model_file:
    best_svc_loaded = pickle.load(model_file)

# Best model calibration + test data prediction

In [11]:
X_test = pd.read_csv(data_dir / 'x_test.txt', sep=' ', header=None)
X_test = X_test.drop(columns=to_drop)
X_test.shape

(5000, 492)

In [12]:
X_test_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(X_test_scaled, columns=X_dropped.columns)
X_test_selected = X_test.iloc[:, X_selected_indices]
X_test_selected.head(2)

Unnamed: 0,102,105,103,100,101,104
0,0.402212,0.480637,0.531483,0.661018,0.490534,0.448892
1,0.458229,0.749308,0.49502,0.608295,0.361313,0.674319


We calibrate the model to have probabilities that are reliably representative of the true probabilities.

In [17]:
from sklearn.calibration import CalibratedClassifierCV

# best_rf = grid_search.best_estimator_

with open(f'{save_path}/best_rf_model.pkl', 'rb') as model_file:
    best_rf = pickle.load(model_file)

best_rf.fit(X_selected, y)
calibration = CalibratedClassifierCV(best_rf, cv='prefit')
calibration.fit(X_selected, y)
results = calibration.predict_proba(X_test_selected)

--------------------------------------------------------------------------------

In [18]:
results = pd.DataFrame(results)
results.reset_index(inplace=True)
results.sort_values(by=1, ascending=False, inplace=True)

In [19]:
def set_first_1000_rows(df, column_name):
    df[column_name] = 0
    col_index = df.columns.get_loc(column_name)
    df.iloc[:1000, col_index] = 1
    return df

results = set_first_1000_rows(results, 'prediction')
results.prediction.value_counts()

prediction
0    4000
1    1000
Name: count, dtype: int64

In [20]:
clients_to_offer = sorted(results.loc[results.prediction == 1, :].index)
clients_to_offer = [idx + 1 for idx in clients_to_offer]
print(clients_to_offer[:10])

[2, 4, 6, 12, 19, 33, 39, 44, 57, 61]


In [21]:
with open("313342_obs.txt", "w") as file:
    for item in clients_to_offer:
        file.write(f"{item}\n")

In [22]:
feature_indexes = [idx + 1 for idx in X_selected.columns] # taking columns as indicies aren't valid (because they are after removing correlated columns)
feature_indexes

[103, 106, 104, 101, 102, 105]

In [23]:
with open("313342_vars.txt", "w") as file:
    for item in feature_indexes:
        file.write(f"{item}\n")