In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

#baseline model
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [2]:
# Load the cleaned SMS feature table (one column set per phone_no_m, see preview below).
sms_features_path = 'train/train_sms_cleaned.csv'
train_sms_full = pd.read_csv(sms_features_path)

In [3]:
# Only the join key and the target column are kept from the user table.
user_columns = ['phone_no_m', 'label']
train_user = pd.read_csv('train/train_user.csv').loc[:, user_columns]

In [4]:
# Attach the label to the SMS features. The right join keeps every row of
# train_user, so users without a row in the SMS table are retained with NaN
# SMS features.
# Dropping any existing 'label' column first keeps this cell idempotent:
# without it, re-running the cell would merge the labels a second time and
# produce 'label_x'/'label_y' columns, breaking later lookups of 'label'.
train_sms_full = pd.merge(
    train_sms_full.drop(columns='label', errors='ignore'),
    train_user,
    on='phone_no_m',
    how='right',
)

In [5]:
# Preview the merged frame (SMS features plus the label column).
train_sms_full.head()

Unnamed: 0,phone_no_m,sms_per_receiver,sms_calltype_1,sms_calltype_2,label
0,00073ceecc0f7220a440580ac5dea410c90d14b6669458...,8.147541,0.006036,0.993964,0
1,00086f1d2e2c1227f811c3e17f2e9c37cf9971f47bb933...,16.205882,0.018149,0.981851,1
2,000c00db8809c27e723ba90582bf334b2d3ca9063f53fb...,3.129032,0.164948,0.835052,1
3,0014b698069503ceadb9442605834729064be51cdd7002...,4.785714,0.029851,0.970149,1
4,0034030646f2008d36431e00b133db30efc8b0c31757f3...,5.835443,0.012292,0.987708,0


In [6]:
# Check non-null counts per column: the SMS feature columns have fewer
# non-null entries than there are rows, so missing values must be handled
# before modelling.
train_sms_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4884 entries, 0 to 4883
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   phone_no_m        4884 non-null   object 
 1   sms_per_receiver  4882 non-null   float64
 2   sms_calltype_1    4882 non-null   float64
 3   sms_calltype_2    4882 non-null   float64
 4   label             4884 non-null   int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 228.9+ KB


In [7]:
# Hold out 20% of the rows for the final evaluation. The split is stratified
# on the label so both parts keep the same class proportions, and seeded so
# it is reproducible.
features = train_sms_full.drop(columns=['phone_no_m', 'label'])
labels = train_sms_full['label']
train_x, test_x, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
    stratify=labels,
)

In [8]:
# Confirm which feature columns in the training split still contain missing values.
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3907 entries, 3015 to 33
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sms_per_receiver  3905 non-null   float64
 1   sms_calltype_1    3905 non-null   float64
 2   sms_calltype_2    3905 non-null   float64
dtypes: float64(3)
memory usage: 122.1 KB


In [9]:
# sms_per_receiver is strongly right-skewed (its max lies far above the 75th
# percentile), which is the motivation for including RobustScaler among the
# candidate scalers compared below.
train_x.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sms_per_receiver,3905.0,11.782607,12.6478,1.0,5.460259,8.230769,13.78125,358.307692
sms_calltype_1,3905.0,0.05053,0.122577,0.0,0.0025,0.010799,0.040892,0.988199
sms_calltype_2,3905.0,0.94947,0.122577,0.011801,0.959108,0.989201,0.9975,1.0


### Baseline score

In [10]:
# Candidate scalers for the search. Order matters: index 0 is the one used
# by the baseline preprocessing pipeline.
scalers = [
    StandardScaler(),
    RobustScaler(),
    MinMaxScaler(),
]

In [11]:
# Candidate imputers for the search. Order matters: index 0 (mean imputation)
# is the one used by the baseline preprocessing pipeline.
tree_estimator = DecisionTreeRegressor(max_features='sqrt', random_state=0)
imputers = [
    SimpleImputer(strategy='mean'),
    KNNImputer(),
    SimpleImputer(strategy='median'),
    IterativeImputer(estimator=tree_estimator, random_state=0),
]

In [12]:
# Baseline preprocessing: scale first, then impute. The step names 'scale'
# and 'impute' are referenced by the grid-search parameter keys, so they must
# not be renamed.
# NOTE: scikit-learn's scalers disregard NaNs when fitting and keep them in
# transform, so the imputer then fills the gaps on the scaled features.
preprocess = Pipeline(steps=[
    ('scale', scalers[0]),
    ('impute', imputers[0]),
])

In [13]:
# Candidate classifiers, all seeded for reproducibility. Index 1
# (LogisticRegression) is the one used by the baseline pipeline.
models = [
    RandomForestClassifier(n_jobs=-1, random_state=0),
    LogisticRegression(n_jobs=-1, random_state=0),
    RidgeClassifier(random_state=0),
]

In [14]:
# Baseline end-to-end pipeline: preprocessing followed by LogisticRegression.
# The step names 'preprocess' and 'model' are referenced by the grid-search
# parameter keys.
model_pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', models[1]),
])

In [15]:
# Baseline score: mean macro-F1 of the baseline pipeline over 5-fold CV
# on the training split.
baseline_cv_scores = cross_val_score(
    model_pipeline,
    train_x,
    train_y,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
baseline_cv_scores.mean()

0.6518376970945112

### Best combination

In [16]:
# Search space: every scaler x imputer x model combination. Keys use the
# '<step>__<substep>' convention to reach into the nested preprocessing pipeline.
params = {
    'preprocess__scale': scalers,
    'preprocess__impute': imputers,
    'model': models,
}

In [17]:
# Alternative (disabled) search space: keep the baseline model fixed and vary
# only the scaler and imputer, for a like-for-like comparison with the base model.
# params = dict(preprocess__scale=scalers, preprocess__impute=imputers)

In [18]:
# Exhaustive search over the scaler/imputer/model grid, scored with macro-F1
# under 5-fold CV (same metric and fold count as the baseline score).
combi_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [19]:
# Run the search on the training split only; the trailing ';' suppresses the
# estimator repr in the cell output.
combi_search.fit(X=train_x, y=train_y);

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 165 out of 180 | elapsed:    7.5s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.6s finished


In [20]:
# Tabulate the per-candidate CV results, best-ranked candidate first.
cv_table = pd.DataFrame(combi_search.cv_results_)
results = cv_table.sort_values(by='rank_test_score')

In [21]:
# Top grid-search candidates that use RandomForestClassifier.
# Rows are selected by estimator type rather than by position in `.unique()`,
# so the cell states which model it shows and does not depend on how
# `results` happens to be ordered.
is_random_forest = results['param_model'].map(
    lambda est: isinstance(est, RandomForestClassifier)
).astype(bool)
results[is_random_forest].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,0.797063,0.134822,0.10924,0.001244,"RandomForestClassifier(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,MinMaxScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.707851,0.67523,0.667618,0.721348,0.665092,0.687428,0.022838,1
9,0.774402,0.13376,0.1617,0.037274,"RandomForestClassifier(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,StandardScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.702831,0.67523,0.667618,0.723192,0.665092,0.686793,0.022603,2
10,0.733758,0.089084,0.129914,0.013935,"RandomForestClassifier(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,RobustScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.702831,0.67523,0.667618,0.721348,0.665092,0.686424,0.022013,3
6,0.564546,0.031589,0.191612,0.047259,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),StandardScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.696213,0.67565,0.669068,0.721864,0.666174,0.685794,0.020867,4
0,0.529183,0.045261,0.111202,0.005423,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(),StandardScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.705163,0.666763,0.672537,0.72119,0.662115,0.685554,0.023351,5


In [22]:
# Top grid-search candidates that use LogisticRegression.
# Rows are selected by estimator type rather than by position in `.unique()`,
# so the cell states which model it shows and does not depend on how
# `results` happens to be ordered.
is_logistic_regression = results['param_model'].map(
    lambda est: isinstance(est, LogisticRegression)
).astype(bool)
results[is_logistic_regression].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,0.11575,0.002045,0.003923,0.000776,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.639625,0.629735,0.648735,0.681027,0.665053,0.652835,0.01827,13
22,0.15024,0.005617,0.004419,0.001272,"LogisticRegression(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.639625,0.629735,0.648735,0.681027,0.665053,0.652835,0.01827,13
16,0.111634,0.001679,0.003795,0.001048,"LogisticRegression(n_jobs=-1, random_state=0)",KNNImputer(),RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.639625,0.629735,0.648735,0.681027,0.665053,0.652835,0.01827,13
13,0.173569,0.103223,0.004554,0.00086,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(),RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.639625,0.629735,0.648735,0.681027,0.665053,0.652835,0.01827,13
18,0.11207,0.003328,0.003515,0.000484,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),StandardScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.639625,0.629735,0.646247,0.681027,0.665053,0.652337,0.018408,17


In [23]:
# Top grid-search candidates that use RidgeClassifier.
# Rows are selected by estimator type rather than by position in `.unique()`,
# so the cell states which model it shows and does not depend on how
# `results` happens to be ordered.
is_ridge_classifier = results['param_model'].map(
    lambda est: isinstance(est, RidgeClassifier)
).astype(bool)
results[is_ridge_classifier].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
24,0.007621,0.001115,0.002737,0.000151,RidgeClassifier(random_state=0),SimpleImputer(),StandardScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.53098,0.502847,0.523199,0.544192,0.535262,0.527296,0.013979,25
25,0.008832,0.001137,0.003131,0.000339,RidgeClassifier(random_state=0),SimpleImputer(),RobustScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.53098,0.502847,0.523199,0.544192,0.535262,0.527296,0.013979,25
27,0.009016,0.001089,0.003922,0.001246,RidgeClassifier(random_state=0),KNNImputer(),StandardScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.53098,0.502847,0.523199,0.544192,0.535262,0.527296,0.013979,25
28,0.011666,0.001222,0.004665,0.001146,RidgeClassifier(random_state=0),KNNImputer(),RobustScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.53098,0.502847,0.523199,0.544192,0.535262,0.527296,0.013979,25
30,0.011463,0.001025,0.004168,0.000638,RidgeClassifier(random_state=0),SimpleImputer(strategy='median'),StandardScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.53098,0.502847,0.523199,0.544192,0.535262,0.527296,0.013979,25


### Best way to handle train_sms

scaler:
- robust/MinMax -> uncertain

impute:
- mean/median/model -> uncertain

In [24]:
# Pipeline with the best-scoring scaler/imputer/model combination found by
# the grid search.
best_model = combi_search.best_estimator_

In [25]:
# Mean 5-fold macro-F1 of the best pipeline on the training split, for
# comparison with the baseline score of 0.6518376970945112.
best_cv_scores = cross_val_score(
    best_model,
    train_x,
    train_y,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
best_cv_scores.mean()

0.6874279062265012

In [26]:
# Gain of the best LogisticRegression candidate over the baseline pipeline.
# Both scores are derived from the notebook's own objects instead of being
# pasted from earlier cell outputs, so the comparison stays correct when the
# data, seed or search space changes.
baseline_f1 = cross_val_score(
    model_pipeline, train_x, train_y, n_jobs=-1, cv=5, scoring='f1_macro'
).mean()
is_logreg = results['param_model'].map(
    lambda est: isinstance(est, LogisticRegression)
).astype(bool)
best_logreg_f1 = results.loc[is_logreg, 'mean_test_score'].max()

print(best_logreg_f1 - baseline_f1)  # absolute macro-F1 gain
print((best_logreg_f1 - baseline_f1) / baseline_f1 * 100)  # relative gain in percent

0.0009972890516065647
0.15299652905191916


In [32]:
# Gain of the overall best grid-search candidate over the baseline pipeline.
# Both scores are derived from the notebook's own objects instead of being
# pasted from earlier cell outputs, so the comparison stays correct when the
# data, seed or search space changes.
baseline_f1 = cross_val_score(
    model_pipeline, train_x, train_y, n_jobs=-1, cv=5, scoring='f1_macro'
).mean()
best_f1 = combi_search.best_score_  # mean CV macro-F1 of the best candidate

print(best_f1 - baseline_f1)  # absolute macro-F1 gain
print((best_f1 - baseline_f1) / baseline_f1 * 100)  # relative gain in percent

0.03559020913198996
5.459980189950516


### Final result

We will now use our held-out test set (the 20% split created earlier) to estimate our model's performance in the real world.\
We will not make any changes to our model after this stage to avoid data leakage

In [28]:
# Predict labels for the held-out split with the best pipeline from the grid search.
prediction = best_model.predict(test_x)

In [29]:
# Macro-averaged F1 on the held-out split (same metric as used during the search).
f1_score(y_true=test_y, y_pred=prediction, average='macro')

0.7050865834200912

In [30]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=test_y, y_pred=prediction)

array([[555, 109],
       [136, 177]])

In [31]:
# The report is a pre-formatted string, so it is printed rather than displayed.
report_text = classification_report(y_true=test_y, y_pred=prediction)
print(report_text)

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       664
           1       0.62      0.57      0.59       313

    accuracy                           0.75       977
   macro avg       0.71      0.70      0.71       977
weighted avg       0.74      0.75      0.75       977

