In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

#baseline model
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [2]:
# Only the join key and the target column are needed from the user table, so
# ask the parser for just those two instead of loading every column and then
# discarding the rest. The trailing [user_cols] keeps the same column order as
# the original selection (usecols alone follows the order in the file).
user_cols = ['phone_no_m', 'label']
train_user = pd.read_csv('train/train_user.csv', usecols=user_cols)[user_cols]

In [3]:
# Load the cleaned call ("voc") feature table. Judging by the merge that
# follows, it carries a phone_no_m key column alongside the features.
train_voc_full = pd.read_csv(filepath_or_buffer='train/train_voc_cleaned.csv')

In [4]:
# Attach the label from the user table to the call features.
# The outer join keeps phone numbers that appear in only one of the two
# tables; for those rows the columns coming from the other table are NaN.
#
# This cell overwrites its own input, so any 'label' column left by a previous
# run is dropped first. Without that, re-running the cell would merge the
# label a second time, produce 'label_x'/'label_y', and break the later
# selection of the 'label' column. On a first run the drop is a no-op.
train_voc_full = pd.merge(
    train_voc_full.drop(columns='label', errors='ignore'),
    train_user,
    on='phone_no_m',
    how='outer',
)

In [5]:
# Separate the target from the predictors (the phone number is an identifier,
# not a feature), then hold out a test split. Stratifying on the label keeps
# the class proportions the same in both parts; random_state fixes the shuffle.
voc_labels = train_voc_full['label']
voc_features = train_voc_full.drop(columns=['phone_no_m', 'label'])

train_x, test_x, train_y, test_y = train_test_split(
    voc_features,
    voc_labels,
    stratify=voc_labels,
    random_state=0,
)

In [6]:
# Column dtypes and non-null counts of the training features. Any gap between
# the number of entries and the non-null counts is what the imputers defined
# further down have to fill.
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3663 entries, 610 to 32
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   call_dur                   3613 non-null   float64
 1   voc_opposite_no_m          3613 non-null   float64
 2   voc_calltype_1             3613 non-null   float64
 3   voc_calltype_2             3613 non-null   float64
 4   voc_calltype_3             3613 non-null   float64
 5   voc_receive_unique_county  3613 non-null   float64
dtypes: float64(6)
memory usage: 200.3 KB


In [7]:
# Peek at the first rows to sanity-check the feature values.
train_x.head()

Unnamed: 0,call_dur,voc_opposite_no_m,voc_calltype_1,voc_calltype_2,voc_calltype_3,voc_receive_unique_county
610,43.168067,15.0,0.235294,0.764706,0.0,2.0
1260,63.925532,76.0,0.978723,0.021277,0.0,0.0
3252,100.036364,109.0,0.990909,0.009091,0.0,4.0
3413,102.987871,274.0,0.322102,0.677898,0.0,11.0
601,100.127056,198.0,0.569484,0.430516,0.0,16.0


In [8]:
# Summary statistics, transposed so that each feature is one row.
train_x.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
call_dur,3613.0,104.476081,88.923224,2.0,55.209209,80.502879,124.205882,1360.0
voc_opposite_no_m,3613.0,234.257681,799.041788,1.0,32.0,77.0,182.0,13746.0
voc_calltype_1,3613.0,0.645566,0.275716,0.0,0.442029,0.592105,0.96748,1.0
voc_calltype_2,3613.0,0.353311,0.275258,0.0,0.032258,0.407255,0.556838,1.0
voc_calltype_3,3613.0,0.001123,0.013184,0.0,0.0,0.0,0.0,0.476923
voc_receive_unique_county,3613.0,5.318018,6.566965,0.0,1.0,3.0,7.0,61.0


## Baseline score

In [9]:
# Candidate scalers for the grid search, each with default settings.
# Index 0 (StandardScaler) is the one used by the baseline pipeline.
scalers = [scaler_cls() for scaler_cls in (StandardScaler, RobustScaler, MinMaxScaler)]

In [10]:
# Candidate strategies for filling missing feature values.
# Index 0 (mean imputation) is the one used by the baseline pipeline.
imputers = [
    SimpleImputer(strategy='mean'),
    KNNImputer(),
    SimpleImputer(strategy='median'),
    # Model-based imputation: each feature is predicted from the others with a
    # decision tree; both random_state values are fixed for reproducibility.
    IterativeImputer(
        estimator=DecisionTreeRegressor(max_features='sqrt', random_state=0),
        random_state=0,
    ),
]

In [11]:
# Baseline preprocessing: first scaler, then first imputer from the candidate
# lists. The step names ('scale', 'impute') are the handles the grid search
# uses to swap in the other candidates.
# Scaling runs before imputation, so this relies on the scalers tolerating NaNs.
preprocess = Pipeline(steps=[
    ('scale', scalers[0]),
    ('impute', imputers[0]),
])

In [12]:
# Candidate classifiers, all with default hyper-parameters and a fixed seed.
models = [
    RandomForestClassifier(random_state=0, n_jobs=-1),
    LogisticRegression(random_state=0, n_jobs=-1),
    RidgeClassifier(random_state=0),
]

In [48]:
# Full baseline pipeline: preprocessing followed by models[1], the logistic
# regression. The 'model' step name lets the grid search replace the classifier.
model_pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', models[1]),
])

In [49]:
# Baseline: macro-F1 of the untuned pipeline, averaged over 4 CV folds.
baseline_fold_scores = cross_val_score(
    model_pipeline, train_x, train_y,
    scoring='f1_macro', cv=4, n_jobs=-1,
)
baseline_fold_scores.mean()

0.7881887106438912

## Best combination

In [17]:
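# Uncomment to list the tunable parameter names (e.g. preprocess__scale) used in the grid below:
#model_pipeline.get_params()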

In [57]:
# Search space: every scaler x every imputer x every model.
# Keys address pipeline steps as '<step>__<nested step>'; the bare 'model' key
# replaces the whole classifier step.
params = {
    'preprocess__scale': scalers,
    'preprocess__impute': imputers,
    'model': models,
}

In [51]:
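# Alternative grid: keep the baseline model (logistic regression) and search only the
# preprocessing steps, for a like-for-like comparison with the baseline score.
# Uncomment to override `params`:
# params = dict(preprocess__scale=scalers, preprocess__impute=imputers)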

In [58]:
# Exhaustive search over the grid, scored by macro-F1 with 5-fold CV.
# Note the baseline score was computed with cv=4, so the fold counts differ.
combi_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [59]:
# Run the search: 3 scalers x 4 imputers x 3 models = 36 candidates, each
# scored with 5-fold CV. The trailing ';' suppresses the returned object's repr.
combi_search.fit(train_x,train_y);

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 165 out of 180 | elapsed:    8.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.8s finished


In [42]:
# One row per parameter combination, best-ranked first.
cv_table = pd.DataFrame(combi_search.cv_results_)
results = cv_table.sort_values('rank_test_score')

In [43]:
# Five best preprocessing combinations for the random forest.
# The model is selected by estimator class name rather than by its position in
# `unique()`, because that position depends on how `results` happens to be
# sorted and would silently point at a different model if the ranking changed.
is_random_forest = (
    results['param_model'].map(lambda est: type(est).__name__) == 'RandomForestClassifier'
)
results[is_random_forest].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,1.053278,0.392209,0.119317,0.004383,"RandomForestClassifier(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,RobustScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.839293,0.827314,0.848002,0.828661,0.854324,0.839519,0.010565,1
6,0.648087,0.008259,0.193847,0.016094,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),StandardScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.839293,0.823928,0.846571,0.823307,0.856179,0.837855,0.012801,2
8,0.573837,0.029793,0.110426,0.003303,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),MinMaxScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.839293,0.823928,0.846571,0.824742,0.854725,0.837852,0.012071,3
7,0.597575,0.094106,0.142035,0.059319,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),RobustScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.839293,0.8225,0.845143,0.825268,0.856179,0.837676,0.01253,4
11,1.055159,0.261524,0.12133,0.006833,"RandomForestClassifier(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,MinMaxScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.836882,0.8225,0.845548,0.823835,0.857635,0.83728,0.013282,5


In [44]:
# Five best preprocessing combinations for the logistic regression.
# The model is selected by estimator class name rather than by its position in
# `unique()`, because that position depends on how `results` happens to be
# sorted and would silently point at a different model if the ranking changed.
is_logistic_regression = (
    results['param_model'].map(lambda est: type(est).__name__) == 'LogisticRegression'
)
results[is_logistic_regression].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,0.1156,0.003081,0.003093,6.9e-05,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.777134,0.805832,0.780367,0.795947,0.807115,0.793279,0.012518,13
13,0.129317,0.009933,0.026999,0.020058,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(),RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.77583,0.803023,0.780628,0.794732,0.806585,0.79216,0.012103,14
16,0.129478,0.002664,0.011666,0.003921,"LogisticRegression(n_jobs=-1, random_state=0)",KNNImputer(),RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.77583,0.803023,0.780628,0.794732,0.806585,0.79216,0.012103,14
22,0.365264,0.147307,0.010979,0.004606,"LogisticRegression(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.775133,0.803612,0.770594,0.800712,0.809891,0.791989,0.01596,16
15,0.117876,0.001923,0.012848,0.001714,"LogisticRegression(n_jobs=-1, random_state=0)",KNNImputer(),StandardScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.774476,0.804426,0.77797,0.791801,0.805731,0.790881,0.012966,17


In [45]:
# Five best preprocessing combinations for the ridge classifier.
# The model is selected by estimator class name rather than by its position in
# `unique()`, because that position depends on how `results` happens to be
# sorted and would silently point at a different model if the ranking changed.
is_ridge_classifier = (
    results['param_model'].map(lambda est: type(est).__name__) == 'RidgeClassifier'
)
results[is_ridge_classifier].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
31,0.018946,0.003686,0.005624,0.001129,RidgeClassifier(random_state=0),SimpleImputer(strategy='median'),RobustScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.757075,0.759003,0.748123,0.748381,0.790289,0.760574,0.015501,21
34,0.276534,0.128606,0.010448,0.001683,RidgeClassifier(random_state=0),IterativeImputer(estimator=DecisionTreeRegress...,RobustScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.755769,0.759003,0.746326,0.748381,0.790289,0.759954,0.015864,23
25,0.012558,0.001458,0.004485,0.000121,RidgeClassifier(random_state=0),SimpleImputer(),RobustScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.755769,0.75514,0.748123,0.750273,0.788951,0.759651,0.014931,24
28,0.025776,0.000967,0.008997,0.000703,RidgeClassifier(random_state=0),KNNImputer(),RobustScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.755769,0.75514,0.748123,0.750273,0.788951,0.759651,0.014931,24
26,0.012185,0.001014,0.005058,0.000345,RidgeClassifier(random_state=0),SimpleImputer(),MinMaxScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.759617,0.760314,0.746326,0.743911,0.787615,0.759556,0.015541,26


## Best way to handle train_voc_cleaned

scaler:

- RobustScaler → confirmed (it is the top-ranked scaler for all three models)

imputer:

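- median imputation or the decision-tree-based IterativeImputer → still uncertain (their mean scores differ by less than the fold-to-fold standard deviation)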

In [54]:
# Pipeline built from the best-ranked parameter combination of the grid search.
# NOTE(review): with GridSearchCV's default refit setting this should already
# be refit on all of train_x/train_y — confirm.
best_model = combi_search.best_estimator_

In [55]:
#baseline score: 0.7881887106438912
# Score the selected pipeline with the same 4-fold macro-F1 protocol as the
# baseline so the two numbers are directly comparable.
# NOTE(review): the output stored under this cell appears to come from the
# preprocessing-only grid (the next cell calls it the "log model" score);
# re-run after the full grid search to refresh it.
best_fold_scores = cross_val_score(
    best_model, train_x, train_y,
    scoring='f1_macro', cv=4, n_jobs=-1,
)
best_fold_scores.mean()

0.7914336263088502

In [56]:
# Gain over the baseline when the model stays a logistic regression and only
# the preprocessing is tuned. Both scores are copied by hand from earlier cell
# outputs, so they must be updated if those cells are re-run.
baseline_f1 = 0.7881887106438912
tuned_log_f1 = 0.7914336263088502

log_gain = tuned_log_f1 - baseline_f1
print(log_gain)                      # absolute gain, about 0.003
print(log_gain / baseline_f1 * 100)  # relative gain, about 0.412 %

0.0032449156649589916
0.4116927356531329


In [50]:
# Gain over the baseline when the model is also allowed to change.
# NOTE(review): 0.8381347093143038 is not produced by any cell output shown in
# this notebook — presumably the 4-fold score of the best pipeline from the
# full grid; confirm and refresh if cells are re-run.
baseline_f1 = 0.7881887106438912
best_overall_f1 = 0.8381347093143038

best_gain = best_overall_f1 - baseline_f1
print(best_gain)                      # absolute gain, about 0.05
print(best_gain / baseline_f1 * 100)  # relative gain, about 6.34 %

0.049945998670412606
6.336807162539852


# Final result

We will now use the held-out test set (`test_x`, `test_y`) to estimate how the model performs on unseen data.\
We will not make any further changes to the model after this stage, to avoid data leakage.

In [60]:
# Predict labels for the held-out split, which was not used during the search.
prediction = best_model.predict(test_x)

In [61]:
# Macro-averaged F1 on the held-out split (same metric as the CV scoring).
f1_score(y_true=test_y, y_pred=prediction, average='macro')

0.80322480067703

In [62]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=test_y, y_pred=prediction)

array([[764,  66],
       [133, 258]])

In [63]:
# classification_report returns a pre-formatted string, so it is printed
# rather than displayed as the cell's value.
report_text = classification_report(y_true=test_y, y_pred=prediction)
print(report_text)

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       830
           1       0.80      0.66      0.72       391

    accuracy                           0.84      1221
   macro avg       0.82      0.79      0.80      1221
weighted avg       0.83      0.84      0.83      1221

