In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

#baseline model
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [2]:
# Load the cleaned app-usage table; it is joined to the user labels on phone_no_m further down.
train_app_full = pd.read_csv('train/train_app_cleaned.csv')

In [3]:
# Keep only the join key and the label column from the user table.
user_columns = ['phone_no_m', 'label']
train_user = pd.read_csv('train/train_user.csv')[user_columns]

In [4]:
# Right-join so that every row of train_user is kept, even a user without a row in
# the app table (such a user would get NaN features).
# Any existing 'label' column is dropped first so the cell is idempotent: it
# overwrites train_app_full, and re-running it would otherwise merge the
# already-merged frame again and split the target into label_x / label_y.
train_app_full = pd.merge(
    train_app_full.drop(columns='label', errors='ignore'),
    train_user,
    on='phone_no_m',
    how='right',
)

In [5]:
# Preview the first rows of the merged frame (features plus label).
train_app_full.head()

Unnamed: 0,phone_no_m,data_with_known_app_no,data_with_known_app_yes,network_usage,label
0,00073ceecc0f7220a440580ac5dea410c90d14b6669458...,0.137097,0.862903,31.427467,0
1,00086f1d2e2c1227f811c3e17f2e9c37cf9971f47bb933...,0.076923,0.923077,4.166953,1
2,000c00db8809c27e723ba90582bf334b2d3ca9063f53fb...,1.0,0.0,0.0,1
3,0014b698069503ceadb9442605834729064be51cdd7002...,0.2,0.8,0.298681,1
4,0034030646f2008d36431e00b133db30efc8b0c31757f3...,0.20875,0.79125,17.019287,0


In [6]:
# Check dtypes and non-null counts after the right join.
train_app_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4884 entries, 0 to 4883
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   phone_no_m               4884 non-null   object 
 1   data_with_known_app_no   4884 non-null   float64
 2   data_with_known_app_yes  4884 non-null   float64
 3   network_usage            4884 non-null   float64
 4   label                    4884 non-null   int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 228.9+ KB


In [7]:
# Hold out 20% of the rows, stratified on the label so both splits keep the same
# class balance. phone_no_m (the join key) and label are removed from the features.
features = train_app_full.drop(columns=['phone_no_m', 'label'])
target = train_app_full['label']
train_x, test_x, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=0,
)

In [8]:
# Check the shape, dtypes and non-null counts of the training features.
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3907 entries, 3016 to 33
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   data_with_known_app_no   3907 non-null   float64
 1   data_with_known_app_yes  3907 non-null   float64
 2   network_usage            3907 non-null   float64
dtypes: float64(3)
memory usage: 122.1 KB


In [9]:
# network_usage is strongly right-skewed (its max is far above the 75th percentile),
# which is why RobustScaler is among the scalers compared below; note that the
# baseline pipeline itself uses StandardScaler (scalers[0]).
train_x.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
data_with_known_app_no,3907.0,0.266401,0.327295,0.0,0.055965,0.117107,0.277778,1.0
data_with_known_app_yes,3907.0,0.733599,0.327295,0.0,0.722222,0.882893,0.944035,1.0
network_usage,3907.0,25.937047,41.993301,0.0,0.092348,6.737151,35.169165,554.97145


### Baseline score

In [10]:
# Candidate scalers; index 0 (StandardScaler) is the one used by the baseline.
scalers = [
    StandardScaler(),
    RobustScaler(),
    MinMaxScaler(),
]

In [11]:
# Candidate imputers; index 0 (mean imputation) is the one used by the baseline.
# The iterative imputer is driven by a seeded decision-tree regressor.
tree_estimator = DecisionTreeRegressor(max_features='sqrt', random_state=0)
imputers = [
    SimpleImputer(strategy='mean'),
    KNNImputer(),
    SimpleImputer(strategy='median'),
    IterativeImputer(random_state=0, estimator=tree_estimator),
]

In [12]:
# Baseline preprocessing: StandardScaler followed by mean imputation.
baseline_steps = [('scale', scalers[0]), ('impute', imputers[0])]
preprocess = Pipeline(baseline_steps)

In [13]:
# Candidate classifiers, all seeded; index 1 (LogisticRegression) is the baseline model.
random_forest = RandomForestClassifier(n_jobs=-1, random_state=0)
logistic = LogisticRegression(n_jobs=-1, random_state=0)
ridge = RidgeClassifier(random_state=0)
models = [random_forest, logistic, ridge]

In [14]:
# Baseline pipeline: preprocessing followed by logistic regression (models[1]).
pipeline_steps = [('preprocess', preprocess), ('model', models[1])]
model_pipeline = Pipeline(steps=pipeline_steps)

In [15]:
# Baseline score: mean macro-F1 over 5-fold cross-validation on the training split.
baseline_cv_scores = cross_val_score(
    model_pipeline,
    train_x,
    train_y,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
baseline_cv_scores.mean()

0.4716959741671441

### Best combination

In [16]:
# Search space: every scaler x imputer x model combination. The keys address the
# nested pipeline steps using the '<step>__<substep>' naming of model_pipeline.
params = {
    'preprocess__scale': scalers,
    'preprocess__impute': imputers,
    'model': models,
}

In [17]:
# Alternative grid (disabled) that leaves the model fixed and searches only the preprocessing steps:
# params = dict(preprocess__scale=scalers, preprocess__impute=imputers)

In [18]:
# Exhaustive search over `params`, scored with macro-F1 on 5 folds, using all cores.
combi_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [19]:
# Run the grid search on the training split only; the trailing ';' hides the estimator repr.
combi_search.fit(train_x,train_y);

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.2s finished


In [20]:
# Tabulate the cross-validation results, best-ranked candidates first.
cv_table = pd.DataFrame(combi_search.cv_results_)
results = cv_table.sort_values('rank_test_score')

In [21]:
# Top candidates for the random forest.
# Rows are selected by estimator class name rather than by position in
# `results['param_model'].unique()`, whose order depends on how `results` is sorted
# and therefore does not say which model a cell is showing.
is_random_forest = (
    results['param_model'].map(lambda est: type(est).__name__) == 'RandomForestClassifier'
)
results[is_random_forest].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,0.594387,0.03794,0.124752,0.017617,"RandomForestClassifier(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,MinMaxScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.664533,0.720081,0.686205,0.739516,0.715857,0.705239,0.02656,1
2,0.406012,0.043584,0.106956,0.000875,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(),MinMaxScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.664533,0.720081,0.686205,0.739516,0.715857,0.705239,0.02656,1
8,0.667041,0.048495,0.129511,0.006795,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),MinMaxScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.664533,0.720081,0.686205,0.739516,0.715857,0.705239,0.02656,1
5,0.604524,0.032927,0.128411,0.019977,"RandomForestClassifier(n_jobs=-1, random_state=0)",KNNImputer(),MinMaxScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.664533,0.720081,0.686205,0.739516,0.715857,0.705239,0.02656,1
10,0.695245,0.066519,0.146172,0.030825,"RandomForestClassifier(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,RobustScaler(),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.66342,0.718211,0.689199,0.736539,0.712328,0.703939,0.025282,5


In [22]:
# Top candidates for the ridge classifier.
# Rows are selected by estimator class name rather than by position in
# `results['param_model'].unique()`, whose order depends on how `results` is sorted
# (position 1 is the ridge classifier here, although it is models[2]).
is_ridge = (
    results['param_model'].map(lambda est: type(est).__name__) == 'RidgeClassifier'
)
results[is_ridge].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
24,0.010921,0.002673,0.004067,0.000613,RidgeClassifier(random_state=0),SimpleImputer(),StandardScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13
33,0.033697,0.001806,0.004103,0.000663,RidgeClassifier(random_state=0),IterativeImputer(estimator=DecisionTreeRegress...,StandardScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13
32,0.012116,0.001443,0.004604,0.000333,RidgeClassifier(random_state=0),SimpleImputer(strategy='median'),MinMaxScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13
31,0.012188,0.001139,0.004394,8.8e-05,RidgeClassifier(random_state=0),SimpleImputer(strategy='median'),RobustScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13
30,0.009294,0.000494,0.003577,0.000533,RidgeClassifier(random_state=0),SimpleImputer(strategy='median'),StandardScaler(),"{'model': RidgeClassifier(random_state=0), 'pr...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13


In [23]:
# Top candidates for logistic regression.
# Rows are selected by estimator class name rather than by position in
# `results['param_model'].unique()`, whose order depends on how `results` is sorted
# (position 2 is logistic regression here, although it is models[1]).
is_logistic = (
    results['param_model'].map(lambda est: type(est).__name__) == 'LogisticRegression'
)
results[is_logistic].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__impute,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
23,0.128244,0.002503,0.004823,0.000571,"LogisticRegression(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,MinMaxScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13
17,0.111806,0.003263,0.003684,0.00083,"LogisticRegression(n_jobs=-1, random_state=0)",KNNImputer(),MinMaxScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13
21,0.1291,0.001957,0.003524,0.000407,"LogisticRegression(n_jobs=-1, random_state=0)",IterativeImputer(estimator=DecisionTreeRegress...,StandardScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13
20,0.112061,0.002224,0.003944,0.000942,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),MinMaxScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13
19,0.112005,0.001854,0.004641,0.001057,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(strategy='median'),RobustScaler(),"{'model': LogisticRegression(n_jobs=-1, random...",0.56989,0.404417,0.574721,0.404726,0.404726,0.471696,0.082162,13


### Best way to handle train_app

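scaler:
- standard/robust/MinMax -> uncertain (the scores differ only marginally between scalers)

impute:
- mean/model -> uncertain (every imputer gives identical scores, because this training data has no missing values to impute)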

In [24]:
# Best-ranked pipeline from the grid search; GridSearchCV refits it on the whole
# training split because `refit` was left at its default.
best_model = combi_search.best_estimator_

In [25]:
# Cross-validated macro-F1 of the best pipeline (the baseline scored 0.4716959741671441).
best_cv_scores = cross_val_score(
    best_model,
    train_x,
    train_y,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
best_cv_scores.mean()

0.7052386325245152

In [26]:
# Improvement of the best logistic-regression candidate over the baseline.
# The scores are copied from the cross-validation outputs above and named once,
# instead of repeating the raw literals inside every expression.
BASELINE_F1 = 0.4716959741671441     # baseline macro-F1 (5-fold CV)
LOGREG_BEST_F1 = 0.4716959741671441  # best mean_test_score among the LogisticRegression candidates

logreg_gain = LOGREG_BEST_F1 - BASELINE_F1
print(logreg_gain)                      # absolute improvement: 0
print(logreg_gain / BASELINE_F1 * 100)  # relative improvement: 0%

0.0
0.0


In [27]:
# Improvement of the best pipeline found by the grid search over the baseline.
# The scores are copied from the cross-validation outputs above and named once,
# instead of repeating the raw literals inside every expression.
BASELINE_F1 = 0.4716959741671441    # baseline macro-F1 (5-fold CV)
BEST_MODEL_F1 = 0.7052386325245152  # macro-F1 of the best pipeline (5-fold CV)

best_gain = BEST_MODEL_F1 - BASELINE_F1
print(best_gain)                      # absolute improvement: about 0.234
print(best_gain / BASELINE_F1 * 100)  # relative improvement: about 49.5%

0.23354265835737115
49.51126809376075


### Final result

We will now use the held-out test set (the 20% split that was not used for model selection) to estimate our model's performance in the real world.\
We will not make any changes to our model after this stage, to avoid data leakage.

In [28]:
# Predict labels for the held-out split, which was not used during the grid search.
prediction = best_model.predict(test_x)

In [29]:
# Macro-averaged F1 on the held-out split — the same metric used for model selection.
f1_score(test_y, prediction, average='macro')

0.70180512760886

In [30]:
# Confusion matrix on the held-out split (rows: true label, columns: predicted label).
confusion_matrix(test_y, prediction)

array([[542, 122],
       [130, 183]])

In [31]:
# Per-class precision / recall / F1 on the held-out split; print() is needed here
# because the report is returned as a pre-formatted multi-line string.
print(classification_report(test_y, prediction))

              precision    recall  f1-score   support

           0       0.81      0.82      0.81       664
           1       0.60      0.58      0.59       313

    accuracy                           0.74       977
   macro avg       0.70      0.70      0.70       977
weighted avg       0.74      0.74      0.74       977

