In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [47]:
#preprocessing
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

#baseline model
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [3]:
train_user_original = pd.read_csv('train/train_user.csv')

In [4]:
train_user, test_user, _, __ = train_test_split(train_user_original,
                                                np.arange(train_user_original.shape[0]),
                                                random_state=0,test_size=0.2,stratify=train_user_original['label'])

In [5]:
train_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3907 entries, 3091 to 32
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   phone_no_m   3907 non-null   object 
 1   city_name    3718 non-null   object 
 2   county_name  3718 non-null   object 
 3   idcard_cnt   3907 non-null   int64  
 4   arpu_201908  2751 non-null   float64
 5   arpu_201909  2850 non-null   float64
 6   arpu_201910  2982 non-null   float64
 7   arpu_201911  3244 non-null   float64
 8   arpu_201912  3358 non-null   float64
 9   arpu_202001  3286 non-null   float64
 10  arpu_202002  3301 non-null   float64
 11  arpu_202003  3443 non-null   float64
 12  label        3907 non-null   int64  
dtypes: float64(8), int64(2), object(3)
memory usage: 427.3+ KB


In [6]:
train_user.head()

Unnamed: 0,phone_no_m,city_name,county_name,idcard_cnt,arpu_201908,arpu_201909,arpu_201910,arpu_201911,arpu_201912,arpu_202001,arpu_202002,arpu_202003,label
3091,11a1f8d7ba49db1780b9c5636673c7aaad2f193e16ba48...,天府新区,双流分公司,4,,,3.36,19.2,19.0,19.1,19.0,61.88,1
3342,53bff895930f4b7ae7e0daf3ab38f68b47d24af6780eba...,南充,顺庆分公司,1,,,,,,,,2.18,0
2740,7335a06204959c3b18c70069e0b91a791d1f9ccf475ed5...,成都,都江堰分公司,1,30.68,27.44,117.08,47.6,33.2,30.86,23.12,35.18,0
1826,b01f24a1a27204dd81ae0ba1b44044844c2cfe6816c409...,成都,温江分公司,4,,,,50.47,19.0,19.0,19.0,19.0,1
4536,b3d3519ba894c9230b023c716bea8d20b0b1cc4a0b05f9...,绵阳,北川分公司,1,28.9,21.12,21.0,28.12,21.0,23.84,21.0,27.41,0


## county_name

In [7]:
county_name = train_user['county_name']

In [8]:
county_name.describe()

count      3718
unique      177
top       武侯分公司
freq        172
Name: county_name, dtype: object

#### Missing values

In [9]:
#we need to handle the missing values
county_name.isna().sum()

189

In [10]:
#clearly those without a county_names are more likely to be spam calls
train_user.loc[train_user['county_name'].isna(),['county_name','label','phone_no_m']].groupby(['label']).count().\
                sort_values('phone_no_m',ascending=False).head(15)

Unnamed: 0_level_0,county_name,phone_no_m
label,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,153
0,0,36


In [11]:
#the most frequent value: 武侯分公司 spreads across spam and no spam call
#worry is this might mean that the na value filled will lose the preditive power
#when we clearly see that those without a county label are more likely to be spam calls
train_user[['county_name','label','phone_no_m']].groupby(['label','county_name']).count().\
                sort_values('phone_no_m',ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,phone_no_m
label,county_name,Unnamed: 2_level_1
0,成都直属部门,96
0,天府直属部门,93
0,武侯分公司,88
1,武侯分公司,84
1,锦江分公司,72
0,锦江分公司,62
0,青羊分公司,62
0,金牛分公司,56
0,成华分公司,56
0,郫都分公司,51


#### Encoding

In [12]:
#some counties dominates the source of the phone number, while others occupy very little percentage
#since there is a total of 180 counties, we might want to create a group that doesn't have spam calls so far
county_name.value_counts()/county_name.count()*100

武侯分公司       4.626143
天府直属部门      3.738569
锦江分公司       3.604088
成都直属部门      3.577192
青羊分公司       3.012372
              ...   
眉山直属部门      0.026896
长宁县分公司      0.026896
普通及零售分销商    0.026896
九寨沟县分公司     0.026896
马尔康县分公司     0.026896
Name: county_name, Length: 177, dtype: float64

In [13]:
#nearly 30 percent of the counties do not have a past record of spam calls
county_label = train_user[['county_name','label']].groupby('county_name').sum().\
                            sort_values(by='label',ascending=False).reset_index()
np.sum(county_label['label'] == 0) / county_label['county_name'].count() * 100

28.24858757062147

## idcard_cnt

In [14]:
#lucky for us, there is no na for us to handle here
#since it's a ordered data (10 id counts > 1 id count), we will use label encoder
print(train_user['idcard_cnt'].isna().sum())
train_user['idcard_cnt'].value_counts()

0


1     1770
2      961
3      532
4      317
5      284
0       30
6       10
10       2
13       1
Name: idcard_cnt, dtype: int64

In [15]:
#we see that different idcard_cnt has very diff probability of spam call
train_user[['idcard_cnt','label']].groupby(['idcard_cnt']).sum()/\
        train_user[['idcard_cnt','label']].groupby(['idcard_cnt']).count()

Unnamed: 0_level_0,label
idcard_cnt,Unnamed: 1_level_1
0,0.9
1,0.176836
2,0.261186
3,0.413534
4,0.59306
5,0.852113
6,0.8
10,1.0
13,1.0


### arpu_20xxxx:

In [16]:
#in average, spam callers spend much more than normal users
train_user[['arpu_201909', 'arpu_201910', 'arpu_201911', 'arpu_201912',
       'arpu_202001', 'arpu_202002', 'arpu_202003', 'label']].groupby('label').sum()/train_user[['arpu_201909', 'arpu_201910', 'arpu_201911', 'arpu_201912',
       'arpu_202001', 'arpu_202002', 'arpu_202003', 'label']].groupby('label').count()

Unnamed: 0_level_0,arpu_201909,arpu_201910,arpu_201911,arpu_201912,arpu_202001,arpu_202002,arpu_202003
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,57.387099,57.707644,57.130558,57.516684,56.712439,54.416695,55.236554
1,63.778444,63.391505,85.147719,108.59674,72.672601,80.063333,91.811764


In [17]:
#and we observe that spam callers always spend much higher than the average amount of all users
#we can consider to recode the amount spend as a binary variable: higher or lower than mean
train_user[['arpu_201909', 'arpu_201910', 'arpu_201911', 'arpu_201912',
       'arpu_202001', 'arpu_202002', 'arpu_202003']].mean()

arpu_201909    58.367105
arpu_201910    58.771224
arpu_201911    63.944843
arpu_201912    69.974863
arpu_202001    60.185207
arpu_202002    59.940706
arpu_202003    63.607528
dtype: float64

In [18]:
#we need to handle the missing values
train_user[['arpu_201909', 'arpu_201910', 'arpu_201911', 'arpu_201912',
       'arpu_202001', 'arpu_202002', 'arpu_202003']].isna().sum()

arpu_201909    1057
arpu_201910     925
arpu_201911     663
arpu_201912     549
arpu_202001     621
arpu_202002     606
arpu_202003     464
dtype: int64

In [19]:
#we see that we are lacking more of the spending amt by spam callers
#therefore, we can't simply impute average, which will disrupte our prediction
#let us use the county mean to impute the average, which is correlated with the spam call
train_user[['arpu_201909', 'arpu_201910', 'arpu_201911', 'arpu_201912',
       'arpu_202001', 'arpu_202002', 'arpu_202003','label']].groupby('label').count() - train_user.shape[0]

Unnamed: 0_level_0,arpu_201909,arpu_201910,arpu_201911,arpu_201912,arpu_202001,arpu_202002,arpu_202003
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-1494,-1483,-1452,-1368,-1336,-1317,-1252
1,-3470,-3349,-3118,-3088,-3192,-3196,-3119


In [20]:
missing_arpu = train_user.drop(['city_name','idcard_cnt','phone_no_m'],axis=1).groupby(['label','county_name']).count()\
            -np.repeat(train_user[['county_name','phone_no_m','label']].groupby(['label','county_name']).\
                        count().values,8,axis=1)

In [21]:
missing_arpu.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,arpu_201908,arpu_201909,arpu_201910,arpu_201911,arpu_201912,arpu_202001,arpu_202002,arpu_202003
label,county_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,万源市分公司,-1,-1,-1,-1,0,0,0,0
0,三台分公司,0,0,0,0,0,0,0,0
0,东兴区分公司,-2,-2,-2,-2,-1,-1,-1,0
0,东区分公司,-3,-3,-3,-2,-1,-1,0,0
0,东坡区分公司,-5,-5,-5,-5,-2,-2,-2,0


# Building baseline

Here are the preprocessing and feature engineering steps we'll take for train_user.csv

city_name:

- drop

county_name:

- fillna with:

    1. no_record (preferred)

    2. most frequent county

- encoding:

    1. OHE

    2. Group county into individual county and no past record of spam call county

idcard_cnt:

- encoding
    1. label encoding: nothing to be done

arpu_20xxxx:

- na: impute by model

- Feature engineering:

    1. Raw numbers
    2. binary variable: higher or lower than mean

In [22]:
train_x, train_y = train_user.drop(['phone_no_m','label'],axis=1), train_user['label']
test_x, test_y = test_user.drop(['phone_no_m','label'],axis=1), test_user['label']

In [23]:
#city_name
def drop_city_name(df):
    return df.iloc[:,1:]
drop_city_name_transformer = FunctionTransformer(drop_city_name)

In [24]:
#county_name: impute + encoding
county_name_impute_no_record = SimpleImputer(strategy='constant',fill_value='no_record')
county_name_impute_most_frequent = SimpleImputer(strategy='most_frequent')
county_name_OHE = OneHotEncoder(handle_unknown='ignore')
county_spam_call_record = train_user[['county_name','label']].groupby('county_name').\
                                sum().sort_values(by='label').reset_index()
county_no_spam_call_record = county_spam_call_record.loc[county_spam_call_record['label']==0,'county_name']
def county_name_grouping(county_name):
    index= np.isin(county_name,county_no_spam_call_record)
    new_county_name = np.where(index,'no_past_spam_call_record',county_name)
    return new_county_name
county_name_grouping_transformer = FunctionTransformer(county_name_grouping)

In [25]:
#arpu_20xxxx: impute + feature engineering
arpu_20xxxx_impute = IterativeImputer(random_state=0,\
                                      estimator=DecisionTreeRegressor(max_features='sqrt', random_state=0))
def arpu_20xxxx_compare_mean(arpu_20xxxx):
    mean = arpu_20xxxx.mean()
    return arpu_20xxxx > mean
arpu_20xxxx_compare_mean_transformer = FunctionTransformer(arpu_20xxxx_compare_mean)

In [26]:
#pipeline
county_name_pipe = Pipeline([('impute',county_name_impute_no_record),('encode',county_name_OHE)])
arpu_20xxxx_pipe = Pipeline([('impute',arpu_20xxxx_impute),('transform',arpu_20xxxx_compare_mean_transformer),
                            ('scale',StandardScaler())])
train_user_preprocess_ct = ColumnTransformer([('county_name',county_name_pipe,['county_name']),
                               ('arpu_20xxxx',arpu_20xxxx_pipe,['arpu_201908', 'arpu_201909', 'arpu_201910', 
                                                                'arpu_201911', 'arpu_201912','arpu_202001',
                                                                'arpu_202002', 'arpu_202003'])], 
                               remainder='passthrough',n_jobs=-1)
train_user_preprocess = Pipeline([['drop',drop_city_name_transformer],
                                  ('col_transform',train_user_preprocess_ct)])

In [27]:
models = [RandomForestClassifier(n_jobs=-1,random_state=0),
         LogisticRegression(n_jobs=-1,random_state=0),
         RidgeClassifier(random_state=0)]

In [28]:
model_pipeline = Pipeline([('preprocess',train_user_preprocess),('model',models[1])])

In [29]:
#we verify that our preprocessing is working, we have OHE of county, scaled total spending in a month
#and passthrough idcard_cnt
train_user_preprocess.fit_transform(train_x).todense()[0,:]

matrix([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.

In [30]:
#our baseline score
cross_val_score(model_pipeline, train_x, train_y, n_jobs=-1, cv=4, scoring='f1_macro').mean()

0.7439087059542641

## Best combination

In [31]:
#model_pipeline.get_params()

In [32]:
params = {
'preprocess__col_transform__county_name__impute':[county_name_impute_no_record,
    county_name_impute_most_frequent], #county_name impute: no record or most frequent
'preprocess__col_transform__county_name__encode':[county_name_OHE,
    make_pipeline(county_name_grouping_transformer,county_name_OHE)], #county_name encode: OHE or group then OHE
'preprocess__col_transform__arpu_20xxxx__impute':[arpu_20xxxx_impute,
    SimpleImputer(strategy='mean'), KNNImputer(),
    SimpleImputer(strategy='median')], #arpu_20xxxx impute: by model or by mean
'preprocess__col_transform__arpu_20xxxx__transform':[None,
    arpu_20xxxx_compare_mean_transformer], #arpu_20xxxx feature engineering: raw number or binary encoded
'preprocess__col_transform__arpu_20xxxx__scale': [StandardScaler(), 
                                                  RobustScaler(), MinMaxScaler()], #diff scaler
'model': models
} 

In [33]:
#to compare with baseline logistic model
# params = {
# 'preprocess__col_transform__county_name__impute':[county_name_impute_no_record,
#     county_name_impute_most_frequent], #county_name impute: no record or most frequent
# 'preprocess__col_transform__county_name__encode':[county_name_OHE,
#     make_pipeline(county_name_grouping_transformer,county_name_OHE)], #county_name encode: OHE or group then OHE
# 'preprocess__col_transform__arpu_20xxxx__impute':[arpu_20xxxx_impute,
#     SimpleImputer(strategy='mean'), KNNImputer(),
#     SimpleImputer(strategy='median')], #arpu_20xxxx impute: by model or by mean
# 'preprocess__col_transform__arpu_20xxxx__transform':[None,
#     arpu_20xxxx_compare_mean_transformer], #arpu_20xxxx feature engineering: raw number or binary encoded
# 'preprocess__col_transform__arpu_20xxxx__scale': [StandardScaler(), 
#                                                   RobustScaler(), MinMaxScaler()] #diff scaler
# } 

In [34]:
combi_search = GridSearchCV(model_pipeline,params,scoring='f1_macro',cv=5,n_jobs=-1,verbose=1)

In [35]:
combi_search.fit(train_x,train_y);

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  3.5min finished


In [36]:
results = pd.DataFrame(combi_search.cv_results_).sort_values(by='rank_test_score')

In [37]:
results[results['param_model'] == results['param_model'].unique()[0]].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__col_transform__arpu_20xxxx__impute,param_preprocess__col_transform__arpu_20xxxx__scale,param_preprocess__col_transform__arpu_20xxxx__transform,param_preprocess__col_transform__county_name__encode,param_preprocess__col_transform__county_name__impute,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
43,1.255712,0.081891,0.220256,0.003634,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(),MinMaxScaler(),,(FunctionTransformer(func=<function county_nam...,SimpleImputer(strategy='most_frequent'),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.850404,0.886868,0.885171,0.873788,0.854434,0.870133,0.0152,1
35,1.377542,0.084891,0.224064,0.010798,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(),RobustScaler(),,(FunctionTransformer(func=<function county_nam...,SimpleImputer(strategy='most_frequent'),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.850404,0.886868,0.885171,0.873788,0.854434,0.870133,0.0152,1
27,1.268626,0.049832,0.217957,0.003197,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(),StandardScaler(),,(FunctionTransformer(func=<function county_nam...,SimpleImputer(strategy='most_frequent'),"{'model': RandomForestClassifier(n_jobs=-1, ra...",0.850404,0.886868,0.885171,0.873788,0.853075,0.869861,0.015488,3
24,1.033141,0.067154,0.2195,0.003463,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(),StandardScaler(),,OneHotEncoder(handle_unknown='ignore'),"SimpleImputer(fill_value='no_record', strategy...","{'model': RandomForestClassifier(n_jobs=-1, ra...",0.852579,0.877885,0.88599,0.869234,0.850509,0.867239,0.013883,4
32,1.233067,0.085539,0.217383,0.001012,"RandomForestClassifier(n_jobs=-1, random_state=0)",SimpleImputer(),RobustScaler(),,OneHotEncoder(handle_unknown='ignore'),"SimpleImputer(fill_value='no_record', strategy...","{'model': RandomForestClassifier(n_jobs=-1, ra...",0.852579,0.877885,0.88599,0.869234,0.850509,0.867239,0.013883,4


In [38]:
results[results['param_model'] == results['param_model'].unique()[1]].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__col_transform__arpu_20xxxx__impute,param_preprocess__col_transform__arpu_20xxxx__scale,param_preprocess__col_transform__arpu_20xxxx__transform,param_preprocess__col_transform__county_name__encode,param_preprocess__col_transform__county_name__impute,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
127,0.218719,0.002617,0.110802,0.002244,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(),StandardScaler(),FunctionTransformer(func=<function arpu_20xxxx...,(FunctionTransformer(func=<function county_nam...,SimpleImputer(strategy='most_frequent'),"{'model': LogisticRegression(n_jobs=-1, random...",0.788918,0.789408,0.845689,0.829652,0.806509,0.812035,0.022449,49
135,0.216588,0.002085,0.111673,0.003045,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(),RobustScaler(),FunctionTransformer(func=<function arpu_20xxxx...,(FunctionTransformer(func=<function county_nam...,SimpleImputer(strategy='most_frequent'),"{'model': LogisticRegression(n_jobs=-1, random...",0.788918,0.789408,0.843925,0.826527,0.807798,0.811315,0.021395,50
143,0.217033,0.002714,0.111526,0.001329,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(),MinMaxScaler(),FunctionTransformer(func=<function arpu_20xxxx...,(FunctionTransformer(func=<function county_nam...,SimpleImputer(strategy='most_frequent'),"{'model': LogisticRegression(n_jobs=-1, random...",0.788918,0.789408,0.843925,0.826527,0.807798,0.811315,0.021395,50
126,0.217833,0.001924,0.112033,0.002144,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(),StandardScaler(),FunctionTransformer(func=<function arpu_20xxxx...,(FunctionTransformer(func=<function county_nam...,"SimpleImputer(fill_value='no_record', strategy...","{'model': LogisticRegression(n_jobs=-1, random...",0.792117,0.782987,0.845689,0.829197,0.806509,0.8113,0.023219,52
142,0.219129,0.001666,0.112283,0.0044,"LogisticRegression(n_jobs=-1, random_state=0)",SimpleImputer(),MinMaxScaler(),FunctionTransformer(func=<function arpu_20xxxx...,(FunctionTransformer(func=<function county_nam...,"SimpleImputer(fill_value='no_record', strategy...","{'model': LogisticRegression(n_jobs=-1, random...",0.790816,0.782987,0.847046,0.829197,0.801696,0.810349,0.024112,61


In [39]:
results[results['param_model'] == results['param_model'].unique()[2]].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_preprocess__col_transform__arpu_20xxxx__impute,param_preprocess__col_transform__arpu_20xxxx__scale,param_preprocess__col_transform__arpu_20xxxx__transform,param_preprocess__col_transform__county_name__encode,param_preprocess__col_transform__county_name__impute,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
229,0.120567,0.003059,0.109674,0.000745,RidgeClassifier(random_state=0),SimpleImputer(),RobustScaler(),FunctionTransformer(func=<function arpu_20xxxx...,OneHotEncoder(handle_unknown='ignore'),SimpleImputer(strategy='most_frequent'),"{'model': RidgeClassifier(random_state=0), 'pr...",0.783848,0.791281,0.816276,0.832308,0.782361,0.801215,0.019745,79
237,0.123723,0.002727,0.110619,0.001063,RidgeClassifier(random_state=0),SimpleImputer(),MinMaxScaler(),FunctionTransformer(func=<function arpu_20xxxx...,OneHotEncoder(handle_unknown='ignore'),SimpleImputer(strategy='most_frequent'),"{'model': RidgeClassifier(random_state=0), 'pr...",0.783848,0.791281,0.816276,0.832308,0.782361,0.801215,0.019745,79
231,0.12063,0.00297,0.110884,0.002188,RidgeClassifier(random_state=0),SimpleImputer(),RobustScaler(),FunctionTransformer(func=<function arpu_20xxxx...,(FunctionTransformer(func=<function county_nam...,SimpleImputer(strategy='most_frequent'),"{'model': RidgeClassifier(random_state=0), 'pr...",0.785145,0.792575,0.816276,0.82867,0.781748,0.800883,0.018395,81
239,0.12754,0.004906,0.112654,0.003719,RidgeClassifier(random_state=0),SimpleImputer(),MinMaxScaler(),FunctionTransformer(func=<function arpu_20xxxx...,(FunctionTransformer(func=<function county_nam...,SimpleImputer(strategy='most_frequent'),"{'model': RidgeClassifier(random_state=0), 'pr...",0.785145,0.792575,0.816276,0.82867,0.781748,0.800883,0.018395,81
221,0.120577,0.003225,0.109793,0.002617,RidgeClassifier(random_state=0),SimpleImputer(),StandardScaler(),FunctionTransformer(func=<function arpu_20xxxx...,OneHotEncoder(handle_unknown='ignore'),SimpleImputer(strategy='most_frequent'),"{'model': RidgeClassifier(random_state=0), 'pr...",0.782554,0.791281,0.816276,0.830019,0.782361,0.800498,0.019268,83


# Best way to handle train_user

Therefore, we have concluded we will:
city_name:

- drop

county_name:

- impute by most frequent → confirm
- encode by grouping first → highly certain

idcard_cnt:

- do nothing

arpu_20xxxx:

- impute by mean → confirm
- transform into higher or lower than mean → uncertain, rf doesn't need but linear model does
- scaler (MinMax/Robust) → uncertain

In [40]:
best_model = combi_search.best_estimator_

In [41]:
#our baseline score was: 0.7439087059542641
cross_val_score(best_model, train_x, train_y, n_jobs=-1, cv=4, scoring='f1_macro').mean()

0.871036790680468

In [42]:
#when comparing with best logistic model (by not search over model space)
print(0.7699763164728499 - 0.7439087059542641) #improve score by 0.026
print((0.7699763164728499 - 0.7439087059542641)/0.7439087059542641 * 100) #or 3.5% improvement

0.02606761051858586
3.5041410740242784


In [43]:
#when comparing with best model
print(0.871036790680468 - 0.7439087059542641) #improve score by 0.127
print((0.871036790680468 - 0.7439087059542641)/0.7439087059542641 * 100) #or 17% improvement

0.12712808472620396
17.08920512808461


# Final result

We will now use our validation set to estimate our model's performance in the real world.\
We will not make any changes to our model after this stage to avoid data leakage

In [45]:
prediction = best_model.predict(test_x)

In [46]:
f1_score(test_y, prediction, average='macro')

0.872883820603092

In [48]:
confusion_matrix(test_y, prediction)

array([[644,  20],
       [ 82, 231]])

In [50]:
print(classification_report(test_y, prediction))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       664
           1       0.92      0.74      0.82       313

    accuracy                           0.90       977
   macro avg       0.90      0.85      0.87       977
weighted avg       0.90      0.90      0.89       977



Currently (18 Jun) the highest score in ranking has a macro f1-score of 0.94257\
We need to improve our f1-score by 0.07 to achieve this.

The weakness of our model right now is in:
1. recall for spam calls (not able to catch all the spam callers)
2. accuracy for non spam calls (we wrongly identify spam callers as non spam callers)
3. precision for spam calls (we can still improve the accuracy for our spam caller detection)

Lucky for us, so far we have only utilised the origin of the phone number, the place the phone was registered in and the monthly spending. We still have voice, sms and apps data to add on to our model.