In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from training_and_evluation import TrainingAndEvaluation
from IPython.display import display
from data_enrichment import DataEnrichment

In [4]:
# Instantiate the project helpers once for the whole notebook.
# `tae` provides the split / train / evaluate calls used in the cells below;
# `de` is created alongside it but is not referenced in the cells shown here.
tae, de = TrainingAndEvaluation(), DataEnrichment()

##### In the following cells we try to build a classification model based on the previous sections.
##### Each step adds another sub-section in an attempt to improve the model.
##### First we split the data into train/test sets. We do not touch the test set until the final check, meaning that the 1/0 proportion in the test set remains unchanged.
##### We use F1 as our main metric. Since our data is extremely imbalanced, we want a model that can **pick** the Y=1 points accurately, i.e. without classifying everything as 1. F1 combines precision and recall: it measures our ability to classify points as 1 while taking into account how many 0 points we wrongly classify as 1. Depending on product needs, we can use the F-beta measure to give more weight to recall or to precision.

##### The steps are: 1) Read and split the data, 2) Run models on the raw data, 3) Run models after feature selection, 4) Run models after resampling, 5) Summary and the beginning of algorithm optimization

### 1) Read data and split  

In [5]:
# Load the pre-encoding dataset, using the first CSV column as the row index.
df = pd.read_csv(
    './data/df_before_encoding.csv',
    index_col=[0],
)
# Print dtypes and non-null counts as a quick sanity check of the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12397 entries, -9.16005e+18_2.49201e+18 to -9.03879e+18_-6.86341e+18
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   state                    12397 non-null  object 
 1   product                  12397 non-null  object 
 2   square_ft                12397 non-null  float64
 3   has_fire_alarm           12397 non-null  bool   
 4   has_burglar_alarm        12397 non-null  bool   
 5   portable_electronics     12397 non-null  bool   
 6   coast                    12397 non-null  int64  
 7   fire_housing_proximity   12397 non-null  int64  
 8   previous_policies        12397 non-null  int64  
 9   user_age                 12397 non-null  float64
 10  card_type                12396 non-null  object 
 11  label                    12397 non-null  bool   
 12  number_of_na             12397 non-null  int64  
 13  grouped_postal_code      12397 non-nul

In [6]:
# Split features/target on the 'label' column, holding out 15% of the rows as
# the test set. random_state is fixed, presumably so the split is repeatable.
X_train, X_test, y_train, y_test = tae.train_test_split(
    df=df,
    labeled_col_name='label',
    test_size=0.15,
    random_state=20,
)

### Run models on raw data
##### Let's train models on our raw data after feature engineering only, i.e. with no feature selection and no treatment of our class-imbalance problem.

In [7]:
# Baseline run: evaluate the helper's models on every available feature.
# NOTE(review): df_col_types receives the full `df` here (which still contains
# the 'label' column), whereas later cells pass feature-only frames — confirm
# this is intended.
df_r, model = tae.generate_f1_and_confusion_matrix(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    df_col_types=df,
)
df_r

Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.037037,0.552941,0.066982,"[[1807, 11], [41, 1]]"


## 2) Run models after feature selection

#### Sensitive features

###### Read sensitive features

In [8]:
# Every entry returned by read_list is a comma-separated group of feature
# names; split each entry into its own list of column names.
sensative_features = [
    feature_line.split(',')
    for feature_line in tae.read_list(path='./data/sensative_features.txt')
]
sensative_features

[['median_household_income']]

In [9]:
for feature_group in sensative_features:
    # Train and evaluate using only the current group of sensitive features.
    df_r, model = tae.generate_f1_and_confusion_matrix(
        X_train=X_train[feature_group],
        X_test=X_test[feature_group],
        y_train=y_train,
        y_test=y_test,
        df_col_types=X_train[feature_group],
    )
    print(f'\n {feature_group} \n')
    display(df_r)
print('Looks like f1 increaed and all the algorithms performed the same on the test, but some has higher diff from train')


 ['median_household_income'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.048193,0.041408,1.163855,"[[1779, 39], [40, 2]]"


Looks like f1 increaed and all the algorithms performed the same on the test, but some has higher diff from train


### Feature importance 

In [10]:
# Feature-importance table produced in an earlier section (presumably sorted
# from most to least important); drop the stray index column saved in the CSV.
df_feature_importence = pd.read_csv('./data/df_feature_importence.csv').drop(columns=['Unnamed: 0'])

# Candidate feature sets: the first 2, 5, 8 and 11 rows of the table.
top_n = [df_feature_importence.feature[0:n].tolist() for n in range(2, 13, 3)]

# The importances were computed on encoded features, so map every listed name
# back to the raw X_train column whose name it contains.
cols = X_train.columns
top_nn_f_importance = []
for ranked_features in top_n:
    matched_cols = [
        raw_col
        for encoded_name in ranked_features
        for raw_col in cols
        if raw_col in encoded_name
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    # list(set(...)) orders strings by their randomized hash, so the column
    # order could differ from one kernel restart to the next.
    top_nn_f_importance.append(list(dict.fromkeys(matched_cols)))


In [11]:
for feature_subset in top_nn_f_importance:
    print(f'\n {feature_subset} \n')

    # Each iteration evaluates one of the top-n important-feature subsets.
    # The result table is stored in `df_r` (the name the earlier cells use)
    # rather than `df`, so the dataset loaded at the top of the notebook is
    # not overwritten and earlier cells can still be re-run safely.
    df_r, model = tae.generate_f1_and_confusion_matrix(
        X_train=X_train[feature_subset],
        X_test=X_test[feature_subset],
        y_train=y_train,
        y_test=y_test,
        df_col_types=X_test[feature_subset],
    )
    display(df_r)

print("gbc performed the best with 10 features, offline i tried to combine the 'sensitive' features, but the influence was negative")


 ['user_age', 'median_household_income'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.042121,0.0459,0.91766,"[[512, 1306], [13, 29]]"



 ['fire_housing_proximity', 'previous_policies', 'user_age', 'coast', 'median_household_income'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.03132,0.045907,0.682252,"[[1420, 398], [35, 7]]"



 ['fire_housing_proximity', 'number_of_na', 'previous_policies', 'has_fire_alarm', 'user_age', 'has_burglar_alarm', 'coast', 'median_household_income'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.0,0.056537,0.0,"[[1767, 51], [42, 0]]"



 ['fire_housing_proximity', 'state', 'number_of_na', 'previous_policies', 'has_fire_alarm', 'user_age', 'has_burglar_alarm', 'portable_electronics', 'coast', 'median_household_income'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.042865,0.044639,0.960257,"[[125, 1693], [4, 38]]"


gbc performed the best with 10 features, offline i tried to combine the 'sensitive' features, but the influence was negative


### Regularization penalty - L1 - Lasso

In [12]:
# Lasso coefficient table produced in an earlier section; drop the stray
# index column saved in the CSV and preview the first rows.
df_lasso_abs = pd.read_csv('./data/lasso_abs.csv').drop(columns=['Unnamed: 0'])
display(df_lasso_abs.head(4))

# Candidate feature sets: the first 2, 7, 12, 17 and 22 rows of the table.
top_n = [df_lasso_abs.feature[0:n].tolist() for n in range(2, 25, 5)]

# The coefficients are per encoded category (e.g. state_NV), so map every
# listed name back to the raw X_train column whose name it contains.
cols = X_train.columns
top_nn_lasso = []
for ranked_features in top_n:
    matched_cols = [
        raw_col
        for encoded_name in ranked_features
        for raw_col in cols
        if raw_col in encoded_name
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    # list(set(...)) orders strings by their randomized hash, so the column
    # order could differ from one kernel restart to the next.
    top_nn_lasso.append(list(dict.fromkeys(matched_cols)))


Unnamed: 0,feature,coefs
0,state_NV,0.718581
1,card_type_credit,0.638399
2,portable_electronics,0.634238
3,state_MD,0.623242


In [13]:
for feature_subset in top_nn_lasso:
    print(f'\n {feature_subset} \n')

    # Each iteration evaluates one of the top-n Lasso feature subsets.
    # The result table is stored in `df_r` (the name the earlier cells use)
    # rather than `df`, so the dataset loaded at the top of the notebook is
    # not overwritten and earlier cells can still be re-run safely.
    df_r, model = tae.generate_f1_and_confusion_matrix(
        X_train=X_train[feature_subset],
        X_test=X_test[feature_subset],
        y_train=y_train,
        y_test=y_test,
        df_col_types=X_test[feature_subset],
    )
    display(df_r)
    


 ['state', 'card_type'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.0,0.0,,"[[1818, 0], [42, 0]]"



 ['state', 'card_type', 'portable_electronics', 'square_ft'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.031359,0.057497,0.545399,"[[1295, 523], [33, 9]]"



 ['state', 'card_type', 'product', 'square_ft', 'previous_policies', 'has_burglar_alarm', 'portable_electronics'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.044444,0.078125,0.568889,"[[1816, 2], [41, 1]]"



 ['state', 'card_type', 'product', 'square_ft', 'previous_policies', 'has_fire_alarm', 'has_burglar_alarm', 'portable_electronics'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.018083,0.037348,0.484177,"[[1312, 506], [37, 5]]"



 ['state', 'card_type', 'product', 'square_ft', 'previous_policies', 'has_fire_alarm', 'has_burglar_alarm', 'portable_electronics', 'coast', 'number_of_na'] 



Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.0,0.0,,"[[1818, 0], [42, 0]]"
SVC(),0.0,0.0,,"[[1818, 0], [42, 0]]"
GradientBoostingClassifier(),0.02611,0.03313,0.788099,"[[1482, 336], [37, 5]]"


## 3) Resampling

#### Resampling and adding noise
###### For continuous features: adding normal noise with mu = 0 and sigma = feature standard deviation
###### For discrete features: adding noise from a discrete uniform distribution over {-2,...,2}

In [14]:
# Load the training set that was resampled with added noise. The frame is named
# for what it holds; the k-means-shuffled data is loaded separately in its own
# section, so the two no longer share a variable name.
train_resampled_noise = pd.read_csv('./data/df_resampled.csv', index_col=[0])

# Separate the target column from the features.
y_train_resampled = train_resampled_noise['label']
X_train_resampled = train_resampled_noise.drop(columns='label')

In [15]:
# One entry per experiment:
#   (message printed above the results,
#    feature subset to train on, or None for all features,
#    frame from which `df_col_types` is taken)
noise_experiments = [
    ("Resampeling with noise and all features results:\n\n", None, X_train),
    ("Resampeling with noise and feature importence results:\n\n", top_nn_f_importance[3], X_train_resampled),
    ("Resampeling with noise and top n Lasso festures results:\n\n", top_nn_lasso[2], X_test),
    ("Resampeling with noise and top sensative feature:\n\n", sensative_features[0], X_test),
]

for message, feature_cols, col_types_frame in noise_experiments:
    use_all_features = feature_cols is None
    # Train on the resampled data, always evaluate on the untouched test set.
    d, model = tae.generate_f1_and_confusion_matrix(
        X_train=X_train_resampled if use_all_features else X_train_resampled[feature_cols],
        X_test=X_test if use_all_features else X_test[feature_cols],
        y_train=y_train_resampled,
        y_test=y_test,
        df_col_types=col_types_frame if use_all_features else col_types_frame[feature_cols],
    )
    print(message)
    display(d)

Resampeling with noise and all features results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.03937,0.708583,0.055562,"[[1362, 456], [32, 10]]"
SVC(),0.028571,0.949563,0.030089,"[[1791, 27], [41, 1]]"
GradientBoostingClassifier(),0.039216,0.988433,0.039675,"[[1810, 8], [41, 1]]"


Resampeling with noise and feature importence results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.054054,0.689557,0.078389,"[[1320, 498], [27, 15]]"
SVC(),0.06383,0.937364,0.068095,"[[1769, 49], [39, 3]]"
GradientBoostingClassifier(),0.068966,0.986257,0.069927,"[[1804, 14], [40, 2]]"


Resampeling with noise and top n Lasso festures results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.049655,0.628252,0.079037,"[[1153, 665], [24, 18]]"
SVC(),0.040609,0.850157,0.047767,"[[1667, 151], [38, 4]]"
GradientBoostingClassifier(),0.115385,0.98733,0.116865,"[[1811, 7], [39, 3]]"


Resampeling with noise and top sensative feature:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.040977,0.555133,0.073815,"[[617, 1201], [16, 26]]"
SVC(),0.042553,0.531056,0.080129,"[[802, 1016], [19, 23]]"
GradientBoostingClassifier(),0.045082,0.808041,0.055792,"[[1383, 435], [31, 11]]"


### Feature shuffling - previous_policies

In [16]:
# Load the resampled training set built for the previous_policies section.
train_resampled_previous_policies_shuffled = pd.read_csv(
    './data/train_resampled_previous_policies.csv',
    index_col=[0],
)

# Separate the target column from the features.
y_train_resampled = train_resampled_previous_policies_shuffled['label']
X_train_resampled = train_resampled_previous_policies_shuffled.drop(columns=['label'])


In [17]:
# One entry per experiment:
#   (message printed above the results,
#    feature subset to train on, or None for all features,
#    frame from which `df_col_types` is taken)
previous_policies_experiments = [
    ("Resampeling with previous_policies shuffelling and all features results:\n\n", None, X_train),
    ("Resampeling with previous_policies shuffelling and feature importence results:\n\n", top_nn_f_importance[3], X_train_resampled),
    ("Resampeling with previous_policies shuffelling and top n Lasso festures results:\n\n", top_nn_lasso[2], X_test),
    ("Resampeling with previous_policies shuffelling and top sensative feature:\n\n", sensative_features[0], X_test),
]

for message, feature_cols, col_types_frame in previous_policies_experiments:
    use_all_features = feature_cols is None
    # Train on the resampled data, always evaluate on the untouched test set.
    d, model = tae.generate_f1_and_confusion_matrix(
        X_train=X_train_resampled if use_all_features else X_train_resampled[feature_cols],
        X_test=X_test if use_all_features else X_test[feature_cols],
        y_train=y_train_resampled,
        y_test=y_test,
        df_col_types=col_types_frame if use_all_features else col_types_frame[feature_cols],
    )
    print(message)
    display(d)

Resampeling with previous_policies shuffelling and all features results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.050265,0.595244,0.084444,"[[1123, 695], [23, 19]]"
SVC(),0.014184,0.884004,0.016046,"[[1720, 98], [41, 1]]"
GradientBoostingClassifier(),0.027397,0.97844,0.028001,"[[1788, 30], [41, 1]]"


Resampeling with previous_policies shuffelling and feature importence results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.052786,0.589542,0.089537,"[[1196, 622], [24, 18]]"
SVC(),0.025478,0.834336,0.030537,"[[1705, 113], [40, 2]]"
GradientBoostingClassifier(),0.030075,0.954713,0.031502,"[[1729, 89], [40, 2]]"


Resampeling with previous_policies shuffelling and top n Lasso festures results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.052521,0.583515,0.090008,"[[933, 885], [17, 25]]"
SVC(),0.04878,0.795962,0.061285,"[[1660, 158], [37, 5]]"
GradientBoostingClassifier(),0.028571,0.91314,0.031289,"[[1791, 27], [41, 1]]"


Resampeling with previous_policies shuffelling and top sensative feature:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.044092,0.581307,0.075849,"[[751, 1067], [17, 25]]"
SVC(),0.043384,0.619537,0.070026,"[[507, 1311], [12, 30]]"
GradientBoostingClassifier(),0.053333,0.89827,0.059373,"[[1495, 323], [32, 10]]"


### Feature shuffling - k-means

In [18]:
# Load the resampled training set built for the k-means shuffling section.
train_resampled_k_means_shuffled = pd.read_csv(
    './data/train_resampled_k_means_shuffled.csv',
    index_col=[0],
)

# Separate the target column from the features.
y_train_resampled = train_resampled_k_means_shuffled['label']
X_train_resampled = train_resampled_k_means_shuffled.drop(columns=['label'])

In [19]:
# One entry per experiment:
#   (message printed above the results,
#    feature subset to train on, or None for all features,
#    frame from which `df_col_types` is taken)
k_means_experiments = [
    ("Resampeling with K-means shuffelling and all features results:\n\n", None, X_train),
    ("Resampeling with K-means shuffelling and feature importence results:\n\n", top_nn_f_importance[3], X_train_resampled),
    ("Resampeling with K-means shuffelling and top n Lasso festures results:\n\n", top_nn_lasso[2], X_test),
    ("Resampeling with K-means shuffelling and top sensative feature:\n\n", sensative_features[0], X_test),
]

for message, feature_cols, col_types_frame in k_means_experiments:
    use_all_features = feature_cols is None
    # Train on the resampled data, always evaluate on the untouched test set.
    d, model = tae.generate_f1_and_confusion_matrix(
        X_train=X_train_resampled if use_all_features else X_train_resampled[feature_cols],
        X_test=X_test if use_all_features else X_test[feature_cols],
        y_train=y_train_resampled,
        y_test=y_test,
        df_col_types=col_types_frame if use_all_features else col_types_frame[feature_cols],
    )
    print(message)
    display(d)

Resampeling with K-means shuffelling and all features results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.05163,0.638543,0.080857,"[[1143, 675], [23, 19]]"
SVC(),0.032258,0.84896,0.037997,"[[1677, 141], [39, 3]]"
GradientBoostingClassifier(),0.032787,0.96755,0.033886,"[[1740, 78], [40, 2]]"


Resampeling with K-means shuffelling and feature importence results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.05686,0.606041,0.093823,"[[1074, 744], [19, 23]]"
SVC(),0.045283,0.783304,0.05781,"[[1601, 217], [36, 6]]"
GradientBoostingClassifier(),0.063291,0.945547,0.066936,"[[1707, 111], [37, 5]]"


Resampeling with K-means shuffelling and top n Lasso festures results:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.050602,0.65027,0.077818,"[[1051, 767], [21, 21]]"
SVC(),0.039312,0.759168,0.051783,"[[1461, 357], [34, 8]]"
GradientBoostingClassifier(),0.051948,0.79658,0.065214,"[[1710, 108], [38, 4]]"


Resampeling with K-means shuffelling and top sensative feature:




Unnamed: 0,f1_test,f1_train,train_test_ratio,confusion_matrix_test
LogisticRegression(),0.042553,0.586889,0.072506,"[[710, 1108], [17, 25]]"
SVC(),0.044476,0.629182,0.070689,"[[497, 1321], [11, 31]]"
GradientBoostingClassifier(),0.056122,0.900786,0.062304,"[[1479, 339], [31, 11]]"


### The actions that helped most in increasing F1 (up to 0.115) appear to be:
##### - 1) Resampling - 2) Shuffling with respect to previous_policies - 3) Taking the most important Lasso features. (Note: in the outputs above, the 0.115 F1 itself comes from the noise-resampling run with the Lasso features and GradientBoostingClassifier.)
### Of course, several more steps are still needed:
#####  1) Pick one algorithm and dive into it: cross-validate and grid-search for optimal parameters.
#####  2) Resample more of the points that the model fails to classify.
#####  3) Clean outlier points.
#####  4) Try binning some of our continuous features.
#####  5) Use other unsupervised learning techniques (or even k-means with a higher k) for grouping and shuffling.
#####  6) In the next section, "f_production_trining", we will pick one of our algorithms, apply grid search and save the results for production serving.

