In [360]:
#Packages
import pandas as pd
import copy
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import umap
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [361]:
#Training Data
data = pd.read_csv('C:/BITsPilaniMTECH2123/Interview_Tests/DS_exercise/data_2016_new.csv')
print(f'Shape of X= {data.shape}')
data.head()

Shape of X= (200000, 56)


Unnamed: 0,masked_customer_id,jaar,bought_highbrow_wines,Collishop_customer,cat_AP_STDR_PortoONLINE,cat_AP_STDR_WhiskyONLINE,cat_Babyluiers,cat_Ber_Ger_DVPortioneerbaar,cat_Ber_Ger_VersMaaltijdsalades,cat_Bier_Genietbieren,...,cat_nfokay,total_discount,rev_ticket,prod_ticket,n_cogo,cogo_rev,HOUSEHOLDTYPOLOGY,price_sens_colr,SOW_type_colr,SOW_colr
0,339806,2016,0.0,N,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,49.87,6,0,0.0,g_HHnochild_55_plus,-0.1762,SOW20-30,20
1,339807,2016,0.0,N,0.0,0.0,0.0,0.0,0.0,24.74,...,0.0,-9.58,12.841333,2,0,0.0,f_HHnochild_35_54,-0.2743,SOW10-20,15
2,339809,2016,0.0,N,0.0,0.0,0.0,4.99,3.69,0.0,...,0.0,0.0,4.92375,1,0,0.0,j_HHchild_oldest_6_12,-0.1281,SOW40-50,42
3,339812,2016,0.0,N,0.0,0.0,0.0,0.0,0.0,13.08,...,0.0,-7.0,6.221765,1,0,0.0,g_HHnochild_55_plus,-0.3344,SOW20-30,22
4,339815,2016,0.0,N,0.0,0.0,0.0,0.0,0.0,31.92,...,0.0,-2.03,8.07,4,0,0.0,k_HHchild_oldest_13_17,-0.5497,SOW70-80,71


In [362]:
# Exploring NA and Treatment for missing values
data.isnull().sum()
#Note: Very few NA/Missing Values in our dataset,Will drop rows, No Imputation/Treatment

masked_customer_id                 0
jaar                               0
bought_highbrow_wines              4
Collishop_customer                 2
cat_AP_STDR_PortoONLINE            0
cat_AP_STDR_WhiskyONLINE           0
cat_Babyluiers                     0
cat_Ber_Ger_DVPortioneerbaar       0
cat_Ber_Ger_VersMaaltijdsalades    0
cat_Bier_Genietbieren              0
cat_Bloemen                        0
cat_Bot_Mar_Boter                  0
cat_BroodKorthoudbaar              0
cat_Chips                          0
cat_ColruytMobile_Toestellen       0
cat_DeegDV                         0
cat_EleKtroKeuken                  0
cat_Houtpelletskolen_briketten     0
cat_Incontinentie_luiers           0
cat_KaasSeizoenskazen              0
cat_Kauwgum                        0
cat_KoudeSauzen                    0
cat_MelkKarnemelk                  0
cat_Notengedroogdfruit_groenten    0
cat_Ontbijtgranen_Volwassenen      0
cat_ParfumerieEHBO                 0
cat_Tapas                          0
c

In [363]:
data = data.dropna().reset_index(drop=True)
data.head(2)

Unnamed: 0,masked_customer_id,jaar,bought_highbrow_wines,Collishop_customer,cat_AP_STDR_PortoONLINE,cat_AP_STDR_WhiskyONLINE,cat_Babyluiers,cat_Ber_Ger_DVPortioneerbaar,cat_Ber_Ger_VersMaaltijdsalades,cat_Bier_Genietbieren,...,cat_nfokay,total_discount,rev_ticket,prod_ticket,n_cogo,cogo_rev,HOUSEHOLDTYPOLOGY,price_sens_colr,SOW_type_colr,SOW_colr
0,339806,2016,0.0,N,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,49.87,6,0,0.0,g_HHnochild_55_plus,-0.1762,SOW20-30,20
1,339807,2016,0.0,N,0.0,0.0,0.0,0.0,0.0,24.74,...,0.0,-9.58,12.841333,2,0,0.0,f_HHnochild_35_54,-0.2743,SOW10-20,15


In [364]:
len(data.masked_customer_id.unique())

199994

In [365]:
#Data type conversions
def convert_datatype(df,int_cols,cat_cols):
    for col in int_cols:
        df[col] = df[col].astype(int)
    for col in cat_cols:
        df[col] = df[col].astype('category')
    return df

data = data.convert_dtypes()
int_cols = data.select_dtypes(exclude=['string','boolean']).columns
cat_cols = data.select_dtypes(include=['string','boolean']).columns
data = convert_datatype(data,int_cols,cat_cols)

In [366]:
#Droping masked_customer_id and jaar as these will not contribute to target
data = data.drop(['masked_customer_id','jaar'],axis=1)
data.shape

(199994, 54)

In [368]:
#Target: Class is imbalance with 4.5% appr. of 1 wrt 95% of 0
data['bought_highbrow_wines'].value_counts()

0    190035
1      9959
Name: bought_highbrow_wines, dtype: int64

In [369]:
#Features Independent and Target
X=data
X = X.drop(['bought_highbrow_wines'],axis=1)
y=data['bought_highbrow_wines']

### Multi-collinearity

In [372]:
#VIF
XI = X.drop(['Collishop_customer','HOUSEHOLDTYPOLOGY','SOW_type_colr'],axis=1)
VIF = {XI.columns[i]: variance_inflation_factor(XI.values, i) for i in range(1, XI.shape[1])}
pd.DataFrame({'VIF': VIF}, index=XI.columns)

Unnamed: 0,VIF
cat_AP_STDR_PortoONLINE,
cat_AP_STDR_WhiskyONLINE,1.019565
cat_Babyluiers,1.230035
cat_Ber_Ger_DVPortioneerbaar,1.035431
cat_Ber_Ger_VersMaaltijdsalades,1.052837
cat_Bier_Genietbieren,-0.093743
cat_Bloemen,1.121407
cat_Bot_Mar_Boter,1.480668
cat_BroodKorthoudbaar,1.217723
cat_Chips,1.75656


In [373]:
#Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size = 0.15, random_state=123,stratify=y)

In [374]:
cat_features= ['cat_AP_STDR_PortoONLINE', 'cat_AP_STDR_WhiskyONLINE', 'cat_Babyluiers',
       'cat_Ber_Ger_DVPortioneerbaar', 'cat_Ber_Ger_VersMaaltijdsalades',
       'cat_Bier_Genietbieren', 'cat_Bloemen', 'cat_Bot_Mar_Boter',
       'cat_BroodKorthoudbaar', 'cat_Chips', 'cat_ColruytMobile_Toestellen',
       'cat_DeegDV', 'cat_EleKtroKeuken', 'cat_Houtpelletskolen_briketten',
       'cat_Incontinentie_luiers', 'cat_KaasSeizoenskazen', 'cat_Kauwgum',
       'cat_KoudeSauzen', 'cat_MelkKarnemelk',
       'cat_Notengedroogdfruit_groenten', 'cat_Ontbijtgranen_Volwassenen',
       'cat_ParfumerieEHBO', 'cat_Tapas', 'cat_Textiel_Bedlinnen',
       'cat_Textiel_Herenondergoed', 'cat_Textiel_Pantys',
       'cat_VNCBGBereidegerechten', 'cat_VNCBerBurgers',
       'cat_VNCCharBHWildpasteien', 'cat_VNCFSalades', 'cat_VNCGevgeheel',
       'cat_VNCKalfStoofvlees', 'cat_VNCLamSnedenkoteletsteak', 'cat_VNCRest',
       'cat_VNCVarkenRest', 'cat_VNCWildSteak', 'cat_VerseKaasFruitkazen',
       'cat_VisGerookt', 'cat_VisVerseSchelpdieren',
       'cat_Wijn_Stillewijnen_RAYON', 'cat_Zomerspeelgoed', 'cat_bbqfoodevent',
       'cat_nfokay']

In [375]:
# Oversampling SMOTENC
#smote_nc = SMOTENC(categorical_features=cat_features,random_state=123,sampling_strategy=0.3)
#X_train_Ov,y_train_Ov = smote_nc.fit(X_train,y_train)

In [376]:
# Undersampling RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy="majority")
undersampled_data, undersampled_target = undersampler.fit_resample(X_train, y_train)

In [377]:
#Updated count of rows
undersampled_data.shape
undersampled_target.shape

(16930,)

In [378]:
#Dimension Reduction
X_train_fd = copy.copy(undersampled_data)
y_train_fd = copy.copy(undersampled_target)
X_test_fd = copy.copy(X_test)
y_test_fd = copy.copy(y_test)

In [379]:
#one-hot encoding
X_trainUnder = pd.get_dummies(undersampled_data)
y_trainUnder= undersampled_target
X_test = pd.get_dummies(X_test)
y_test
X_train.head(2)

Unnamed: 0,Collishop_customer,cat_AP_STDR_PortoONLINE,cat_AP_STDR_WhiskyONLINE,cat_Babyluiers,cat_Ber_Ger_DVPortioneerbaar,cat_Ber_Ger_VersMaaltijdsalades,cat_Bier_Genietbieren,cat_Bloemen,cat_Bot_Mar_Boter,cat_BroodKorthoudbaar,...,cat_nfokay,total_discount,rev_ticket,prod_ticket,n_cogo,cogo_rev,HOUSEHOLDTYPOLOGY,price_sens_colr,SOW_type_colr,SOW_colr
14660,N,0,0,0,0,0,0,0,9,0,...,0,-7,10,2,0,0,f_HHnochild_35_54,0,SOW20-30,25
141973,N,0,0,0,0,0,0,0,22,0,...,0,-18,30,4,0,0,f_HHnochild_35_54,0,SOW_100+,111


In [380]:
#Standard Scaler
scaler=StandardScaler()
#Train:
#train_scaled = scaler.fit_transform(X_trainUnder[['SOW_colr','price_sens_colr','cogo_rev','n_cogo','prod_ticket','rev_ticket','total_discount']])
#ss1 = pd.DataFrame(train_scaled,columns=['SOW_colr','price_sens_colr','cogo_rev','n_cogo','prod_ticket','rev_ticket','total_discount'])
#X_trainUnder = X_trainUnder.drop(['SOW_colr','price_sens_colr','cogo_rev','n_cogo','prod_ticket','rev_ticket','total_discount'],axis=1)
#X_trainUnder = X_trainUnder.reset_index(drop=True)
#ss1 = ss1.reset_index(drop=True)
#X_trainUnder=pd.concat([X_trainUnder,ss1],axis=1)
print("Shape Train Independent: ",X_trainUnder.shape)
print("Shape Train Target: ",y_trainUnder.shape)

#Test:
#test_scaled = scaler.fit_transform(X_test[['SOW_colr','price_sens_colr','cogo_rev','n_cogo','prod_ticket','rev_ticket','total_discount']])
#ss2 = pd.DataFrame(test_scaled,columns=['SOW_colr','price_sens_colr','cogo_rev','n_cogo','prod_ticket','rev_ticket','total_discount'])
#X_test = X_test.drop(['SOW_colr','price_sens_colr','cogo_rev','n_cogo','prod_ticket','rev_ticket','total_discount'],axis=1)
#X_test = X_test.reset_index(drop=True)
#ss2 = ss2.reset_index(drop=True)
#X_test = pd.concat([X_test,ss2],axis=1)
print("Shape Test Independent: ",X_test.shape)
print("Shape Test Target: ",y_test.shape)


Shape Train Independent:  (16930, 78)
Shape Train Target:  (16930,)
Shape Test Independent:  (30000, 78)
Shape Test Target:  (30000,)


In [381]:
#DR
um = umap.UMAP(n_components=3)
X_fit = um.fit(X_trainUnder[cat_features])    
#Train
train_umap = um.transform(X_trainUnder[cat_features])
train_umap = pd.DataFrame(data = train_umap, columns = ['umap_f1', 'umap_f2','umap_f3'])
train_umap = train_umap.reset_index(drop=True)
X_trainUnder = X_trainUnder.drop(cat_features,axis=1)
X_trainUnder = X_trainUnder.reset_index(drop=True)
X_trainUnder = pd.concat([X_trainUnder,train_umap],axis=1)
print("Shape Train Independent: ",X_trainUnder.shape)
print("Shape Train Target: ",y_trainUnder.shape)

#Test
test_umap = um.transform(X_test[cat_features])
test_umap = pd.DataFrame(data = test_umap, columns = ['umap_f1', 'umap_f2','umap_f3'])
test_umap = test_umap.reset_index(drop=True)

X_test = X_test.drop(cat_features,axis=1)
X_test = X_test.reset_index(drop=True)
X_test = pd.concat([X_test,test_umap],axis=1)
print("Shape Train Independent: ",X_test.shape)
print("Shape Train Target: ",y_test.shape)


Shape Train Independent:  (16930, 38)
Shape Train Target:  (16930,)


  self._set_arrayXarray(i, j, x)


Shape Train Independent:  (30000, 38)
Shape Train Target:  (30000,)


In [382]:
#Before Under sampling
y_train.value_counts()

0    161529
1      8465
Name: bought_highbrow_wines, dtype: int64

In [383]:
#After Under sampling
y_trainUnder.value_counts()

0    8465
1    8465
Name: bought_highbrow_wines, dtype: int64

In [384]:
y_test.shape

(30000,)

###  XGBoost Classifer

In [385]:
#XGBoost Classifer
model1 = XGBClassifier(use_label_encoder=False)
model1.fit(X_trainUnder, y_trainUnder)



In [386]:
#Prediction
y_pred = model1.predict(X_test)
output1=pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}\n")
output1

Confusion Matrix: 
 [[23661  4845]
 [  244  1250]]



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.989793,0.205086,0.830367,0.59744,0.950715
recall,0.830036,0.83668,0.830367,0.833358,0.830367
f1-score,0.902902,0.329424,0.830367,0.616163,0.874343
support,28506.0,1494.0,0.830367,30000.0,30000.0


In [387]:
test_score = accuracy_score(y_test, model1.predict(X_test)) * 100
train_score = accuracy_score(y_trainUnder, model1.predict(X_trainUnder)) * 100

tuning_results_df = pd.DataFrame(
    data=[["XGBoost Classifer", train_score, test_score]], 
    columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
tuning_results_df

Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,XGBoost Classifer,92.533963,83.036667


### Logistic Regression

In [388]:
#Logistic Regression
model2 = LogisticRegression(solver='liblinear')
model2.fit(X_trainUnder, y_trainUnder)

In [389]:
#Prediction
y_pred = model2.predict(X_test)
output1=pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}\n")
output1

Confusion Matrix: 
 [[24192  4314]
 [  344  1150]]



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.98598,0.210469,0.844733,0.598224,0.947359
recall,0.848663,0.769746,0.844733,0.809205,0.844733
f1-score,0.912183,0.330555,0.844733,0.621369,0.883218
support,28506.0,1494.0,0.844733,30000.0,30000.0


In [390]:
test_score = accuracy_score(y_test, model2.predict(X_test)) * 100
train_score = accuracy_score(y_trainUnder, model2.predict(X_trainUnder)) * 100

results_df_2 = pd.DataFrame(
    data=[["Logistic Regression Classifier", train_score, test_score]], 
    columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
tuning_results_df = tuning_results_df.append(results_df_2, ignore_index=True)
tuning_results_df

  tuning_results_df = tuning_results_df.append(results_df_2, ignore_index=True)


Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,XGBoost Classifer,92.533963,83.036667
1,Logistic Regression Classifier,81.919669,84.473333


### Support Vector machine

In [391]:
#Support Vector machine
model3 = SVC(kernel='rbf', gamma=0.1, C=1.0)
model3.fit(X_trainUnder, y_trainUnder)

In [392]:
#Prediction
y_pred = model3.predict(X_test)
output1=pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}\n")
output1

Confusion Matrix: 
 [[18700  9806]
 [  166  1328]]



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.991201,0.119274,0.6676,0.555238,0.947779
recall,0.656002,0.888889,0.6676,0.772446,0.6676
f1-score,0.789496,0.210326,0.6676,0.499911,0.760653
support,28506.0,1494.0,0.6676,30000.0,30000.0


In [393]:
test_score = accuracy_score(y_test, model3.predict(X_test)) * 100
train_score = accuracy_score(y_trainUnder, model3.predict(X_trainUnder)) * 100

results_df_3 = pd.DataFrame(
    data=[["SVM Classifier", train_score, test_score]], 
    columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
tuning_results_df = tuning_results_df.append(results_df_3, ignore_index=True)
tuning_results_df

  tuning_results_df = tuning_results_df.append(results_df_3, ignore_index=True)


Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,XGBoost Classifer,92.533963,83.036667
1,Logistic Regression Classifier,81.919669,84.473333
2,SVM Classifier,97.30065,66.76


### Random Forest

In [394]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
model4 = RandomForestClassifier(n_estimators=1000, random_state=42)
model4.fit(X_trainUnder, y_trainUnder)

In [395]:
#Prediction
y_pred = model4.predict(X_test)
output1=pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}\n")
output1

Confusion Matrix: 
 [[23557  4949]
 [  234  1260]]



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.990164,0.202931,0.827233,0.596548,0.95096
recall,0.826387,0.843373,0.827233,0.83488,0.827233
f1-score,0.900893,0.327145,0.827233,0.614019,0.87232
support,28506.0,1494.0,0.827233,30000.0,30000.0


In [396]:
test_score = accuracy_score(y_test, model4.predict(X_test)) * 100
train_score = accuracy_score(y_trainUnder, model4.predict(X_trainUnder)) * 100

results_df_4 = pd.DataFrame(
    data=[["Random Forest Classifier", train_score, test_score]], 
    columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
tuning_results_df = tuning_results_df.append(results_df_4, ignore_index=True)
tuning_results_df

  tuning_results_df = tuning_results_df.append(results_df_4, ignore_index=True)


Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,XGBoost Classifer,92.533963,83.036667
1,Logistic Regression Classifier,81.919669,84.473333
2,SVM Classifier,97.30065,66.76
3,Random Forest Classifier,100.0,82.723333


### XGBoost Classifier Hyperparameter Tuning

In [358]:
#XGBoost Classifier Hyperparameter Tuning
param_grid = dict(
    n_estimators=stats.randint(10, 1000),
    max_depth=stats.randint(1, 10),
    learning_rate=stats.uniform(0, 1)
)

xgb_clf = XGBClassifier(use_label_encoder=False)
xgb_cv = RandomizedSearchCV(
    xgb_clf, param_grid, cv=5, n_iter=150, 
    scoring='accuracy', n_jobs=-1, verbose=1
)
xgb_cv.fit(X_trainUnder, y_trainUnder)
best_params = xgb_cv.best_params_
print(f"Best paramters: {best_params}")


model5 = XGBClassifier(**best_params)
model5.fit(X_trainUnder, y_trainUnder)



Fitting 5 folds for each of 150 candidates, totalling 750 fits
Best paramters: {'learning_rate': 0.04187048197828591, 'max_depth': 4, 'n_estimators': 630}


In [398]:
#Prediction
y_pred = model5.predict(X_test)
output1=pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}\n")
output1

Confusion Matrix: 
 [[23841  4665]
 [  238  1256]]



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.990116,0.212126,0.836567,0.601121,0.951372
recall,0.83635,0.840696,0.836567,0.838523,0.836567
f1-score,0.90676,0.338773,0.836567,0.622767,0.878475
support,28506.0,1494.0,0.836567,30000.0,30000.0


In [399]:
test_score = accuracy_score(y_test, model5.predict(X_test)) * 100
train_score = accuracy_score(y_trainUnder, model5.predict(X_trainUnder)) * 100

results_df_5 = pd.DataFrame(
    data=[["Tuned XGBoost Classifier", train_score, test_score]], 
    columns=['Model', 'Training Accuracy %', 'Testing Accuracy %']
)
tuning_results_df = tuning_results_df.append(results_df_5, ignore_index=True)
tuning_results_df

  tuning_results_df = tuning_results_df.append(results_df_5, ignore_index=True)


Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,XGBoost Classifer,92.533963,83.036667
1,Logistic Regression Classifier,81.919669,84.473333
2,SVM Classifier,97.30065,66.76
3,Random Forest Classifier,100.0,82.723333
4,Tuned XGBoost Classifier,87.023036,83.656667


In [400]:
#Final Model for Predictions
model5 = XGBClassifier(use_label_encoder=False,learning_rate=0.04187048197828591,max_depth=4,n_estimators=630)
model5.fit(X_trainUnder, y_trainUnder)



### Prediction on 2017 data

In [408]:
#Prediction Data
pdata = pd.read_csv('C:/BITsPilaniMTECH2123/Interview_Tests/DS_exercise/data_2017_new.csv')
print(f'Shape of X= {pdata.shape}')
pdata.head()

Shape of X= (200000, 55)


Unnamed: 0,masked_customer_id,jaar,Collishop_customer,cat_AP_STDR_PortoONLINE,cat_AP_STDR_WhiskyONLINE,cat_Babyluiers,cat_Ber_Ger_DVPortioneerbaar,cat_Ber_Ger_VersMaaltijdsalades,cat_Bier_Genietbieren,cat_Bloemen,...,cat_nfokay,total_discount,rev_ticket,prod_ticket,n_cogo,cogo_rev,HOUSEHOLDTYPOLOGY,price_sens_colr,SOW_type_colr,SOW_colr
0,339793,2017,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.345,1.0,0,0.0,c_Single_55_plus,-0.25,SOW20-30,27
1,339806,2017,N,0.0,0.0,0.0,0.0,0.0,17.62,0.0,...,0.0,-29.37,29.4925,2.0,0,0.0,g_HHnochild_55_plus,-0.0995,SOW10-20,13
2,339807,2017,N,0.0,0.0,0.0,0.0,2.49,2.04,0.0,...,0.0,-0.97,6.35,2.0,0,0.0,f_HHnochild_35_54,0.0365,SOW20-30,24
3,339809,2017,Y,0.0,0.0,0.0,0.0,3.49,8.99,0.0,...,0.0,0.0,15.297692,2.0,0,0.0,j_HHchild_oldest_6_12,-0.1426,SOW70-80,70
4,339812,2017,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-3.55,6.065263,1.0,0,0.0,g_HHnochild_55_plus,-0.3327,SOW20-30,22


In [409]:
#NA Impute: prod_ticket and cat_Wijn_Stillewijnen_RAYON
pdata['prod_ticket'] = pdata['prod_ticket'].fillna(pdata['prod_ticket'].median())
pdata['cat_Wijn_Stillewijnen_RAYON'] = pdata['cat_Wijn_Stillewijnen_RAYON'].fillna(pdata['cat_Wijn_Stillewijnen_RAYON'].median())
pdata.isnull().sum()

masked_customer_id                 0
jaar                               0
Collishop_customer                 0
cat_AP_STDR_PortoONLINE            0
cat_AP_STDR_WhiskyONLINE           0
cat_Babyluiers                     0
cat_Ber_Ger_DVPortioneerbaar       0
cat_Ber_Ger_VersMaaltijdsalades    0
cat_Bier_Genietbieren              0
cat_Bloemen                        0
cat_Bot_Mar_Boter                  0
cat_BroodKorthoudbaar              0
cat_Chips                          0
cat_ColruytMobile_Toestellen       0
cat_DeegDV                         0
cat_EleKtroKeuken                  0
cat_Houtpelletskolen_briketten     0
cat_Incontinentie_luiers           0
cat_KaasSeizoenskazen              0
cat_Kauwgum                        0
cat_KoudeSauzen                    0
cat_MelkKarnemelk                  0
cat_Notengedroogdfruit_groenten    0
cat_Ontbijtgranen_Volwassenen      0
cat_ParfumerieEHBO                 0
cat_Tapas                          0
cat_Textiel_Bedlinnen              0
c

In [410]:
#Data type conversions
pdata = pdata.convert_dtypes()
int_cols = pdata.select_dtypes(exclude=['string','boolean']).columns
cat_cols = pdata.select_dtypes(include=['string','boolean']).columns
pdata = convert_datatype(pdata,int_cols,cat_cols)

In [411]:
#Encoding
pdata = pd.get_dummies(pdata)

In [413]:
#DR
pdata_umap = um.transform(pdata[cat_features])
pdata_umap = pd.DataFrame(data = pdata_umap, columns = ['umap_f1', 'umap_f2','umap_f3'])
pdata_umap = pdata_umap.reset_index(drop=True)
pdata = pdata.drop(cat_features,axis=1)
pdata = pdata.reset_index(drop=True)
pdata = pd.concat([pdata,pdata_umap],axis=1)
print("Shape Train Independent: ",pdata.shape)

Shape Train Independent:  (200000, 40)


In [422]:
pdata.set_index('masked_customer_id', inplace=True)
pdata = pdata.drop(['jaar'],axis=1)

In [423]:
pdata = pdata[X_test.columns]
pdata.head(2)

Unnamed: 0_level_0,total_discount,rev_ticket,prod_ticket,n_cogo,cogo_rev,price_sens_colr,SOW_colr,Collishop_customer_N,Collishop_customer_Y,HOUSEHOLDTYPOLOGY_!,...,SOW_type_colr_SOW40-50,SOW_type_colr_SOW50-60,SOW_type_colr_SOW60-70,SOW_type_colr_SOW70-80,SOW_type_colr_SOW80-90,SOW_type_colr_SOW90-100,SOW_type_colr_SOW_100+,umap_f1,umap_f2,umap_f3
masked_customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
339793,0,2,1,0,0,0,27,1,0,0,...,0,0,0,0,0,0,0,10.639983,3.081532,4.032839
339806,-29,29,2,0,0,0,13,1,0,0,...,0,0,0,0,0,0,0,5.107592,-1.966157,10.656974


In [490]:
#Prediction
y_pred = model5.predict(pdata)
y_pred2 = model5.predict_proba(pdata)

In [491]:
y_pred[111]

0

In [492]:
y_pred2[111]

array([0.9050983 , 0.09490167], dtype=float32)

In [496]:
y_pred2_df=pd.DataFrame(y_pred2, columns=['No','Yes']) 
print(y_pred2_df.shape)
y_pred2_df.head(2)

(200000, 2)


Unnamed: 0,No,Yes
0,0.994635,0.005365
1,0.274618,0.725382


In [497]:
y_pred.shape

(200000,)

In [522]:
pdata['bought_highbrow_wines'] = y_pred
pdata['masked_customer_id'] = pdata.index
pdata = pdata.reset_index(drop=True)
pdata=pd.concat([pdata,y_pred2_df],axis=1)

In [531]:
pdata.rename(columns={'Yes':'probability'}, inplace=True)
pdata.shape
pdata.head(2)

Unnamed: 0,total_discount,rev_ticket,prod_ticket,n_cogo,cogo_rev,price_sens_colr,SOW_colr,Collishop_customer_N,Collishop_customer_Y,HOUSEHOLDTYPOLOGY_!,...,SOW_type_colr_SOW80-90,SOW_type_colr_SOW90-100,SOW_type_colr_SOW_100+,umap_f1,umap_f2,umap_f3,masked_customer_id,bought_highbrow_wines,No,probability
0,0,2,1,0,0,0,27,1,0,0,...,0,0,0,10.639983,3.081532,4.032839,339793,0,0.994635,0.005365
1,-29,29,2,0,0,0,13,1,0,0,...,0,0,0,5.107592,-1.966157,10.656974,339806,1,0.274618,0.725382


In [532]:
finaldataset = pdata[['masked_customer_id','bought_highbrow_wines','probability']]
finaldataset.tail(3)

Unnamed: 0,masked_customer_id,bought_highbrow_wines,probability
199997,9528111,1,0.994376
199998,9534979,1,0.860769
199999,9535466,1,0.64703


In [535]:
#Save file
finaldataset.to_csv('Abhishek_L1_ML.csv', sep=';',index=False)