In [12]:
import warnings

import matplotlib.pyplot as plt 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numba import jit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
# Ignore FutureWarning messages.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): with no category given, this filter ignores every warning. That makes the
# FutureWarning filter above redundant and also hides deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Seed NumPy's global RNG first so everything below starts from the same state.
random_state = 42
np.random.seed(random_state)

# Read the train/test tables straight from the zipped CSVs.
train_df = pd.read_csv('train.csv.zip')
test_df = pd.read_csv('test.csv.zip')

In [3]:
train_df.head()

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,total,target
0,id_11149922,5072,1852,25,6447,928147,0.725899,3,2,115,0,44,62,12,1.399162,0
1,id_15609305,4751,2212,6,1813,928147,-1.023377,3,2,215,0,2,110,39,0.286388,0
2,id_5222335,5817,528,2,6447,928147,-0.517814,3,2,55,0,2,0,20,-0.335599,0
3,id_1884252,3088,3682,24,6447,928147,-0.395321,3,2,6,2,4,110,6,-1.516403,0
4,id_12069677,2158,2204,24,6447,928147,0.610663,3,2,62,3,11,59,5,0.908375,0


array([0.07097312, 0.06834591, 0.13779501, ..., 0.05540508, 0.13861348,
       0.04543715])

In [13]:
%%time
# Baseline random forest on the raw columns.
# test_size=0.8 means only 20% of the rows are used for fitting; the other 80% are scored.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=['target', 'id']),
    train_df['target'],
    test_size=0.8,
    random_state=42,
    stratify=train_df['target'],
)
rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=2, n_jobs=4)
rf.fit(X_train, y_train)
print("fit complete")
preds = rf.predict_proba(X_test)
print("pred complete")
fpr, tpr, tresholds = roc_curve(y_test, preds[:, 1])
# `reorder` was deprecated in scikit-learn 0.20 and removed in 0.22; roc_curve already
# returns monotonically increasing fpr/tpr, so the trapezoidal AUC needs no re-sorting.
auc_score = auc(fpr, tpr)
print(auc_score)

fit complete
pred complete
0.723452855984088
CPU times: user 25min 30s, sys: 25.1 s, total: 25min 55s
Wall time: 6min 40s


In [23]:
# Number of rows in a 20% slice of the current hold-out set (truncated to an int).
test_size = int(len(X_test) * 0.2)

In [24]:
# Re-score only the first `test_size` hold-out rows to see whether a smaller
# evaluation set gives a comparable AUC.
preds = rf.predict_proba(X_test.iloc[:test_size])
print("pred complete")
fpr, tpr, tresholds = roc_curve(y_test.iloc[:test_size], preds[:, 1])
# `reorder` was deprecated in scikit-learn 0.20 and removed in 0.22; roc_curve output
# is already ordered, so it is simply dropped.
auc_score = auc(fpr, tpr)
print(auc_score)

pred complete
0.7234971342333816


In [18]:
X_test.shape[0]

8771682

In [20]:
int(X_test.shape[0]*0.7)

6140177

In [14]:
test_df.drop(columns=["id"]).head()

Unnamed: 0,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,total
0,4751,2212,6,4475,928147,-1.029709,3,2,215,0,2,0,39,0.286388
1,4751,2212,6,6358,928147,-0.999417,3,2,215,0,2,0,39,0.286388
2,3821,3928,24,6447,928147,-0.615867,3,2,54,0,48,62,32,0.098496
3,2249,4055,4,6447,928147,-0.920644,3,2,86,3,4,110,5,0.232936
4,4751,2212,6,6183,308471,0.562298,3,2,155,3,4,0,5,0.286388


In [15]:
# Score the test rows with the current `rf` and write an id/target submission file.
pred_test = rf.predict_proba(test_df.drop(columns=["id"]))
# Second predict_proba column — the positive class, assuming classes_ == [0, 1].
predictions = pred_test[:, 1]
sub = pd.DataFrame({
    "id": test_df["id"].values,
    "target": predictions,
})
sub.to_csv("rf_submission.csv", index=False)

In [26]:
%%time
# Same 20% training split as the baseline cell, but the hold-out is cut down to a
# stratified 30% sample of the remaining rows so that scoring is faster.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=['target', 'id']),
    train_df['target'],
    test_size=0.8,
    random_state=42,
    stratify=train_df['target'],
)
_, X_test, _, y_test = train_test_split(X_test, y_test, test_size=0.3, random_state=42, stratify=y_test)

rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=2, n_jobs=4)
rf.fit(X_train, y_train)
print("fit complete")
preds = rf.predict_proba(X_test)
print("pred complete")
fpr, tpr, tresholds = roc_curve(y_test, preds[:, 1])
# `reorder` was deprecated in scikit-learn 0.20 and removed in 0.22; roc_curve output
# is already ordered, so it is simply dropped.
auc_score = auc(fpr, tpr)
print(auc_score)

fit complete
pred complete
0.7208057015360523
CPU times: user 11min 16s, sys: 4.22 s, total: 11min 20s
Wall time: 3min 3s


In [66]:
# Table of the fitted forest's feature importances, largest first.
feature_importances = (
    pd.DataFrame({'importance': rf.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,importance
total,0.181063
var_34,0.113839
var_39,0.089055
department,0.085522
product_number,0.083984
var_32,0.077655
var_35,0.069816
category,0.067379
var_33,0.056263
var_37,0.042198


In [27]:
# Score the test rows with the current `rf` and write an id/target submission file.
pred_test = rf.predict_proba(test_df.drop(columns=["id"]))
# Second predict_proba column — the positive class, assuming classes_ == [0, 1].
predictions = pred_test[:, 1]
sub = pd.DataFrame({
    "id": test_df["id"].values,
    "target": predictions,
})
sub.to_csv("rf_submission_10_percent_of_data.csv", index=False)

In [67]:
# Feature generation: target sum and mean per product_number, joined back onto train_df.
# NOTE(review): the statistics are computed on all of train_df, so every row's own
# target is part of the value it receives — treat these columns as exploratory.
grouped_product_number = train_df.groupby('product_number')['target'].agg(['sum', 'mean'])
# Flatten to "target_product_number_sum" / "target_product_number_mean".
grouped_product_number.columns = [
    'target_product_number_' + stat for stat in grouped_product_number.columns
]
grouped_product_number = grouped_product_number.reset_index()
train_df_with_new_features = train_df.merge(grouped_product_number, how='left')

In [72]:
# Target sum and mean per creditCard, appended to the running feature frame.
# NOTE(review): computed on all of train_df, so each row's own target is included.
grouped_creditCard = train_df.groupby('creditCard')['target'].agg(['sum', 'mean'])
# Flatten to "target_creditCard_sum" / "target_creditCard_mean".
grouped_creditCard.columns = [
    'target_creditCard_' + stat for stat in grouped_creditCard.columns
]
grouped_creditCard = grouped_creditCard.reset_index()
train_df_with_new_features = train_df_with_new_features.merge(grouped_creditCard, how='left')
train_df_with_new_features.head()

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,total,target,target_product_number_sum,target_product_number_mean,target_creditCard_sum,target_creditCard_mean
0,id_11149922,5072,1852,25,6447,928147,0.725899,3,2,115,0,44,62,12,1.399162,0,246,0.003006,422657,0.06054
1,id_15609305,4751,2212,6,1813,928147,-1.023377,3,2,215,0,2,110,39,0.286388,0,140829,0.034183,46,0.103604
2,id_5222335,5817,528,2,6447,928147,-0.517814,3,2,55,0,2,0,20,-0.335599,0,113175,0.063976,422657,0.06054
3,id_1884252,3088,3682,24,6447,928147,-0.395321,3,2,6,2,4,110,6,-1.516403,0,67059,0.081092,422657,0.06054
4,id_12069677,2158,2204,24,6447,928147,0.610663,3,2,62,3,11,59,5,0.908375,0,2948,0.011764,422657,0.06054


In [73]:
# Target sum and mean per department, appended to the running feature frame.
# NOTE(review): computed on all of train_df, so each row's own target is included.
grouped_department = train_df.groupby('department')['target'].agg(['sum', 'mean'])
# Flatten to "target_department_sum" / "target_department_mean".
grouped_department.columns = [
    'target_department_' + stat for stat in grouped_department.columns
]
grouped_department = grouped_department.reset_index()
train_df_with_new_features = train_df_with_new_features.merge(grouped_department, how='left')
train_df_with_new_features.head()

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,...,var_38,var_39,total,target,target_product_number_sum,target_product_number_mean,target_creditCard_sum,target_creditCard_mean,target_department_sum,target_department_mean
0,id_11149922,5072,1852,25,6447,928147,0.725899,3,2,115,...,62,12,1.399162,0,246,0.003006,422657,0.06054,122,0.002747
1,id_15609305,4751,2212,6,1813,928147,-1.023377,3,2,215,...,110,39,0.286388,0,140829,0.034183,46,0.103604,131495,0.033013
2,id_5222335,5817,528,2,6447,928147,-0.517814,3,2,55,...,0,20,-0.335599,0,113175,0.063976,422657,0.06054,113175,0.063976
3,id_1884252,3088,3682,24,6447,928147,-0.395321,3,2,6,...,110,6,-1.516403,0,67059,0.081092,422657,0.06054,60934,0.09707
4,id_12069677,2158,2204,24,6447,928147,0.610663,3,2,62,...,59,5,0.908375,0,2948,0.011764,422657,0.06054,2948,0.011764


In [74]:
# Target sum and mean per category, appended to the running feature frame.
# NOTE(review): computed on all of train_df, so each row's own target is included.
grouped_category = train_df.groupby('category')['target'].agg(['sum', 'mean'])
# Flatten to "target_category_sum" / "target_category_mean".
grouped_category.columns = [
    'target_category_' + stat for stat in grouped_category.columns
]
grouped_category = grouped_category.reset_index()
train_df_with_new_features = train_df_with_new_features.merge(grouped_category, how='left')
train_df_with_new_features.head()

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,...,total,target,target_product_number_sum,target_product_number_mean,target_creditCard_sum,target_creditCard_mean,target_department_sum,target_department_mean,target_category_sum,target_category_mean
0,id_11149922,5072,1852,25,6447,928147,0.725899,3,2,115,...,1.399162,0,246,0.003006,422657,0.06054,122,0.002747,874,0.010719
1,id_15609305,4751,2212,6,1813,928147,-1.023377,3,2,215,...,0.286388,0,140829,0.034183,46,0.103604,131495,0.033013,162860,0.036206
2,id_5222335,5817,528,2,6447,928147,-0.517814,3,2,55,...,-0.335599,0,113175,0.063976,422657,0.06054,113175,0.063976,129736,0.064812
3,id_1884252,3088,3682,24,6447,928147,-0.395321,3,2,6,...,-1.516403,0,67059,0.081092,422657,0.06054,60934,0.09707,183586,0.053251
4,id_12069677,2158,2204,24,6447,928147,0.610663,3,2,62,...,0.908375,0,2948,0.011764,422657,0.06054,2948,0.011764,183586,0.053251


In [4]:
def mean_encoding(data, columns, target_col, n_splits=5, random_state=42, splitter=None):
    """Add out-of-fold target-mean encodings for the given columns.

    For every fold, the per-category mean of ``target_col`` is computed on the
    training part only and mapped onto the validation part, so a row's own
    target never contributes to its encoding.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``columns`` and ``target_col``. It is not modified.
    columns : list of str
        Columns to encode; each one yields a new ``"<column>_mean_target"`` column.
    target_col : str
        Name of the target column.
    n_splits : int, default 5
        Number of stratified folds (ignored when ``splitter`` is given).
    random_state : int or None, default 42
        Seed for the fold shuffle so repeated runs give the same encodings
        (ignored when ``splitter`` is given). Pass ``None`` for an unseeded shuffle.
    splitter : object, optional
        Any object with a ``split(X, y)`` method yielding positional
        ``(train_index, validation_index)`` pairs. Defaults to
        ``StratifiedKFold(n_splits, shuffle=True, random_state=random_state)``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with one float ``"<column>_mean_target"`` column appended
        per encoded column. Categories that never occur in a fold's training part
        get the global target mean.
    """
    train_new = data.copy()
    y = data[target_col].values
    if splitter is None:
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # The splitter only needs X for its length, so a cheap placeholder avoids
    # materialising the whole frame as an object array.
    placeholder = np.zeros(len(data))

    # Collect the encodings in plain float arrays. This avoids writing into a slice of
    # `data` and keeps the new columns numeric instead of object dtype.
    encoded = {column: np.full(len(data), np.nan) for column in columns}

    for training_index, validation_index in splitter.split(placeholder, y):
        x_train = data.iloc[training_index]
        x_validation = data.iloc[validation_index]
        for column in columns:
            fold_means = x_train.groupby(column)[target_col].mean()
            mapped = x_validation[column].map(fold_means)
            encoded[column][validation_index] = mapped.to_numpy(dtype=float)

    global_mean = data[target_col].mean()

    for column in columns:
        values = encoded[column]
        # Fill gaps in the encoded columns only; missing values in the original
        # features must not be overwritten with a target mean.
        values[np.isnan(values)] = global_mean
        train_new[column + "_mean_target"] = values
    return train_new
# Build the mean-encoded training frame for the listed columns and preview it.
new_data = mean_encoding(
    train_df,
    [
        'product_number', 'department', 'category', 'creditCard', 'customer',
        'var_33', 'var_34', 'var_35', 'var_36', 'var_37', 'var_38', 'var_39',
    ],
    "target",
)
new_data.head()

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,...,category_mean_target,creditCard_mean_target,customer_mean_target,var_33_mean_target,var_34_mean_target,var_35_mean_target,var_36_mean_target,var_37_mean_target,var_38_mean_target,var_39_mean_target
0,id_11149922,5072,1852,25,6447,928147,0.725899,3,2,115,...,0.010425,0.060546,0.051885,0.046608,0.046534,0.031729,0.047143,0.033316,0.057354,0.039927
1,id_15609305,4751,2212,6,1813,928147,-1.023377,3,2,215,...,0.036223,0.110795,0.051882,0.046599,0.046526,0.028288,0.047108,0.04975,0.072953,0.032318
2,id_5222335,5817,528,2,6447,928147,-0.517814,3,2,55,...,0.064893,0.060546,0.051885,0.046608,0.046534,0.057632,0.047143,0.049685,0.058544,0.057192
3,id_1884252,3088,3682,24,6447,928147,-0.395321,3,2,6,...,0.053253,0.060526,0.051856,0.046606,0.046533,0.079156,0.097923,0.076529,0.072841,0.072297
4,id_12069677,2158,2204,24,6447,928147,0.610663,3,2,62,...,0.053297,0.060553,0.051873,0.046592,0.046521,0.033989,0.043381,0.046724,0.053007,0.06525


In [5]:
# Persist the mean-encoded training frame; a later cell reads it back from this file.
new_data.to_csv("train_with_mean_encoded_features.csv",index=False)

In [8]:
%%time
# Random forest on the raw columns plus the "*_mean_target" encodings, using the
# same 20% train / sub-sampled hold-out scheme as the raw-feature runs.
# NOTE(review): the encodings were computed out-of-fold over the whole frame before
# this split, so hold-out targets fed into training-row encodings; the AUC may be
# mildly optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    new_data.drop(columns=['target', 'id']),
    new_data['target'],
    test_size=0.8,
    random_state=42,
    stratify=new_data['target'],
)
_, X_test, _, y_test = train_test_split(X_test, y_test, test_size=0.3, random_state=42, stratify=y_test)

rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=2, n_jobs=4)
rf.fit(X_train, y_train)
print("fit complete")
preds = rf.predict_proba(X_test)
print("pred complete")
fpr, tpr, tresholds = roc_curve(y_test, preds[:, 1])
# `reorder` was deprecated in scikit-learn 0.20 and removed in 0.22; roc_curve output
# is already ordered, so it is simply dropped.
auc_score = auc(fpr, tpr)
print(auc_score)

fit complete
pred complete
0.7430996589406
CPU times: user 27min 55s, sys: 11.7 s, total: 28min 7s
Wall time: 7min 21s


In [9]:
# Table of the fitted forest's feature importances, largest first.
feature_importances = (
    pd.DataFrame({'importance': rf.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,importance
department_mean_target,0.205059
product_number_mean_target,0.163392
var_35_mean_target,0.127769
creditCard_mean_target,0.10356
total,0.044438
var_39_mean_target,0.036567
customer_mean_target,0.036106
var_34_mean_target,0.035176
var_32,0.034613
var_37_mean_target,0.02115


In [13]:
# Reload the saved mean-encoded training frame instead of recomputing it.
new_data = pd.read_csv(filepath_or_buffer='train_with_mean_encoded_features.csv')
new_data.head(5)

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,...,category_mean_target,creditCard_mean_target,customer_mean_target,var_33_mean_target,var_34_mean_target,var_35_mean_target,var_36_mean_target,var_37_mean_target,var_38_mean_target,var_39_mean_target
0,id_11149922,5072,1852,25,6447,928147,0.725899,3,2,115,...,0.010425,0.060546,0.051885,0.046608,0.046534,0.031729,0.047143,0.033316,0.057354,0.039927
1,id_15609305,4751,2212,6,1813,928147,-1.023377,3,2,215,...,0.036223,0.110795,0.051882,0.046599,0.046526,0.028288,0.047108,0.04975,0.072953,0.032318
2,id_5222335,5817,528,2,6447,928147,-0.517814,3,2,55,...,0.064893,0.060546,0.051885,0.046608,0.046534,0.057632,0.047143,0.049685,0.058544,0.057192
3,id_1884252,3088,3682,24,6447,928147,-0.395321,3,2,6,...,0.053253,0.060526,0.051856,0.046606,0.046533,0.079156,0.097923,0.076529,0.072841,0.072297
4,id_12069677,2158,2204,24,6447,928147,0.610663,3,2,62,...,0.053297,0.060553,0.051873,0.046592,0.046521,0.033989,0.043381,0.046724,0.053007,0.06525


In [14]:
# Coarser copies of var_32, rounded to 2 and 3 decimals.
new_data["var_32_round_2"] = new_data["var_32"].round(2)
new_data["var_32_round_3"] = new_data["var_32"].round(3)

In [15]:
new_data.head()

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,...,customer_mean_target,var_33_mean_target,var_34_mean_target,var_35_mean_target,var_36_mean_target,var_37_mean_target,var_38_mean_target,var_39_mean_target,var_32_round_2,var_32_round_3
0,id_11149922,5072,1852,25,6447,928147,0.725899,3,2,115,...,0.051885,0.046608,0.046534,0.031729,0.047143,0.033316,0.057354,0.039927,0.73,0.726
1,id_15609305,4751,2212,6,1813,928147,-1.023377,3,2,215,...,0.051882,0.046599,0.046526,0.028288,0.047108,0.04975,0.072953,0.032318,-1.02,-1.023
2,id_5222335,5817,528,2,6447,928147,-0.517814,3,2,55,...,0.051885,0.046608,0.046534,0.057632,0.047143,0.049685,0.058544,0.057192,-0.52,-0.518
3,id_1884252,3088,3682,24,6447,928147,-0.395321,3,2,6,...,0.051856,0.046606,0.046533,0.079156,0.097923,0.076529,0.072841,0.072297,-0.4,-0.395
4,id_12069677,2158,2204,24,6447,928147,0.610663,3,2,62,...,0.051873,0.046592,0.046521,0.033989,0.043381,0.046724,0.053007,0.06525,0.61,0.611


In [16]:
%%time
# Random forest on the mean-encoded frame including the rounded var_32 columns,
# using the same 20% train / sub-sampled hold-out scheme as the earlier runs.
# NOTE(review): the "*_mean_target" columns were computed over the whole frame before
# this split, so the hold-out AUC may be mildly optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    new_data.drop(columns=['target', 'id']),
    new_data['target'],
    test_size=0.8,
    random_state=42,
    stratify=new_data['target'],
)
_, X_test, _, y_test = train_test_split(X_test, y_test, test_size=0.3, random_state=42, stratify=y_test)

rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=2, n_jobs=4)
rf.fit(X_train, y_train)
print("fit complete")
preds = rf.predict_proba(X_test)
print("pred complete")
fpr, tpr, tresholds = roc_curve(y_test, preds[:, 1])
# `reorder` was deprecated in scikit-learn 0.20 and removed in 0.22; roc_curve output
# is already ordered, so it is simply dropped.
auc_score = auc(fpr, tpr)
print(auc_score)

fit complete
pred complete
0.7433534018493639
CPU times: user 29min 58s, sys: 10.2 s, total: 30min 8s
Wall time: 7min 53s


In [17]:
# Table of the fitted forest's feature importances, largest first.
feature_importances = (
    pd.DataFrame({'importance': rf.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,importance
department_mean_target,0.193211
product_number_mean_target,0.163009
var_35_mean_target,0.130563
creditCard_mean_target,0.100426
total,0.044588
var_34_mean_target,0.037021
customer_mean_target,0.03441
var_39_mean_target,0.033388
var_32,0.02242
var_37_mean_target,0.02169


In [None]:
# Columns that have a "<column>_mean_target" encoding.
mean_encoded_cols = [
    'product_number', 'department', 'category', 'creditCard', 'customer',
    'var_33', 'var_34', 'var_35', 'var_36', 'var_37', 'var_38', 'var_39',
]

In [22]:
# Carry the training-side encodings over to the test rows: for each encoded column,
# average its "<col>_mean_target" values per category in new_data and left-join them on.
mean_encoded_cols = [
    'product_number', 'department', 'category', 'creditCard', 'customer',
    'var_33', 'var_34', 'var_35', 'var_36', 'var_37', 'var_38', 'var_39',
]
test_df_with_mean_encoded_features = test_df.copy()
for col in mean_encoded_cols:
    grouped_p = new_data.groupby(col, as_index=False).agg({col + "_mean_target": "mean"})
    # Categories absent from new_data come out of the left join as NaN.
    test_df_with_mean_encoded_features = test_df_with_mean_encoded_features.merge(
        grouped_p, how='left', on=col
    )
test_df_with_mean_encoded_features.head()

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,...,category_mean_target,creditCard_mean_target,customer_mean_target,var_33_mean_target,var_34_mean_target,var_35_mean_target,var_36_mean_target,var_37_mean_target,var_38_mean_target,var_39_mean_target
0,id_5007385,4751,2212,6,4475,928147,-1.029709,3,2,215,...,0.036206,0.080688,0.051875,0.046605,0.046532,0.02801,0.047125,0.049751,0.058593,0.032285
1,id_12558699,4751,2212,6,6358,928147,-0.999417,3,2,215,...,0.036206,0.007226,0.051875,0.046605,0.046532,0.02801,0.047125,0.049751,0.058593,0.032285
2,id_5454443,3821,3928,24,6447,928147,-0.615867,3,2,54,...,0.053251,0.06054,0.051875,0.046605,0.046532,0.013189,0.047125,0.040222,0.057124,0.016354
3,id_101507,2249,4055,4,6447,928147,-0.920644,3,2,86,...,0.092376,0.06054,0.051875,0.046605,0.046532,0.054869,0.043423,0.076532,0.072894,0.065203
4,id_11803238,4751,2212,6,6183,308471,0.562298,3,2,155,...,0.036206,0.061219,0.166667,0.046605,0.046532,0.081442,0.043423,0.076532,0.058593,0.065203


In [27]:
# Categories never seen in training leave NaN in the merged encodings; fall back to the
# global training target mean for those. Only the "*_mean_target" columns are filled, so
# missing values in raw features are not silently replaced with a target mean.
encoded_cols = [
    name for name in test_df_with_mean_encoded_features.columns
    if name.endswith("_mean_target")
]
test_df_with_mean_encoded_features[encoded_cols] = (
    test_df_with_mean_encoded_features[encoded_cols].fillna(train_df.target.mean())
)

In [29]:
# Add to the test frame the same rounded var_32 columns that were added to new_data.
test_df_with_mean_encoded_features["var_32_round_2"] = test_df_with_mean_encoded_features["var_32"].round(2)
test_df_with_mean_encoded_features["var_32_round_3"] = test_df_with_mean_encoded_features["var_32"].round(3)

In [30]:
# Sanity check: the test frame should have exactly one column more than X_train (the "id").
print(test_df_with_mean_encoded_features.shape, X_train.shape, sep="\n")

(5220160, 29)
(2192920, 28)


In [32]:
# Score the mean-encoded test rows with the current `rf` and write the submission file.
pred_test = rf.predict_proba(test_df_with_mean_encoded_features.drop(columns=["id"]))
# Second predict_proba column — the positive class, assuming classes_ == [0, 1].
predictions = pred_test[:, 1]
sub = pd.DataFrame({
    "id": test_df_with_mean_encoded_features["id"].values,
    "target": predictions,
})
sub.to_csv("mean_encoded_features_rf_submission_10_percent_of_data.csv", index=False)

In [33]:
# Persist the test frame with its encoded/rounded columns so it does not have to be rebuilt.
test_df_with_mean_encoded_features.to_csv("test_df_with_mean_encoded_features.csv",index=False)

NameError: name 'X_train' is not defined

In [None]:
# Larger forest (1000 trees, no depth limit) on the current X_train / y_train split.
rf = RandomForestClassifier(n_estimators=1000, random_state=2, n_jobs=4)
rf.fit(X_train, y_train)
print("fit complete")
preds = rf.predict_proba(X_test)
print("pred complete")
fpr, tpr, tresholds = roc_curve(y_test, preds[:, 1])
# `reorder` was deprecated in scikit-learn 0.20 and removed in 0.22; roc_curve output
# is already ordered, so it is simply dropped.
auc_score = auc(fpr, tpr)
print(auc_score)

pred_test = rf.predict_proba(test_df_with_mean_encoded_features.drop(columns=["id"]))
# Second predict_proba column — the positive class, assuming classes_ == [0, 1].
predictions = pred_test[:, 1]
sub = pd.DataFrame({"id": test_df_with_mean_encoded_features.id.values})
sub["target"] = predictions
# NOTE(review): this is the same file name the depth-10 forest's submission cell writes,
# so running this cell overwrites that file — confirm that is intended.
sub.to_csv("mean_encoded_features_rf_submission_10_percent_of_data.csv", index=False)