In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('max_column', None)

#### Data

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
sub = pd.read_csv('SampleSubmission.csv')

In [3]:
train.head(3)

Unnamed: 0,ID,Year_of_Birth,Education_Level,Marital_Status,Disposable_Income,No_of_Kids_in_home,No_of_Teen_in_home,Date_Customer,Recency,Discounted_Purchases,WebPurchases,CatalogPurchases,StorePurchases,Amount_on_Wines,Amount_on_Fruits,Amount_on_MeatProducts,Amount_on_FishProducts,Amount_on_SweetProducts,Amount_on_GoldProds,WebVisitsMonth,Cmp3Accepted,Cmp4Accepted,Cmp5Accepted,Cmp1Accepted,Cmp2Accepted,Any_Complain,Response
0,ID_4A9AR6FQ,1955,Graduation,Divorced,77504.4,1,1,22-06-2014,56,2,1,1,4,71,3,16,0,6,20,3,0,0,0,0,0,0,0
1,ID_X28T3VEK,1958,Graduation,Together,56784.0,0,1,01-08-2013,17,6,5,1,8,205,22,111,50,15,19,6,0,0,0,0,0,0,0
2,ID_AWXARH57,1962,Graduation,Single,103714.8,0,0,21-11-2013,17,0,7,4,7,469,31,873,29,18,37,2,0,0,0,1,0,0,1


In [4]:
print(f"train shape {train.shape}\ntest shape {test.shape}")

train shape (1568, 27)
test shape (672, 26)


#### Feature Engineering

In [4]:
train.insert(1, 'Age', 2021 - train['Year_of_Birth'])
test.insert(1, 'Age', 2021 - test['Year_of_Birth'])

In [5]:
#Took a log of variables
cols = ['Disposable_Income', 'Recency', 'Amount_on_GoldProds','Amount_on_SweetProducts',
       'Amount_on_Wines', 'Amount_on_Fruits','Amount_on_MeatProducts','Amount_on_FishProducts', 'Age','Year_of_Birth']


for x in cols:
    train[x] = np.log1p(train[x])
    test[x] = np.log1p(test[x])

In [6]:
#Converted date column to date and time
train['Date_Customer'] = pd.to_datetime(train['Date_Customer'])
test['Date_Customer'] = pd.to_datetime(test['Date_Customer'])

#Generate Year Column
train.insert(7, 'YearEnrolled', train['Date_Customer'].dt.year)
test.insert(7, 'YearEnrolled', test['Date_Customer'].dt.year)

#Generate Year Month column
train.insert(8, 'MonthEnrolled', train['Date_Customer'].dt.month)
test.insert(8, 'MonthEnrolled', test['Date_Customer'].dt.month)

#Generate Year Quarterly column
train.insert(8, 'QuarterEnrolled', train['Date_Customer'].dt.quarter)
test.insert(8, 'QuarterEnrolled', test['Date_Customer'].dt.quarter)

In [7]:
#Generate total spending Column
train.insert(10, "Total Spending", train['Discounted_Purchases'] + train['WebPurchases'] + train['CatalogPurchases'] +
            train['StorePurchases'] + train['Amount_on_Wines'] + train['Amount_on_Fruits'] + train['Amount_on_MeatProducts'] +
            train['Amount_on_FishProducts'] + train['Amount_on_SweetProducts'] + train['Amount_on_GoldProds'])

test.insert(10, "Total Spending", test['Discounted_Purchases'] + test['WebPurchases'] + test['CatalogPurchases'] +
            test['StorePurchases'] + test['Amount_on_Wines'] + test['Amount_on_Fruits'] + test['Amount_on_MeatProducts'] +
            test['Amount_on_FishProducts'] + test['Amount_on_SweetProducts'] + test['Amount_on_GoldProds'])

#Generate total spending/1000 Column
train.insert(11, 'Total Spending/100', train['Total Spending'] / 100) 
test.insert(11, 'Total Spending/100', test['Total Spending'] / 100) 

train.insert(11, 'Total Spending/1000', train['Total Spending'] / 1000) 
test.insert(11, 'Total Spending/1000', test['Total Spending'] / 1000) 

#Generate total disposable/100 Column
train.insert(11, 'Disposable/100', train['Disposable_Income'] / 100) 
test.insert(11, 'Disposable/100', test['Disposable_Income'] / 100) 

In [8]:
train.head(3)

Unnamed: 0,ID,Age,Year_of_Birth,Education_Level,Marital_Status,Disposable_Income,No_of_Kids_in_home,YearEnrolled,QuarterEnrolled,MonthEnrolled,Total Spending,Disposable/100,Total Spending/1000,Total Spending/100,No_of_Teen_in_home,Date_Customer,Recency,Discounted_Purchases,WebPurchases,CatalogPurchases,StorePurchases,Amount_on_Wines,Amount_on_Fruits,Amount_on_MeatProducts,Amount_on_FishProducts,Amount_on_SweetProducts,Amount_on_GoldProds,WebVisitsMonth,Cmp3Accepted,Cmp4Accepted,Cmp5Accepted,Cmp1Accepted,Cmp2Accepted,Any_Complain,Response
0,ID_4A9AR6FQ,4.204693,7.578657,Graduation,Divorced,11.258103,1,2014,2,6,21.486606,0.112581,0.021487,0.214866,1,2014-06-22,4.043051,2,1,1,4,4.276666,1.386294,2.833213,0.0,1.94591,3.044522,3,0,0,0,0,0,0,0
1,ID_X28T3VEK,4.158883,7.580189,Graduation,Together,10.947027,0,2013,1,1,42.882016,0.10947,0.042882,0.42882,1,2013-01-08,2.890372,6,5,1,8,5.327876,3.135494,4.718499,3.931826,2.772589,2.995732,6,0,0,0,0,0,0,0
2,ID_AWXARH57,4.094345,7.582229,Graduation,Single,11.54941,0,2013,4,11,44.374771,0.115494,0.044375,0.443748,0,2013-11-21,2.890372,0,7,4,7,6.152733,3.465736,6.77308,3.401197,2.944439,3.637586,2,0,0,0,1,0,0,1


#### Modelling

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
X = train.drop(['ID', 'Response', 'Date_Customer','No_of_Teen_in_home', 'Year_of_Birth'], axis = 1)
y = train.Response
xtest = test.drop(['ID','Date_Customer','No_of_Teen_in_home', 'Year_of_Birth'], axis = 1)


x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, stratify = y, random_state = 2000)

In [14]:
#Pip Install Feature_engine And mlxtend
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import CountFrequencyEncoder, OneHotEncoder
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PolynomialFeatures
from feature_engine.selection import RecursiveFeatureElimination,SelectBySingleFeaturePerformance
from feature_engine.encoding import RareLabelEncoder

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import  GradientBoostingClassifier,RandomForestClassifier, VotingClassifier, StackingClassifier, AdaBoostClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import Lasso, RidgeClassifier, LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
import lightgbm as lgbm
import xgboost as xgb
from catboost import CatBoostClassifier


#Filling Missing Values
mean_imputer = MeanMedianImputer(imputation_method = 'mean')
median_imputer = MeanMedianImputer(imputation_method = 'median')
cat_imputer = CategoricalImputer(imputation_method = 'frequent')
cat_imputer_2 = CategoricalImputer(imputation_method = 'missing')

#Outliers
capper = Winsorizer(capping_method='iqr', tail='right', fold=3, variables = ['Age'])

#Hadling categorical variables
fre_enc = CountFrequencyEncoder(encoding_method='frequency', variables =['Education_Level','Marital_Status', 'Year_of_Birth'],
                               ignore_format = True)
one_enc = OneHotEncoder(variables =['Education_Level','Marital_Status',], ignore_format = True, drop_last = True)
rare_enc = RareLabelEncoder(tol= 0.05, n_categories = 5, variables = ['Discounted_Purchases', 'WebPurchases','CatalogPurchases',
                    'WebVisitsMonth','StorePurchases'], replace_with = 0, ignore_format = True)

#scaling
scale = QuantileTransformer()
sd = StandardScaler()
#PolynomialFeatures
poly = PolynomialFeatures(degree = 2, interaction_only=True, include_bias=False)


#Model Initialization
lgb_model = lgbm.LGBMClassifier(learning_rate = 0.1,num_leaves = 5, n_estimators = 30, class_weight = {0:0.3,1:0.7 },
                               colsample_bytree = 1)
xgb_model = xgb.XGBClassifier(eval_metric ='logloss',objective = 'binary:logistic', learning_rate = 0.05, n_estimators = 20,
                              max_depth = 5, scale_pos_weight=1)
gbt_model = GradientBoostingClassifier(n_estimators= 200, learning_rate= 0.01,max_depth= 3, random_state=0)
cat_model = CatBoostClassifier(verbose=False,learning_rate = 0.01,iterations = 1000, max_depth = 6)
lr_model = LogisticRegression(C= 100,  penalty = 'l2', class_weight = {0:0.3,1:0.7})
rf_model = RandomForestClassifier(n_estimators = 10,)
ridge_model = RidgeClassifier(alpha = 100, class_weight = {0:0.1,1:0.9})
ada_model = AdaBoostClassifier(n_estimators = 50 )
hgb_model = HistGradientBoostingClassifier(   loss='auto',
    learning_rate=0.05,
    max_iter= 100,
    max_leaf_nodes=31,
    max_depth=4,
    min_samples_leaf=20,
    l2_regularization=0.9,
    max_bins=255,
    monotonic_cst=None,
    warm_start= True,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
    verbose=0,
    random_state=None,)


#feature selection
tr = SelectBySingleFeaturePerformance(estimator=lr_model, scoring="f1", threshold=0.09, cv=5)

### Pipelines

In [15]:
pipe = Pipeline(
    [('mean_imputer', mean_imputer),
     ('one_enc', one_enc),
     #('capper', capper),
     #('poly', poly),
     ('rare', rare_enc),
     #('sd', sd),
     ('model', lgb_model),
    ]
)

pipe1 = Pipeline(
    [('mean_imputer', mean_imputer),
     ('fre_enc', one_enc),
     #('poly', poly),
     ('sd', sd),
     ('model', cat_model)
    ]
)

pipe2 = Pipeline(
    [('mean_imputer', mean_imputer),
     ('one_enc', one_enc),
     # ('poly', poly),
     ('sd', sd),
     ('model', xgb_model)
    ]
)

pipe3 = Pipeline(
    [('mean_imputer', mean_imputer),
     ('one_enc', one_enc),
     #('poly', poly),
     #('rare', rare_enc),
     ('sd', scale),
     ('model', lr_model)
    ]
)

pipe4 = Pipeline(
    [('mean_imputer', median_imputer),
      ('fre_enc', one_enc),
      ('sd', sd),
     #('poly', poly),
     ('model', ada_model)
    ]
)


pipe5 = Pipeline(
    [('mean_imputer', mean_imputer),
     ('one_enc', one_enc),
     #('capper', capper),
     #('poly', poly),
     ('sd', sd),
     ('model', gbt_model),
    ]
)


pipe6 = Pipeline(
    [('mean_imputer', mean_imputer),
     ('one_enc', one_enc),
     #('capper', capper),
     #('poly', poly),
     ('sd', sd),
     ('model', hgb_model),
    ]
)

### voting Classifier

In [16]:
vote_model = VotingClassifier(estimators = [('pipe', pipe), ('pipe4', pipe4), ('pipe6', pipe1),
                                            ('pipe1', pipe3)], 
                              voting = 'hard', 
                              weights = [0.1, 0.2, 0.1, 0.6]
                             )

In [17]:
vote_model.fit(x_train, y_train)

VotingClassifier(estimators=[('pipe',
                              Pipeline(steps=[('mean_imputer',
                                               MeanMedianImputer(imputation_method='mean')),
                                              ('one_enc',
                                               OneHotEncoder(drop_last=True,
                                                             ignore_format=True,
                                                             variables=['Education_Level',
                                                                        'Marital_Status'])),
                                              ('rare',
                                               RareLabelEncoder(ignore_format=True,
                                                                n_categories=5,
                                                                replace_with=0,
                                                                variables=['Discounted_Purchases',
        

In [18]:
train_pred = vote_model.predict(x_train) 
test_pred = vote_model.predict(x_test)
print(f"AUC score on the train set is {f1_score(y_train, train_pred)}\nAUC score on the test set is {f1_score(y_test, test_pred)} ")

AUC score on the train set is 0.6782608695652174
AUC score on the test set is 0.6521739130434783 


### Cross Validation

In [19]:
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold, cross_val_score
kf = KFold(n_splits = 5, shuffle = True, random_state = 2000)
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 2000)

scores = cross_val_score(vote_model, X, y, scoring = 'f1', cv = skf)
print(f'scores == {scores}')
print(f"Mean f1 score is {scores.mean()}")

scores == [0.65486726 0.6407767  0.57731959 0.64583333 0.57425743]
Mean f1 score is 0.6186108604742135


In [21]:
# k-fold cross validation
scores = list()
kfold = KFold(n_splits=5, shuffle=True, random_state = 2000)
# enumerate splits
for train_ix, test_ix in kf.split(X, y):
    # get data
    train_X, test_X = X.iloc[train_ix], X.iloc[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    # fit model
    vote_model.fit(train_X, train_y)
    # evaluate model
    yhat = vote_model.predict(test_X)
    acc = f1_score(test_y, yhat)
    # store score 
    scores.append(acc)
    print('> ', acc)
# summarize model performance
mean_s, std_s = np.mean(scores), np.std(scores)
print('Mean: %.3f, Standard Deviation: %.3f' % (mean_s, std_s))

>  0.594059405940594
>  0.656
>  0.6222222222222223
>  0.6732673267326733
>  0.6041666666666667
Mean: 0.630, Standard Deviation: 0.030


#### Fit on Entire Data

In [22]:
vote_model.fit(X, y)

VotingClassifier(estimators=[('pipe',
                              Pipeline(steps=[('mean_imputer',
                                               MeanMedianImputer(imputation_method='mean')),
                                              ('one_enc',
                                               OneHotEncoder(drop_last=True,
                                                             ignore_format=True,
                                                             variables=['Education_Level',
                                                                        'Marital_Status'])),
                                              ('rare',
                                               RareLabelEncoder(ignore_format=True,
                                                                n_categories=5,
                                                                replace_with=0,
                                                                variables=['Discounted_Purchases',
        

In [23]:
sub_test = vote_model.predict(xtest)

In [24]:
sub = pd.DataFrame({'ID': test['ID'], 'Response': sub_test})
sub.head(3)

Unnamed: 0,ID,Response
0,ID_ZPMABNVX,0
1,ID_WFE91NAA,0
2,ID_JV11RBRK,0


In [25]:
sub.Response.value_counts()

0    568
1    104
Name: Response, dtype: int64

In [26]:
sub.to_csv('vote_new1.csv', index = False)