## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pylab

In [2]:
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score

## Reading Data

In [5]:
train=pd.read_csv('Training_Dataset.csv')
test=pd.read_csv('Leaderboard_Dataset.csv')
final= pd.read_csv('Final_Dataset.csv')
full_data=[train,test,final]

In [6]:
dictionary=pd.read_csv('Data_Dictionary .csv')

In [7]:
print train.shape
print test.shape

(40000, 52)
(10000, 46)


__Variable Analysis__

In [8]:
print dictionary

   Variable                                        Description
0    cm_key                         Unique customer identifier
1     mvar1                                  Card product type
2     mvar2                                        Family Size
3     mvar3  Customer spending capacity. 0 = No data available
4     mvar4  Number of total cards (including estimated ext...
5     mvar5      Number of months the Account has been set up.
6     mvar6   Total club memberhship fees in the last one year
7     mvar7     Internal score for affinity towards high spend
8     mvar8                          Internal influencer score
9     mvar9                      Income. 0 = No data available
10   mvar10                            Platinum card indicator
11   mvar11  Internal probability score for affinity toward...
12   mvar12  Industry code in which the customer has spent ...
13   mvar13  Number of times the customer has made payments...
14   mvar14                         Number of club memb

In [9]:
## Forming a DataFrame for exploring each variable
df1=pd.DataFrame()
for i in train.columns:
    A=train[i].dtypes
    B=train[i].unique()
    C=np.sum(train[i].isnull())
    df=pd.DataFrame([[i,A,B,C]],columns=['variable','dtype','unique','null_values'])
    df1=pd.concat([df1,df])
df1    

Unnamed: 0,variable,dtype,unique,null_values
0,cm_key,int64,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0
0,mvar1,object,[Charge ],0
0,mvar2,int64,"[0, 2, 5, 1, 4, 3, 6, 8, 9, 10, 11, 7]",0
0,mvar3,int64,"[106920, 0, 399168, 267300, 213840, 25126, 149...",0
0,mvar4,int64,"[4, 6, 1, 3, 0, 5, 2, 8, 7, 12, 9, 10, 11]",0
0,mvar5,int64,"[18, 19, 35, 274, 1, 127, 320, 2, 129, 131, 93...",0
0,mvar6,int64,"[970, 1030, 327, 1258, 0, 381, 490, 981, 207, ...",0
0,mvar7,float64,"[0.7969, 0.0678, 0.7551, 2.3238, 0.1431, 0.104...",0
0,mvar8,float64,"[0.1831, 0.0, 0.2986, 0.5297, 0.4817, 0.2682, ...",0
0,mvar9,int64,"[337236, 2918974, 498656, 2480074, 48246, 2259...",0


__'cm_key' is just a unique id for customers, so we won't be using that in our model,<br>
'mvar1' does not contribute to the data as it has only one variable, therefore we shall drop it,moreover 'mvar9' and 'mvar3' have null values wherever there values are zero(as mentioned in data dictionary).Apart from that there are no null values.__

In [10]:
print np.mean(train.mvar9==0)
print np.mean(train.mvar3==0)

0.040125
0.60885


__More than 50% data is missing for 'mvar3', therefore it is wise to drop this variable__

In [11]:
for data in full_data:
    data.drop(['mvar1','mvar3'],axis=1,inplace=True)

__'mvar49','mvar50','mvar51' are target variables, converting them to only one variable 'offer'__

In [12]:
train['offer']=0
train.loc[train.mvar49>0,'offer']=1
train.loc[train.mvar50>0,'offer']=2
train.loc[train.mvar51>0,'offer']=3

__Imputing null values for 'mvar9' using Random Forest Regressor__

In [13]:
regr=train[train['mvar9']!=0]
clf_reg=RandomForestRegressor(n_estimators=500,max_depth=12)
clf_reg.fit(regr[['mvar2','mvar5','mvar6','mvar7','mvar8','mvar11']].values, regr['mvar9'].values)
df=train[train.mvar9==0]
train.loc[train.mvar9==0,'mvar9']=clf_reg.predict(df[['mvar2','mvar5','mvar6','mvar7','mvar8','mvar11']].values)
df_t=test[test.mvar9==0]
test.loc[test.mvar9==0,'mvar9']=clf_reg.predict(df_t[['mvar2','mvar5','mvar6','mvar7','mvar8','mvar11']].values)

## Feature Transformation and Feature Engineering

__Transformations and most of the new features created are inferred from visualizations( available in 'Visualizations' notebook)__

Transforming skewed continous features by taking log or roots or natural number as seem fit to make it uniform in manner.

In [14]:
train['mvar6']=train['mvar6']**0.4
train['mvar7']=train['mvar7']**0.15
train['mvar9']=train['mvar9']**0.15
train['mvar16']=np.log(train.mvar16+1)
train['mvar17']=np.log(train.mvar17+1)
train['mvar18']=np.log(train.mvar18+1)
train['mvar19']=np.log(train.mvar19+1)
train['mvar20']=np.log(train.mvar20+1)
train['mvar21']=np.log(train.mvar21+1)
train['mvar22']=np.log(train.mvar22+1)
train['mvar23']=np.log(train.mvar23+1)
train['mvar24']=np.log(train.mvar24+1)
train['mvar25']=np.log(train.mvar25+1)
train['mvar26']=np.log(train.mvar26+1)
train['mvar27']=np.log(train.mvar27+1)
train['mvar28']=np.log(train.mvar28+1)
train['mvar29']=np.log(train.mvar29+1)
train['mvar30']=np.log(train.mvar30+1)
train['mvar31']=np.log(train.mvar31+1)
train['mvar36']=np.log(train.mvar36+1)
train['mvar37']=np.log(train.mvar37+1)
train['mvar38']=np.log(train.mvar38+1)
train['mvar39']=np.log(train.mvar39+1)

In [15]:
test['mvar6']=test['mvar6']**0.4
test['mvar7']=test['mvar7']**0.15
test['mvar9']=test['mvar9']**0.15
test['mvar16']=np.log(test.mvar16+1)
test['mvar17']=np.log(test.mvar17+1)
test['mvar18']=np.log(test.mvar18+1)
test['mvar19']=np.log(test.mvar19+1)
test['mvar20']=np.log(test.mvar20+1)
test['mvar21']=np.log(test.mvar21+1)
test['mvar22']=np.log(test.mvar22+1)
test['mvar23']=np.log(test.mvar23+1)
test['mvar24']=np.log(test.mvar24+1)
test['mvar25']=np.log(test.mvar25+1)
test['mvar26']=np.log(test.mvar26+1)
test['mvar27']=np.log(test.mvar27+1)
test['mvar28']=np.log(test.mvar28+1)
test['mvar29']=np.log(test.mvar29+1)
test['mvar30']=np.log(test.mvar30+1)
test['mvar31']=np.log(test.mvar31+1)
test['mvar36']=np.log(test.mvar36+1)
test['mvar37']=np.log(test.mvar37+1)
test['mvar38']=np.log(test.mvar38+1)
test['mvar39']=np.log(test.mvar39+1)

In [16]:
##No. of family members
for data in full_data:
    data['mvar2']=data['mvar2']+1

__As 'mvar12' contains 18 different categories, we first convert it to dummy variables then apply PCA on it to transform it to only one variable. We will be using all of the data i.e. training, test and final_dataset to perform our PCA.__

In [17]:
pc_1=pd.read_csv('Training_Dataset.csv')
pc_2=pd.read_csv('Leaderboard_Dataset.csv')
pc_3=pd.read_csv('Final_Dataset.csv')
dd_1=pd.DataFrame(pc_1['mvar12'])
dd_2=pd.DataFrame(pc_2['mvar12'])
dd_3=pd.DataFrame(pc_3['mvar12'])

In [18]:
pc_11=pd.get_dummies(dd_1)
pc_22=pd.get_dummies(dd_2)
pc_33=pd.get_dummies(dd_3)

In [19]:
pc=pd.concat([pc_11,pc_22,pc_33])
pca=PCA(n_components=1)
pca.fit(pc.values)

PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [20]:
train['new_1']=pca.transform(pc_11.values)
test['new_1']=pca.transform(pc_22.values)
final['new_1']=pca.transform(pc_33.values)

__Similarly we'll transform mvar40,mvar41,mvar42 and mvar43,mvar44,mvar45 into one variable each via PCA__

In [21]:
ee_1=pd.DataFrame(pc_1[['mvar40','mvar41','mvar42']])
ee_2=pd.DataFrame(pc_2[['mvar40','mvar41','mvar42']])
ee_3=pd.DataFrame(pc_3[['mvar40','mvar41','mvar42']])
pce=pd.concat([ee_1,ee_2,ee_3])

In [22]:
pca_2=PCA(n_components=1)
pca_2.fit(pce.values)
val=pca_2.transform(pce.values)

In [23]:
train['new_2']=val[:40000]
test['new_2']=val[40000:50000]
final['new_2']=val[50000:]

In [24]:
ff_1=pd.DataFrame(pc_1[['mvar43','mvar44','mvar45']])
ff_2=pd.DataFrame(pc_2[['mvar43','mvar44','mvar45']])
ff_3=pd.DataFrame(pc_3[['mvar43','mvar44','mvar45']])
pcf=pd.concat([ff_1,ff_2,ff_3])

In [25]:
pca_3=PCA(n_components=1)
pca_3.fit(pcf.values)
val_1=pca_3.transform(pcf.values)

In [26]:
train['new_3']=val[:40000]
test['new_3']=val[40000:50000]
final['new_3']=val[50000:]

__If we subtract car related spend(mvar28,mvar29,mvar30,mvar31) from total travel spend(mvar20,mvar21,mvar22,mvar23) we can get an approximate to flight related spend which is usually paid for by using card, so this can be an important new feature__

In [27]:
for data in full_data:
    data['new_4']=np.exp(data['mvar20'])-np.exp(data['mvar28'])
    data['new_5']=np.exp(data['mvar21'])-np.exp(data['mvar29'])
    data['new_6']=np.exp(data['mvar22'])-np.exp(data['mvar30'])
    data['new_7']=np.exp(data['mvar23'])-np.exp(data['mvar31'])

__If we subtract quarter-wise total spend from total income we'll get savings which can be an important feature in our analysis__

In [28]:
for data in full_data:
    data['new_8']=((data['mvar9'])**6.66)-np.exp(data['mvar36'])-np.exp(data['mvar37'])-np.exp(data['mvar38'])-np.exp(data['mvar39'])

__Some more new features__

In [29]:
for data in full_data:
    data['new_9']=((data['mvar4'])/data['mvar2'])
    data['new_9']=np.log(data.new_9+1)

In [30]:
## Function to convert infinite values to zero
def func(x):
    if np.isinf(x):
        return 0
    else:
        return x

In [31]:
for data in full_data:
    data['new_10']=(data.mvar14+data.mvar15)/data.mvar13
    data['new_10']=data['new_10'].apply(func)
    data.loc[data['new_10'].isnull(),'new_10']=0

In [32]:
train.head(2)

Unnamed: 0,cm_key,mvar2,mvar4,mvar5,mvar6,mvar7,mvar8,mvar9,mvar10,mvar11,...,new_1,new_2,new_3,new_4,new_5,new_6,new_7,new_8,new_9,new_10
0,1,1,4,18,15.657005,0.966519,0.1831,6.748233,1,6.0899,...,-0.007747,-0.303284,-0.303284,1169.11,229.16,249.9,580.6,-309667.3,1.609438,0.083333
1,2,3,4,19,16.037434,0.667858,0.0,9.327914,1,4.2016,...,0.729257,-0.303284,-0.303284,65.89,0.0,0.0,0.0,2831651.0,0.847298,0.090909


__ Feature engineering part is over, we'll now move onto applying different machine learning algorithms for predictive model__

## Modelling

__We'll treat the problem as a classification problem where we have to classify different customers according to which card to offer them first__

__For selection of most suitable algorithm and tuning of hyperparameters we'll use stratified shufle split as the data is pretty imbalanced and for scoring_metric we'll use a custom scorer which will put more emphasis on not getting false positives and false negatives__

In [33]:
## Aggregating features and target
features=[i for i in train.columns if i not in ['mvar40','mvar41','mvar42','cm_key','mvar44','mvar45','mvar46','mvar47','mvar48','mvar49','mvar50','mvar51','mvar12','offer']]
target=['offer']

In [34]:
## Creating a custom scorer
cost_mat=np.matrix([[0,12,12,12],[12,0,6,6],[12,6,0,6],[12,6,6,0]])
def COST(y,ypred):
    conf_mat=sklearn.metrics.confusion_matrix(y,ypred)
    cost=np.sum(conf_mat*cost_mat)
    return cost
COST_scorer=sklearn.metrics.make_scorer(COST,greater_is_better=False)

In [35]:
#Splitting train into training and test set using stratifiedshufflesplit
X = train[features].values
y = train[target].values
stratSplit = StratifiedShuffleSplit(y, 1, test_size=0.20,random_state=42)
StratifiedShuffleSplit(y, n_iter=1, test_size=0.25)
for train_idx,test_idx in stratSplit:
    xtrain=X[train_idx]
    ytrain=y[train_idx]
    xtest=X[test_idx]
    ytest=y[test_idx]

__Extra Trees Classifier__

In [48]:
clf_1=ExtraTreesClassifier(n_estimators=100,max_depth=6)
print np.mean(cross_val_score(clf_1,X=xtrain,y=np.ravel(ytrain),scoring=COST_scorer,cv=5,n_jobs=2))

-230397.6


__Random Forest Classifier__

In [49]:
clf_2=RandomForestClassifier(n_estimators=100,max_depth=6)
print np.mean(cross_val_score(clf_2,X=xtrain,y=np.ravel(ytrain),scoring=COST_scorer,cv=5,n_jobs=2))

-230342.4


__XGBoost__

In [50]:
clf_3=XGBClassifier(learning_rate=0.1,n_estimators=100,objective='multi:softprob')
print np.mean(cross_val_score(clf_3,X=xtrain,y=np.ravel(ytrain),scoring=COST_scorer,cv=5,n_jobs=2))

-229874.4


__XGBoost outperforms other two classifiers, let's tune it's hyperparameters for improving it's performance further, we'll keep the learning rate fixed at 0.1__

In [38]:
clf_i1=XGBClassifier(objective='multi:softprob',learning_rate=0.1,max_depth=5,min_child_weight=1,colsample_bytree=0.8,scale_pos_weight=1)
##Parameter dictionary for tuning
param_1={'n_estimators':[50,70,100,120]}
gs_1=GridSearchCV(clf_i1,param_1,cv=5,scoring=COST_scorer)
gs_1.fit(xtrain,np.ravel(ytrain))
print gs_1.best_params_
print gs_1.best_score_
print gs_1.grid_scores_

{'n_estimators': 120}
-229401.60225
[mean: -229934.40900, std: 74.90154, params: {'n_estimators': 50}, mean: -229792.79700, std: 79.81579, params: {'n_estimators': 70}, mean: -229579.20525, std: 76.49941, params: {'n_estimators': 100}, mean: -229401.60225, std: 66.77005, params: {'n_estimators': 120}]


In [39]:
clf_i2=XGBClassifier(objective='multi:softprob',learning_rate=0.1,n_estimators=120,colsample_bytree=0.8,scale_pos_weight=1)
param_2={'max_depth':range(6,10,1),'min_child_weight':range(1,4,1)}
gs_2=GridSearchCV(clf_i2,param_2,cv=5,scoring=COST_scorer)
gs_2.fit(xtrain,np.ravel(ytrain))
print gs_2.best_params_
print gs_2.best_score_
print gs_2.grid_scores_

{'max_depth': 8, 'min_child_weight': 3}
-229099.20075
[mean: -229250.40150, std: 112.92759, params: {'max_depth': 6, 'min_child_weight': 1}, mean: -229252.80900, std: 68.13633, params: {'max_depth': 6, 'min_child_weight': 2}, mean: -229235.99250, std: 56.28499, params: {'max_depth': 6, 'min_child_weight': 3}, mean: -229192.79925, std: 88.05089, params: {'max_depth': 7, 'min_child_weight': 1}, mean: -229159.20150, std: 53.45054, params: {'max_depth': 7, 'min_child_weight': 2}, mean: -229197.61125, std: 43.20000, params: {'max_depth': 7, 'min_child_weight': 3}, mean: -229135.18650, std: 91.89211, params: {'max_depth': 8, 'min_child_weight': 1}, mean: -229144.80525, std: 108.02666, params: {'max_depth': 8, 'min_child_weight': 2}, mean: -229099.20075, std: 66.42409, params: {'max_depth': 8, 'min_child_weight': 3}, mean: -229192.80450, std: 78.35917, params: {'max_depth': 9, 'min_child_weight': 1}, mean: -229209.58725, std: 131.32189, params: {'max_depth': 9, 'min_child_weight': 2}, mean: -

__Let's train this classifier on full-training data and make predictions on test data__

In [40]:
clf=XGBClassifier(learning_rate=0.1,n_estimators=120,max_depth=8,min_child_weight=3,objective='multi:softprob',colsample_bytree=0.8,scale_pos_weight=1)
clf.fit(train[features].values,np.ravel(train[target].values))

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=3, missing=None, n_estimators=120, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

Let's check out feature importances and whether our engineered features are working well

In [43]:
index=np.argsort(clf.feature_importances_)[::-1]
importances=clf.feature_importances_
for i in index:
    print features[i],importances[i]

mvar36 0.0537407
mvar11 0.0505465
mvar5 0.0475598
mvar7 0.0417315
mvar6 0.0378736
mvar37 0.0365669
mvar38 0.0357995
new_8 0.0355921
mvar39 0.0326468
mvar9 0.0297223
mvar16 0.0280837
mvar24 0.0270881
mvar25 0.0268392
mvar26 0.0263207
mvar27 0.0261133
new_10 0.0249103
mvar17 0.0234584
new_9 0.0234377
new_4 0.0233132
new_1 0.0224213
mvar18 0.0221102
mvar20 0.0204094
mvar13 0.0196835
mvar19 0.0195383
new_7 0.0180242
new_6 0.0177338
new_5 0.0172153
mvar8 0.0170701
mvar32 0.0162612
mvar21 0.0159293
mvar22 0.0156389
mvar23 0.0146848
mvar4 0.0135855
mvar35 0.0135648
mvar33 0.0119885
mvar34 0.0111796
mvar2 0.0110966
mvar28 0.010661
mvar29 0.0100595
mvar31 0.00898098
new_2 0.00873209
mvar10 0.00825503
mvar30 0.00771576
mvar14 0.0058283
mvar15 0.00439715
mvar43 0.00371269
new_3 0.00217784


Predictions for test set

In [47]:
pred=clf.predict(test[features].values)

Let's see the confusion matrix for predicted values of training set

In [45]:
conf_mat=sklearn.metrics.confusion_matrix(np.ravel(train[target].values),clf.predict(train[features].values))
print conf_mat

[[31103     0    11     8]
 [ 2170  1265     5     7]
 [ 2187     0   460     7]
 [ 1811     0     6   960]]


__Our classifier is doing quite fine as is visible from confusion matrix, but this is not a classifiaction problem we have to call our customers in a sorted way( i.e. calling most liable to buy a card first). We solve this problem using predictproba() method of XGBoost classifier__

In [48]:
prob=clf.predict_proba(test[features].values)
prob=pd.DataFrame(prob)

In [65]:
np.sum(prob[0]>0.6186)

9000

In [66]:
def proba(a,b,c,d):
    m=max(b,c,d)
    if a>0.6186:
        return a
    else:
        return m
    
def card(a,b,c,d):
    m=max(b,c,d)
    
    if a>0.6186:
        return 'None'
    else:
        if m==b:
            return 'Supp'
        if m==c:
            return 'Elite'
        if m==d:
            return 'Credit'        

In [67]:
prob['pr']=prob.apply(lambda row: proba(row[0],row[1],row[2],row[3]),axis=1)
prob['card']=prob.apply(lambda row: card(row[0],row[1],row[2],row[4]),axis=1)

In [72]:
prob.drop([0,1,2,3],axis=1,inplace=True)
prob.drop(prob[prob.card=='None'].index,axis=0,inplace=True)
prob=prob.sort_values(by='pr',ascending=False)

In [73]:
subm=pd.read_csv('Leaderboard_Dataset.csv')
prob['key']=subm['cm_key']
prob.drop('pr',axis=1,inplace=True)
prob.head()

Unnamed: 0,card,key
9688,Elite,59689
8681,Elite,58682
7338,Supp,57339
2963,Elite,52964
4699,Supp,54700


In [74]:
prob.to_csv('Submission.csv')