In [1]:
#Tools importing

#Data analysis and wrangling
import numpy as np
import pandas as pd 

#Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#Machine learning
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel

In [2]:
traindata=pd.read_csv('train.csv')
testdata=pd.read_csv('test.csv')

In [3]:
traindata.head()

Unnamed: 0,Id,date,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label
0,0,"Thu, 17 Mar 2016 01:57:16 +0000",coursera,org,0,0,multipart/alternative,23,188,0,1,38,136818,0
1,1,"Fri, 19 Jan 2018 05:20:29 +0000",google,com,0,0,multipart/alternative,1,6,0,0,44,2467,0
2,2,"Mon, 21 Aug 2017 10:54:50 +0530",iiitd,ac.in,1,0,multipart/mixed,0,1,1,0,78,2809449,2
3,3,"Fri, 3 Mar 2017 03:46:08 +0000",,,0,0,multipart/alternative,4,43,0,0,61,13775,0
4,4,"Thu, 8 Jun 2017 03:51:09 +0000 (UTC)",linkedin,com,0,0,multipart/alternative,4,26,0,0,29,22601,1


In [4]:
traindata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25066 entries, 0 to 25065
Data columns (total 14 columns):
Id                  25066 non-null int64
date                25066 non-null object
org                 24063 non-null object
tld                 24063 non-null object
ccs                 25066 non-null int64
bcced               25066 non-null int64
mail_type           25043 non-null object
images              25066 non-null int64
urls                25066 non-null int64
salutations         25066 non-null int64
designation         25066 non-null int64
chars_in_subject    25066 non-null int64
chars_in_body       25066 non-null int64
label               25066 non-null int64
dtypes: int64(10), object(4)
memory usage: 2.7+ MB


### Correct redundant mail types

In [5]:
#Show types of mails

print(traindata[['mail_type']].groupby(['mail_type'],as_index=False).size())
print(testdata[['mail_type']].groupby(['mail_type'],as_index=False).size())

mail_type
Multipart/Alternative        1
Multipart/Mixed             21
Text/Html                    1
multipart/IDM                1
multipart/alternative    18903
multipart/mixed           1556
multipart/related          488
text/html                 3292
text/plain                 780
dtype: int64
mail_type
Multipart/Alternative       1
Multipart/Mixed             4
Text/Html                   6
multipart/alternative    7955
multipart/mixed           746
multipart/related         257
text/html                1340
text/html                  26
text/plain                333
dtype: int64


In [6]:
#Create copies of training and test sets

Xdata=traindata.copy()
Xtest=testdata.copy()

#Avoid redundances due to large letters in mail_type

Xdata.mail_type=Xdata.mail_type.replace(['Multipart/Alternative','multipart/alternative'],'multipart/alternative')
Xdata.mail_type=Xdata.mail_type.replace(['Multipart/Mixed','multipart/mixed'],'multipart/mixed')
Xdata.mail_type=Xdata.mail_type.replace(['Text/Html','text/html'],'text/html')
Xtest.mail_type=Xtest.mail_type.replace(['Multipart/Alternative','multipart/alternative'],'multipart/alternative')
Xtest.mail_type=Xtest.mail_type.replace(['Multipart/Mixed','multipart/mixed'],'multipart/mixed')
Xtest.mail_type=Xtest.mail_type.replace(['Text/Html','text/html','text/html '],'text/html')

In [7]:
print(Xdata[['mail_type','label']].groupby(['mail_type'],as_index=False).mean())
print(Xtest[['mail_type','Id']].groupby(['mail_type'],as_index=False).mean())

               mail_type     label
0          multipart/IDM  0.000000
1  multipart/alternative  1.306760
2        multipart/mixed  1.604946
3      multipart/related  1.487705
4              text/html  0.995445
5             text/plain  0.288462
               mail_type           Id
0  multipart/alternative  5388.295249
1        multipart/mixed  5268.373333
2      multipart/related  5452.023346
3              text/html  5334.046647
4             text/plain  5294.936937


### Dropping null values

In [8]:
Xdata.fillna('NA', inplace=True)
Xtest.fillna('NA', inplace=True)

Xdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25066 entries, 0 to 25065
Data columns (total 14 columns):
Id                  25066 non-null int64
date                25066 non-null object
org                 25066 non-null object
tld                 25066 non-null object
ccs                 25066 non-null int64
bcced               25066 non-null int64
mail_type           25066 non-null object
images              25066 non-null int64
urls                25066 non-null int64
salutations         25066 non-null int64
designation         25066 non-null int64
chars_in_subject    25066 non-null int64
chars_in_body       25066 non-null int64
label               25066 non-null int64
dtypes: int64(10), object(4)
memory usage: 2.7+ MB


### Exploiting date

In [11]:
#Create new feature date_length

Xdata['date_length']=Xdata.date.apply(len)
Xtest['date_length']=Xtest.date.apply(len)

#Drop date feature

Xdata=Xdata.drop(['date'],axis=1)
Xtest=Xtest.drop(['date'],axis=1)

### Dropping id, date features

In [12]:
Xtrain=Xdata.drop(['Id'],axis=1)
Xtest=Xtest.drop(['Id'],axis=1)

### Working on CC feature

In [13]:
bins=[0,1,3,float("inf")]

Xtrain['cc_cut']=pd.cut(Xtrain.ccs,bins,right=False)
Xtest['cc_cut']=pd.cut(Xtest.ccs,bins,right=False)

Xtrain=pd.concat([Xtrain.drop(['cc_cut','ccs'],axis=1),pd.get_dummies(Xtrain['cc_cut'],prefix='cc_cut')],axis=1)
Xtest=pd.concat([Xtest.drop(['cc_cut','ccs'],axis=1),pd.get_dummies(Xtest['cc_cut'],prefix='cc_cut')],axis=1)

Xtrain.head(6)

Unnamed: 0,org,tld,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label,date_length,"cc_cut_[0.0, 1.0)","cc_cut_[1.0, 3.0)","cc_cut_[3.0, inf)"
0,coursera,org,0,multipart/alternative,23,188,0,1,38,136818,0,31,1,0,0
1,google,com,0,multipart/alternative,1,6,0,0,44,2467,0,31,1,0,0
2,iiitd,ac.in,0,multipart/mixed,0,1,1,0,78,2809449,2,31,0,1,0
3,,,0,multipart/alternative,4,43,0,0,61,13775,0,30,1,0,0
4,linkedin,com,0,multipart/alternative,4,26,0,0,29,22601,1,36,1,0,0
5,iiitd,ac.in,0,multipart/alternative,0,28,1,0,37,15848,2,31,1,0,0


### Working on image/char_in_body feature

In [14]:
#Create frequency feature

Xtrain['frequency']=(Xtrain['images']*Xtrain['images'])/Xtrain['chars_in_body']
Xtest['frequency']=(Xtest['images']*Xtest['images'])/Xtest['chars_in_body']

#Drop images which is an unnecessary feature

Xtrain=Xtrain.drop(['images'],axis=1)
Xtest=Xtest.drop(['images'],axis=1)

Xtrain.head()

Unnamed: 0,org,tld,bcced,mail_type,urls,salutations,designation,chars_in_subject,chars_in_body,label,date_length,"cc_cut_[0.0, 1.0)","cc_cut_[1.0, 3.0)","cc_cut_[3.0, inf)",frequency
0,coursera,org,0,multipart/alternative,188,0,1,38,136818,0,31,1,0,0,0.003866
1,google,com,0,multipart/alternative,6,0,0,44,2467,0,31,1,0,0,0.000405
2,iiitd,ac.in,0,multipart/mixed,1,1,0,78,2809449,2,31,0,1,0,0.0
3,,,0,multipart/alternative,43,0,0,61,13775,0,30,1,0,0,0.001162
4,linkedin,com,0,multipart/alternative,26,0,0,29,22601,1,36,1,0,0,0.000708


### Working on org & tld features and exploit their number of apperances

In [15]:
Xtrain['count_org'] = Xtrain.groupby('org')['org'].transform('count')
Xtrain['count_tld'] = Xtrain.groupby('tld')['tld'].transform('count')

Xtest['count_org'] = Xtest.groupby('org')['org'].transform('count')
Xtest['count_tld'] = Xtest.groupby('tld')['tld'].transform('count')

Xtrain.head()

Unnamed: 0,org,tld,bcced,mail_type,urls,salutations,designation,chars_in_subject,chars_in_body,label,date_length,"cc_cut_[0.0, 1.0)","cc_cut_[1.0, 3.0)","cc_cut_[3.0, inf)",frequency,count_org,count_tld
0,coursera,org,0,multipart/alternative,188,0,1,38,136818,0,31,1,0,0,0.003866,192,752
1,google,com,0,multipart/alternative,6,0,0,44,2467,0,31,1,0,0,0.000405,1141,11895
2,iiitd,ac.in,0,multipart/mixed,1,1,0,78,2809449,2,31,0,1,0,0.0,4388,4388
3,,,0,multipart/alternative,43,0,0,61,13775,0,30,1,0,0,0.001162,1003,1003
4,linkedin,com,0,multipart/alternative,26,0,0,29,22601,1,36,1,0,0,0.000708,978,11895


### Build correlation matrix

In [16]:
corr = Xtrain.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,bcced,urls,salutations,designation,chars_in_subject,chars_in_body,label,date_length,"cc_cut_[0.0, 1.0)","cc_cut_[1.0, 3.0)","cc_cut_[3.0, inf)",frequency,count_org,count_tld
bcced,1.0,-0.0203229,0.0237493,-0.000650006,-0.00111647,0.115862,0.0126845,-0.0210099,-0.0269127,0.026213,0.00955853,-0.0158954,0.0498884,-0.00552559
urls,-0.0203229,1.0,-0.0222952,0.151054,0.196495,0.0195937,0.169685,0.133741,0.191761,-0.122278,-0.141662,0.357538,-0.207946,0.0901618
salutations,0.0237493,-0.0222952,1.0,0.081489,0.019893,0.0614307,0.0724128,-0.0552603,-0.219302,0.12649,0.177232,-0.0799466,0.277572,-0.13101
designation,-0.000650006,0.151054,0.081489,1.0,0.134499,-0.00394399,0.0224341,0.214749,0.0336144,-0.0200208,-0.0264446,-0.0148659,-0.040869,0.101818
chars_in_subject,-0.00111647,0.196495,0.019893,0.134499,1.0,0.0256846,0.0215464,0.117842,0.0940389,-0.0705725,-0.0573729,0.030124,-0.177464,0.0910888
chars_in_body,0.115862,0.0195937,0.0614307,-0.00394399,0.0256846,1.0,0.0414409,-0.0296868,-0.0816563,0.0546344,0.057397,-0.0241099,0.118641,-0.032468
label,0.0126845,0.169685,0.0724128,0.0224341,0.0215464,0.0414409,1.0,0.0504651,-0.191122,0.128272,0.13389,0.184321,0.167438,-0.210014
date_length,-0.0210099,0.133741,-0.0552603,0.214749,0.117842,-0.0296868,0.0504651,1.0,0.163999,-0.105769,-0.119792,0.109508,-0.266771,0.263516
"cc_cut_[0.0, 1.0)",-0.0269127,0.191761,-0.219302,0.0336144,0.0940389,-0.0816563,-0.191122,0.163999,1.0,-0.732612,-0.630453,0.141878,-0.617705,0.105701
"cc_cut_[1.0, 3.0)",0.026213,-0.122278,0.12649,-0.0200208,-0.0705725,0.0546344,0.128272,-0.105769,-0.732612,1.0,-0.0664583,-0.0962895,0.38331,-0.0499338


### Labelize categorical features

In [17]:
encode_org=LabelBinarizer()
encode_tld=LabelBinarizer()
encode_mail_type=LabelBinarizer()

encode_org.fit(Xtrain[['org']])
encode_tld.fit(Xtrain[['tld']])
encode_mail_type.fit(Xtrain[['mail_type']])

encode_org.fit(Xtest[['org']])
encode_tld.fit(Xtest[['tld']])
encode_mail_type.fit(Xtest[['mail_type']])

Xtrain=pd.concat([Xtrain.drop('org', 1),pd.DataFrame(encode_org.transform(Xtrain[['org']]))], axis=1).reindex()
Xtrain=pd.concat([Xtrain.drop('tld', 1),pd.DataFrame(encode_tld.transform(Xtrain[['tld']]))], axis=1).reindex()
Xtrain=pd.concat([Xtrain.drop('mail_type', 1),pd.DataFrame(encode_mail_type.transform(Xtrain[['mail_type']]))], axis=1).reindex()

Xtest=pd.concat([Xtest.drop('org', 1),pd.DataFrame(encode_org.transform(Xtest[['org']]))], axis=1).reindex()
Xtest=pd.concat([Xtest.drop('tld', 1),pd.DataFrame(encode_tld.transform(Xtest[['tld']]))], axis=1).reindex()
Xtest=pd.concat([Xtest.drop('mail_type', 1),pd.DataFrame(encode_mail_type.transform(Xtest[['mail_type']]))], axis=1).reindex()

In [18]:
Xtrain.head()

Unnamed: 0,bcced,urls,salutations,designation,chars_in_subject,chars_in_body,label,date_length,"cc_cut_[0.0, 1.0)","cc_cut_[1.0, 3.0)",...,120,121,122,123,0,1,2,3,4,5
0,0,188,0,1,38,136818,0,31,1,0,...,0,0,0,0,0,1,0,0,0,0
1,0,6,0,0,44,2467,0,31,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0,1,1,0,78,2809449,2,31,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,43,0,0,61,13775,0,30,1,0,...,0,0,0,0,0,1,0,0,0,0
4,0,26,0,0,29,22601,1,36,1,0,...,0,0,0,0,0,1,0,0,0,0


### Normalize our datasets for some algorithms that don't work otherwise

In [19]:
X_train=np.asarray(Xtrain.drop(['label'],axis=1))
X_train=preprocessing.StandardScaler().fit(X_train).transform(X_train)

X_test=np.asarray(Xtest)
X_test=preprocessing.StandardScaler().fit(X_test).transform(X_test)

y_train=np.asarray(Xtrain.label)

print('Train set',X_train.shape,y_train.shape)
print('Test set',X_test.shape)

Train set (25066, 557) (25066,)
Test set (10745, 557)


### Train many models

In [19]:
#Cross validation of 5 algorithms (in reality, I have worked on way more, but those are the 5 bests)

models=[LogisticRegression(solver='liblinear'),
        RandomForestClassifier(n_estimators=100,oob_score=True),
        svm.SVC(),
        KNeighborsClassifier(),
        XGBClassifier()]

model_results=pd.DataFrame(data={'Model':[],'test_score_mean':[],'fit_time_mean':[]})

cross_validation_split=model_selection.ShuffleSplit(n_splits=5,test_size=.2,train_size=.8)

for model in models:
    cross_validation_results=model_selection.cross_validate(model,X_train,y_train,cv=cross_validation_split,scoring='f1_micro',return_train_score=True,verbose=3,n_jobs=-1)
    model_results=model_results.append({'Model':type(model).__name__,'test_score_mean':cross_validation_results['test_score'].mean(),'fit_time_mean':cross_validation_results['fit_time'].mean()},ignore_index=True)

model_results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.0min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   29.1s remaining:   43.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   43.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   46.4s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  5.8min remaining:  8.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  9.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_

Unnamed: 0,Model,test_score_mean,fit_time_mean
0,LogisticRegression,0.897473,53.815916
1,RandomForestClassifier,0.944309,24.10253
2,AdaBoostClassifier,0.586064,33.762582
3,SVC,0.898511,141.296922
4,KNeighborsClassifier,0.927739,6.90021
5,XGBClassifier,0.89883,141.420607


### Grid search for best classifier

In [53]:
#Grid search for Random Forest's n_estimator

param_grid={'n_estimators':[100,200,500,800,1100],
           'criterion':['gini','entropy']}

gs=model_selection.GridSearchCV(RandomForestClassifier(),param_grid,cv=cross_validation_split,scoring='f1_micro',
                                return_train_score=True,n_jobs=-1,verbose=3)
gs.fit(X_train,y_train)

print(gs.best_params_,gs.best_params_,gs.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 11.4min finished


{'criterion': 'gini', 'n_estimators': 800} {'criterion': 'gini', 'n_estimators': 800} 0.9385904255319149


In [34]:
#800 is the best parameter for RFC, so we predict with RFC(n_estimator=800)

best_model=RandomForestClassifier(n_estimators=800,oob_score=True).fit(X_train,y_train)
ypredtrain=best_model.predict(X_train)

print("%.4f" % best_model.oob_score_)

importance_df=pd.concat((pd.DataFrame(Xtrain.drop(['label'],axis=1).columns,columns=['feature']),
                         pd.DataFrame(best_model.feature_importances_,columns=['importance'])),axis=1).sort_values(by='importance',ascending=False)

importance_df

0.9471


Unnamed: 0,feature,importance
11,count_org,0.130788
1,urls,0.089071
5,chars_in_body,0.087489
10,frequency,0.079084
432,5,0.059443
...,...,...
340,327,0.000000
125,112,0.000000
130,117,0.000000
134,121,0.000000


In [83]:
model = SelectFromModel(best_model, prefit=True)
X_train= model.transform(X_train)
X_test=model.transform(X_test)
X_train.shape

(25066, 49)

In [86]:
best_model=RandomForestClassifier(n_estimators=800,oob_score=True).fit(X_train,y_train)
ypredtrain=best_model.predict(X_train)

print("%.4f" % best_model.oob_score_)

importance_df=pd.concat((pd.DataFrame(Xtrain.drop(['label'],axis=1).columns,columns=['feature']),pd.DataFrame(best_model.feature_importances_,columns=['importance'])),axis=1).sort_values(by='importance',ascending=False)
importance_df

0.9441


Unnamed: 0,feature,importance
9,frequency,0.181710
0,bcced,0.104612
4,chars_in_subject,0.098754
8,"cc_cut_[3.0, inf)",0.090299
18,6,0.068111
...,...,...
551,1,
552,2,
553,3,
554,4,


In [84]:
importance_df.head(20)

Unnamed: 0,feature,importance
10,count_org,0.137202
1,urls,0.092094
5,chars_in_body,0.087353
9,frequency,0.083567
4,chars_in_subject,0.056279
431,5,0.053389
205,193,0.051302
11,count_tld,0.040918
236,224,0.027069
6,"cc_cut_[0.0, 1.0)",0.02696


In [87]:
ypredtest=best_model.predict(X_test)
submission=testdata.copy()
submission['label']=ypredtest

submission.to_csv('submissionrfc_800.csv',columns=['Id','label'],index=False)
submission[['Id','label']].head()

Unnamed: 0,Id,label
0,0,2
1,1,0
2,2,0
3,3,3
4,4,0


### Neural Network 

In [20]:
import tensorflow
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Using TensorFlow backend.


In [21]:
model = Sequential()
N=557

model.add(Dense(N, activation='relu'))
model.add(Dense(int(N/2), activation='relu'))
model.add(Dense(4, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=100, batch_size=64)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [31]:
#argmax function I have made by myself

def maxelements(liste):

    max_index = []
    
    if liste:
        maxi = liste[0]
        
        for i,j in ((i,j) for i,j in enumerate(liste) if j >= maxi):
            
            if j == maxi:
                max_index.append(i)
                
            else:
                maxi = j
                max_index = [i]
                
    return max_index

In [32]:
y_pred=model.predict(X_test)

Y_pred=np.zeros((10745,))
Y_pred.astype('int32')

for i in range(len(Y_pred)):
    Y_pred[i]=maxelements(y_pred[i].tolist())[0] #choose the highest probability in the 4-element output

y_pred_t=Y_pred.astype('int32') #transform floar as integer so the kaggle submission works (label integers were demanded)

In [33]:
y_pred_t

array([2, 0, 0, ..., 3, 0, 0])