In [1]:
#loading libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer
import matplotlib.pyplot as plt

In [2]:
#reading data from files
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
#checking for null values in data
print('Train\n',train.isnull().any())
print('\nTest\n',test.isnull().any())

Train
 User_ID         False
Description     False
Browser_Used    False
Device_Used     False
Is_Response     False
dtype: bool

Test
 User_ID         False
Description     False
Browser_Used    False
Device_Used     False
dtype: bool


In [5]:
train.describe()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
count,38932,38932,38932,38932,38932
unique,38932,38932,11,3,2
top,id39218,We stayed here for - nights early July. The lo...,Firefox,Desktop,happy
freq,1,1,7367,15026,26521


In [6]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [7]:
# join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test],ignore_index=True)

In [8]:
alldata['Browser_Used'].value_counts()

Firefox              13043
Edge                 12437
InternetExplorer      8191
Google Chrome         8050
Mozilla Firefox       7526
Mozilla               5425
Chrome                4356
IE                    4270
Internet Explorer     3700
Safari                 670
Opera                  668
Name: Browser_Used, dtype: int64

In [9]:
browser_map = {'Firefox':'Firefox', 'Edge': 'Edge', 'InternetExplorer': 'Internet Explorer', 'Google Chrome': 'Chrome',
              'Mozilla Firefox': 'Firefox', 'Mozilla': 'Firefox', 'Chrome': 'Chrome', 'IE': 'Internet Explorer',
              'Internet Explorer': 'Internet Explorer', 'Safari': 'Safari', 'Opera': 'Opera'}
alldata['Browser_Used'] = alldata['Browser_Used'].map(browser_map)

In [10]:
alldata['Browser_Used'].value_counts()

Firefox              25994
Internet Explorer    16161
Edge                 12437
Chrome               12406
Safari                 670
Opera                  668
Name: Browser_Used, dtype: int64

In [11]:
alldata['Device_Used'].value_counts()

Desktop    26375
Mobile     26214
Tablet     15747
Name: Device_Used, dtype: int64

In [12]:
alldata.pivot_table(index='Device_Used', columns='Browser_Used',values='Is_Response', aggfunc=len)

Browser_Used,Chrome,Edge,Firefox,Internet Explorer,Opera,Safari
Device_Used,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Desktop,4903,4693,9872,6389,252,266
Mobile,4842,4579,9947,6345,262,239
Tablet,2661,3165,6175,3427,154,165


In [13]:
# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [14]:
# initialise the functions - we'll create separate models for each type.
countvec = CountVectorizer(analyzer='word', ngram_range = (1,2), min_df=150, max_features=1000)
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,2), min_df=150, max_features=1000)

In [15]:
# create features
bagofwords = countvec.fit_transform(alldata['Description'])
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

In [16]:
# label encode categorical features in data given
cols = ['Browser_Used','Device_Used']
alldata_dummy = pd.get_dummies(alldata, columns=cols)

In [17]:
# create dataframe for features
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidfdata.todense())

In [18]:
# set column names
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

In [19]:
# create separate data frame for bag of words and tf-idf

bow_df_train = bow_df[:len(train)]
bow_df_test = bow_df[len(train):]

tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]

In [20]:
# split the merged data file into train and test respectively
train_feats = alldata_dummy[~pd.isnull(alldata.Is_Response)]
test_feats = alldata_dummy[pd.isnull(alldata.Is_Response)]

In [21]:
# set target variable
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
cols = list(alldata_dummy.columns)
cols.remove('User_ID')
cols.remove('Description')
cols.remove('Is_Response')

In [23]:
# merge count (bag of word) features into train
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)

test_feats1.reset_index(drop=True, inplace=True)

In [24]:
# merge into a new data frame with tf-idf features
train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

test_feats2.reset_index(drop=True, inplace=True)

In [25]:
#Naive Bayes Model
mod1 = GaussianNB()
target = train_feats['Is_Response']

In [26]:
# Naive Bayes 1
print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.8030303   0.79052145  0.80310814  0.79976882  0.80002569]


In [27]:
# Naive Bayes 2 - tfidf is giving higher CV score
print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.82331793  0.8267403   0.82751092  0.826355    0.81877729]


In [28]:
# make our first set of predictions

clf1 = GaussianNB()
clf1.fit(train_feats1, target)

clf2 = GaussianNB()
clf2.fit(train_feats2, target)

GaussianNB(priors=None)

In [29]:
preds1 = clf1.predict(test_feats1)
preds2 = clf2.predict(test_feats2)

In [30]:
sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})
sub1['Is_Response'] = sub1['Is_Response'].map({1: "happy", 0:"not_happy"})

In [31]:
sub2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2})
sub2['Is_Response'] = sub2['Is_Response'].map({1: "happy", 0:"not_happy"})

In [32]:
sub1 = sub1[['User_ID', 'Is_Response']]
sub2 = sub2[['User_ID', 'Is_Response']]

In [33]:
## write submission files
sub1.to_csv('submissions/sub1_cv.csv', index=False)
sub2.to_csv('submissions/sub2_tf.csv', index=False)

# LightGBM - 1

In [34]:
import lightgbm as lgb

In [35]:
# set the data in format lgb accepts
d_train = lgb.Dataset(train_feats1, label = target)

In [36]:
## set parameters
## you can tune the parameters can try to better score

params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.05, 
    'max_depth': 7, 
    'num_leaves': 21, 
    'feature_fraction': 0.3, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 5}

In [37]:
lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True,
                verbose_eval=20, early_stopping_rounds=40)

[20]	cv_agg's binary_error: 0.206411 + 0.00271799
[40]	cv_agg's binary_error: 0.183345 + 0.00271871
[60]	cv_agg's binary_error: 0.164184 + 0.00412972
[80]	cv_agg's binary_error: 0.155502 + 0.0042282
[100]	cv_agg's binary_error: 0.14908 + 0.0027582
[120]	cv_agg's binary_error: 0.144251 + 0.00252273
[140]	cv_agg's binary_error: 0.140424 + 0.00274868
[160]	cv_agg's binary_error: 0.13652 + 0.00282142
[180]	cv_agg's binary_error: 0.134183 + 0.00270876
[200]	cv_agg's binary_error: 0.131614 + 0.00267329
[220]	cv_agg's binary_error: 0.129687 + 0.00344733
[240]	cv_agg's binary_error: 0.128994 + 0.00399571
[260]	cv_agg's binary_error: 0.127607 + 0.00414706
[280]	cv_agg's binary_error: 0.126143 + 0.00410722
[300]	cv_agg's binary_error: 0.125269 + 0.00422743
[320]	cv_agg's binary_error: 0.123831 + 0.004572
[340]	cv_agg's binary_error: 0.123189 + 0.00454956
[360]	cv_agg's binary_error: 0.122136 + 0.00393772
[380]	cv_agg's binary_error: 0.122239 + 0.00410518
[400]	cv_agg's binary_error: 0.121725 + 0

In [38]:
## get nround value which hd lowest error
nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))

In [39]:
## train the model
model = lgb.train(params, d_train, num_boost_round=nround)

In [40]:
## make predictions
preds = model.predict(test_feats1)

In [41]:
# make submission

def to_labels(x):
    if x > 0.66:  # cutoff - you can change it and see if accuracy improves or plot AUC curve. 
        return "happy"
    return "not_happy"

sub3 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})
sub3['Is_Response'] = sub3['Is_Response'].map(lambda x: to_labels(x))
sub3 = sub3[['User_ID','Is_Response']]
sub3.to_csv('submissions/sub3_lgb.csv', index=False) # 0.85518

# PCA

In [42]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [43]:
scaler = StandardScaler()
train_feats1_scaled = scaler.fit_transform(train_feats1)
test_feats1_scaled = scaler.fit_transform(test_feats1)

In [44]:
pca= PCA(n_components=1000)
pca.fit(train_feats1_scaled)

PCA(copy=True, iterated_power='auto', n_components=1000, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [45]:
train_feats1_pca = pca.transform(train_feats1_scaled)
test_feats1_pca = pca.transform(test_feats1_scaled)

In [46]:
# Naive Bayes 1
print(cross_val_score(mod1, train_feats1_pca, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.63841808  0.6329309   0.63575649  0.63871051  0.64076548]


In [47]:
train_feats1_pca.shape

(38932, 1000)

In [48]:
plt.plot(cum_sum)
plt.show()

NameError: name 'cum_sum' is not defined

In [None]:
train_feats1.describe()