In [1]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [2]:
# load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [5]:
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [8]:
alldat = cleanData(alldata['Description'], lowercase=True, remove_stops=True, stemming=True)

In [9]:
alldat = alldat.split(" ")

In [10]:
print alldat

[u'0', u'room', u'kind', u'clean', u'stro', u'1', u'stay', u'crown', u'plaza', u'april', u'april', u'2', u'book', u'hotel', u'hotwir', u'low', u'3', u'stay', u'husband', u'son', u'way', u'4', u'girlfriend', u'stay', u'celebr', u'5', u'room', u'one', u'nice', u'clearli', u'6', u'husband', u'stay', u'hotel', u'f', u'7', u'wife', u'stay', u'gloriou', u'citi', u'whi', u'8', u'boyfriend', u'stay', u'fairmont', u'9', u'wonder', u'staff', u'great', u'locat', u'de', u'10', u'step', u'time', u'squar', u'nice', u'room', u'stay', u'n', u'11', u'wife', u'kid', u'stay', u'valenti', u'12', u'stay', u'jolli', u'madison', u'xma', u'per', u'13', u'highli', u'recommend', u'hawthorn', u'terrac', u'14', u'found', u'hotel', u'clean', u'nice', u'locat', u'go', u'15', u'stay', u'elan', u'th', u'th', u'octob', u'16', u'pricelin', u'sent', u'us', u'hotel', u'acceptin', u'17', u'old', u'cheap', u'furnituresour', u'chair', u'simpli', u'18', u'stay', u'night', u'realli', u'ha', u'19', u'servic', u'fine', u'hotel'

In [7]:
# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

KeyboardInterrupt: 

In [7]:
# initialise the functions - we'll create separate models for each type.
countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)

In [8]:
# create features
bagofwords = countvec.fit_transform(alldata['Description'])
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

In [9]:
print(bagofwords)

  (0, 293)	1
  (0, 408)	1
  (0, 50)	1
  (0, 1)	1
  (0, 223)	1
  (0, 19)	1
  (0, 164)	1
  (0, 59)	1
  (0, 357)	1
  (0, 334)	1
  (0, 7)	1
  (0, 397)	1
  (0, 493)	1
  (0, 430)	1
  (0, 292)	1
  (0, 345)	1
  (0, 498)	1
  (0, 211)	2
  (0, 395)	1
  (0, 160)	1
  (0, 300)	1
  (0, 33)	1
  (0, 451)	1
  (0, 184)	1
  (0, 65)	2
  :	:
  (68335, 169)	2
  (68335, 359)	1
  (68335, 106)	1
  (68335, 170)	1
  (68335, 473)	1
  (68335, 202)	1
  (68335, 139)	1
  (68335, 220)	1
  (68335, 485)	4
  (68335, 380)	1
  (68335, 88)	1
  (68335, 218)	8
  (68335, 376)	1
  (68335, 302)	1
  (68335, 212)	2
  (68335, 66)	1
  (68335, 24)	4
  (68335, 1)	1
  (68335, 19)	5
  (68335, 160)	2
  (68335, 65)	1
  (68335, 296)	2
  (68335, 462)	1
  (68335, 358)	2
  (68335, 410)	14


In [10]:
print(tfidfdata)

  (0, 410)	0.13273036764
  (0, 358)	0.0899677660899
  (0, 462)	0.232881494521
  (0, 226)	0.270126250927
  (0, 296)	0.168157570626
  (0, 82)	0.131084455044
  (0, 65)	0.195467351764
  (0, 184)	0.110847824195
  (0, 451)	0.100695333724
  (0, 33)	0.27909545746
  (0, 300)	0.252471529002
  (0, 160)	0.0836944843052
  (0, 395)	0.111349231137
  (0, 211)	0.26920998804
  (0, 498)	0.113221313515
  (0, 345)	0.242664759339
  (0, 292)	0.109471222058
  (0, 430)	0.194996185206
  (0, 493)	0.125176781752
  (0, 397)	0.209408409069
  (0, 7)	0.154014760258
  (0, 334)	0.191154409127
  (0, 357)	0.186441248048
  (0, 59)	0.146947453494
  (0, 164)	0.176866809407
  :	:
  (68335, 110)	0.123318650965
  (68335, 407)	0.0974446802074
  (68335, 469)	0.0794572254308
  (68335, 73)	0.136618253452
  (68335, 286)	0.0669001684072
  (68335, 347)	0.0846144283958
  (68335, 179)	0.0656976381432
  (68335, 427)	0.0954558217411
  (68335, 389)	0.101958611271
  (68335, 399)	0.092922907913
  (68335, 431)	0.109671314344
  (68335, 55)	0.

In [8]:
# label encode categorical features in data given
cols = ['Browser_Used','Device_Used']

for x in cols:
    lbl = LabelEncoder()
    alldata[x] = lbl.fit_transform(alldata[x])

In [9]:
# create dataframe for features
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidfdata.todense())

In [10]:
# set column names
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

In [11]:
# create separate data frame for bag of words and tf-idf

bow_df_train = bow_df[:len(train)]
bow_df_test = bow_df[len(train):]

tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]

In [12]:
# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

In [13]:
### set target variable

train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# merge count (bag of word) features into train
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)

test_feats1.reset_index(drop=True, inplace=True)

In [15]:
# merge into a new data frame with tf-idf features
train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

In [16]:
# let's check cross validation score of the model
# cv score acts a unbiased estimate of models accuracy on unseen data

mod1 = GaussianNB()
target = train_feats['Is_Response']

In [20]:
## Naive Bayes 1
print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.7548793   0.74518366  0.75160545  0.75032109  0.756486  ]


In [19]:
## Naive Bayes 2 - tfidf is giving higher CV score
print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.78929122  0.795402    0.79244798  0.79822759  0.79874133]


In [21]:
# make our first set of predictions

clf1 = GaussianNB()
clf1.fit(train_feats1, target)

clf2 = GaussianNB()
clf2.fit(train_feats2, target)

GaussianNB(priors=None)

In [22]:
preds1 = clf1.predict(test_feats1)
preds2 = clf2.predict(test_feats2)

In [23]:
def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

In [24]:
sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})
sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))

In [25]:
sub2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2})
sub2['Is_Response'] = sub2['Is_Response'].map(lambda x: to_labels(x))

In [26]:
sub1 = sub1[['User_ID', 'Is_Response']]
sub2 = sub2[['User_ID', 'Is_Response']]

In [27]:

## write submission files
sub1.to_csv('sub1_cv.csv', index=False)
sub2.to_csv('sub2_tf.csv', index=False)

In [28]:
import lightgbm as lgb

In [29]:
# set the data in format lgb accepts
d_train = lgb.Dataset(train_feats1, label = target)

In [33]:
## set parameters
## you can tune the parameters can try to better score

params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.078666, 
    'max_depth': 7, 
    'num_leaves': 21, 
    'feature_fraction': 0.5, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 5}

In [37]:
lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)

KeyboardInterrupt: 

In [30]:


## get nround value which hd lowest error
nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))



In [31]:
## train the model
model = lgb.train(params, d_train, num_boost_round=nround)

In [32]:
## make predictions
preds = model.predict(test_feats1)

In [33]:
# make submission

def to_labels(x):
    if x > 0.66:  # cutoff - you can change it and see if accuracy improves or plot AUC curve. 
        return "happy"
    return "not_happy"

sub3 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})
sub3['Is_Response'] = sub3['Is_Response'].map(lambda x: to_labels(x))
sub3 = sub3[['User_ID','Is_Response']]
sub3.to_csv('sub31_lgb.csv', index=False)

In [34]:
# set data format
d_train = lgb.Dataset(train_feats2, label = target)

In [35]:
# same parameters as above
params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.05, 
    'max_depth': 5, 
    'num_leaves': 11,
    'feature_fraction': 0.3, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 5}

In [36]:
## do cross validation to find nround i.e. at this round (iteration) we can expect lowest error
lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)

[20]	cv_agg's binary_error: 0.214656 + 0.00431238
[40]	cv_agg's binary_error: 0.197652 + 0.00501838
[60]	cv_agg's binary_error: 0.181701 + 0.00499341
[80]	cv_agg's binary_error: 0.171427 + 0.00448226
[100]	cv_agg's binary_error: 0.162386 + 0.00604776
[120]	cv_agg's binary_error: 0.157325 + 0.00549851
[140]	cv_agg's binary_error: 0.153832 + 0.00510883
[160]	cv_agg's binary_error: 0.149774 + 0.00477697
[180]	cv_agg's binary_error: 0.147205 + 0.00480522
[200]	cv_agg's binary_error: 0.144328 + 0.00484425
[220]	cv_agg's binary_error: 0.142505 + 0.00518644
[240]	cv_agg's binary_error: 0.140655 + 0.00527885
[260]	cv_agg's binary_error: 0.139525 + 0.00515435
[280]	cv_agg's binary_error: 0.137907 + 0.0054511
[300]	cv_agg's binary_error: 0.13711 + 0.00551162
[320]	cv_agg's binary_error: 0.135852 + 0.00588731
[340]	cv_agg's binary_error: 0.135056 + 0.0058444
[360]	cv_agg's binary_error: 0.13372 + 0.00536832
[380]	cv_agg's binary_error: 0.13318 + 0.00556489
[400]	cv_agg's binary_error: 0.132461 + 

In [37]:
# get nround value
nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))

In [38]:

# train model
model = lgb.train(params, d_train, num_boost_round=nround)

In [39]:
# make prediction
preds = model.predict(test_feats2)

In [40]:
# make submission

def to_labels(x):
    if x > 0.66:
        return "happy"
    return "not_happy"

sub4 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})
sub4['Is_Response'] = sub4['Is_Response'].map(lambda x: to_labels(x))
sub4 = sub4[['User_ID','Is_Response']]
sub4.to_csv('sub4_lgb.csv', index=False)

In [41]:
## import library
from catboost import CatBoostClassifier,cv, Pool

In [42]:

## catboost accepts categorical columns as a list of column numbers. In this data, all columns are categorical
cat_cols = [x for x in range(502)] ## 502 == train_feats1.shape[1]

In [43]:
## set parameters
## you can refer the parameters here: https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list
param = {
    'use_best_model':True,
    'loss_function':'CrossEntropy',
    'eval_metric':'Accuracy',
    'iterations':1000,
    'depth':6,
    'learning_rate':0.03,
    'rsm':0.3,
    'random_seed':2017,
    
    
}

In [44]:
## set parameters
## you can refer the parameters here: https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list
param = {
    'use_best_model':True,
    'loss_function':'CrossEntropy',
    'eval_metric':'Accuracy',
    'iterations':1000,
    'depth':6,
    'learning_rate':0.03,
    'rsm':0.3,
    'random_seed':2017,
    
    
}

In [45]:
## for doing cross validation, set data in Pool format
my_dt =  Pool(train_feats1, 
           label=target,
           cat_features=cat_cols,
           column_description=None,
           delimiter='\t',
           has_header=None,
           weight=None, 
           baseline=None,
           feature_names=None,
           thread_count=1)

In [None]:
## run cv to get best iteration
ctb_cv = cv(param, my_dt, fold_count=5)

In [None]:
# fetch best round
best_round = ctb_cv['b\'Accuracy\'_test_avg'].index(np.max(ctb_cv['b\'Accuracy\'_test_avg']))

In [None]:
## define the classifer model
model = CatBoostClassifier(iterations=best_round, learning_rate=0.03,rsm = 0.3 ,depth=6, eval_metric='Accuracy', random_seed=2017)

In [None]:

## train model
model.fit(my_dt)

In [None]:
## make predictions
preds = model.predict(test_feats1)

In [None]:
## make submission
sub5 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})
sub5['Is_Response'] = ['happy' if x == 1 else 'not_happy' for x in sub5['Is_Response']]
sub5 = sub5[['User_ID','Is_Response']]
sub5.to_csv('submissions/sub5_cb.csv', index=False)