In [1]:
import pandas as pd
import numpy as np

import os
import xgboost as xgb

from textblob import TextBlob
from sklearn import preprocessing,model_selection,ensemble,metrics



In [2]:
! pip install afinn

Collecting afinn
  Downloading afinn-0.1.tar.gz (52kB)
[K    100% |████████████████████████████████| 61kB 67kB/s ta 0:00:011
[?25hBuilding wheels for collected packages: afinn
  Running setup.py bdist_wheel for afinn ... [?25ldone
[?25h  Stored in directory: /home/kuhung/.cache/pip/wheels/db/72/b9/fc31810e60e4a9031807e5e645001541e4539baf5fff5f17cf
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1


In [10]:
from afinn import Afinn

! pip install vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
pd.__version__

'0.21.0'

In [4]:
np.__version__

'1.12.1'

In [5]:
xgb.__version__

'0.6'

In [6]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [7]:
train['source']="train"
test['source']="test"

train=pd.concat([train,test])

In [8]:
train.head()

Unnamed: 0,Browser_Used,Description,Device_Used,Is_Response,User_ID,source
0,Edge,The room was kind of clean but had a VERY stro...,Mobile,not happy,id10326,train
1,Internet Explorer,I stayed at the Crown Plaza April -- - April -...,Mobile,not happy,id10327,train
2,Mozilla,I booked this hotel through Hotwire at the low...,Tablet,not happy,id10328,train
3,InternetExplorer,Stayed here with husband and sons on the way t...,Desktop,happy,id10329,train
4,Edge,My girlfriends and I stayed here to celebrate ...,Tablet,not happy,id10330,train


In [9]:
train.tail()

Unnamed: 0,Browser_Used,Description,Device_Used,Is_Response,User_ID,source
29399,Chrome,I stayed at the hotel and towers for a confere...,Mobile,,id109531,test
29400,Internet Explorer,Trying to stay within the Marriott family and ...,Tablet,,id109532,test
29401,Edge,"We stayed for - nights with our little dog,ver...",Desktop,,id109533,test
29402,InternetExplorer,Stayed at the Yotel over the weekend and was v...,Desktop,,id109534,test
29403,Mozilla Firefox,The Blakely is is comfortable is every way: th...,Mobile,,id109535,test


In [11]:
train['Is_Response'].unique()

array(['not happy', 'happy', nan], dtype=object)

In [12]:
train['Is_Response'].value_counts()

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [13]:
le = preprocessing.LabelEncoder()

train['Browser_Used_LabelEncoded']=le.fit_transform(train['Browser_Used'])
train['Device_Used_LabelEncoded']=le.fit_transform(train['Device_Used'])

train['len']=train['Description'].apply(lambda x:len(x))
train['target']=train['Is_Response'].apply(lambda x:1 if x=='not happy' else 0)

In [14]:
train.columns

Index(['Browser_Used', 'Description', 'Device_Used', 'Is_Response', 'User_ID',
       'source', 'Browser_Used_LabelEncoded', 'Device_Used_LabelEncoded',
       'len', 'target'],
      dtype='object')

In [15]:
%%time

polarity=[]
subjectivity=[]
for des in train['Description']:
    blob=TextBlob(des)
    polarity.append(blob.sentiment[0])
    subjectivity.append(blob.sentiment[1])

CPU times: user 1min 48s, sys: 40 ms, total: 1min 48s
Wall time: 1min 49s


In [16]:
train['polarity']=polarity
train['subjectivity']=subjectivity

In [17]:
afinn = Afinn()
train['afinn']=train['Description'].apply(lambda x:afinn.score(x))

In [18]:
train.head()

Unnamed: 0,Browser_Used,Description,Device_Used,Is_Response,User_ID,source,Browser_Used_LabelEncoded,Device_Used_LabelEncoded,len,target,polarity,subjectivity,afinn
0,Edge,The room was kind of clean but had a VERY stro...,Mobile,not happy,id10326,train,1,1,248,1,0.34619,0.643228,7.0
1,Internet Explorer,I stayed at the Crown Plaza April -- - April -...,Mobile,not happy,id10327,train,5,1,1077,1,0.042363,0.528986,3.0
2,Mozilla,I booked this hotel through Hotwire at the low...,Tablet,not happy,id10328,train,7,2,1327,1,0.124969,0.527284,4.0
3,InternetExplorer,Stayed here with husband and sons on the way t...,Desktop,happy,id10329,train,6,0,502,0,0.7185,0.739,33.0
4,Edge,My girlfriends and I stayed here to celebrate ...,Tablet,not happy,id10330,train,1,2,1613,1,0.117192,0.533254,13.0


In [21]:
train.columns

Index(['Browser_Used', 'Description', 'Device_Used', 'Is_Response', 'User_ID',
       'source', 'Browser_Used_LabelEncoded', 'Device_Used_LabelEncoded',
       'len', 'target', 'polarity', 'subjectivity', 'afinn'],
      dtype='object')

In [22]:
test=train[train['source']=='test']
train=train[train['source']!='test']

In [29]:
X = train[['afinn','polarity','subjectivity','Browser_Used_LabelEncoded', 'Device_Used_LabelEncoded', 'len']]
y = train['target']

In [30]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, random_state=7)

In [31]:
model = ensemble.GradientBoostingClassifier()
#model = xgb.XGBClassifier()

In [32]:
model.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [33]:
y_pre=model.predict(X_train)

In [34]:
metrics.accuracy_score(y_train,y_pre)

0.84845079467009155

In [None]:
y_pre=model.predict(X_test)

metrics.accuracy_score(y_test,y_pre)

In [26]:
model.fit(X,y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [27]:
feature = test[['polarity','subjectivity','Browser_Used_LabelEncoded', 'Device_Used_LabelEncoded', 'len']]

In [28]:
pre = model.predict(feature)

In [29]:
sub=pd.read_csv('../input/sample_submission.csv')

In [30]:
sub.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


In [31]:
out=pd.DataFrame({'User_ID':test['User_ID'],'pre':pre})

In [32]:
out['Is_Response']=out['pre'].apply(lambda x:'not_happy' if x==1 else 'happy')

In [33]:
out.drop('pre',axis=1).to_csv('../output/import_m.csv',index=False)