In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List the files available in the input directory.
print(os.listdir("../input"))
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, so
# any deprecation warnings emitted by later cells are hidden.
warnings.filterwarnings('ignore')

# Any results you write to the current directory are saved as output.

['train.csv', 'test.csv', 'sample_submission.csv']


In [2]:
# Load the two CSVs from the input directory. train_df supplies both the
# features and the 'Star Rating' target used below; test_df is not used in the
# cells visible here.
train_df=pd.read_csv(r'../input/train.csv')
test_df=pd.read_csv(r'../input/test.csv')

In [3]:
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [4]:
# Negative-sentiment cue words (romanised Hindi, Hindi script and English).
# tokenizer() replaces every occurrence of each entry, in list order, with the
# single token 'worst'. Order matters for overlapping entries: the longer form
# must precede its prefix ('looted' before 'loot', 'crashed' before 'crash'),
# otherwise the leftover suffix would survive the replacement.
# The duplicated 'sucks' entry has been removed; a second pass over the same
# word was a no-op.
HINDI_NEGATIVE_WORDS = [
    'bakwas', 'ghatiya', 'bakar', 'bekar', 'sucks', 'fraud', 'madharchod',
    'bkwaas', 'bkwas', 'gandi', 'ganda', 'faltu', 'shit', 'feku', 'fake', 'nakli',
    'dumbass', 'steal', 'slow', 'garbage', 'very bad', 'fharji', 'farji',
    'looted', 'loot', 'crashed', 'crash', 'बकवास',
]

In [5]:
def tokenizer(text):
    """Normalise one review into a space-separated string of lemmatised tokens.

    Steps: lower-case the text, replace every occurrence of each entry of
    HINDI_NEGATIVE_WORDS with 'worst', split with NLTK's Treebank tokenizer,
    lemmatise each token with WordNet and join the tokens with single spaces.

    The replacement is a plain substring replacement, not a whole-word match,
    so inflected forms keep their suffix (e.g. 'crashing' becomes 'worsting').

    Parameters
    ----------
    text : str
        Raw review text. Callers replace missing values with '' beforehand.

    Returns
    -------
    str
        The cleaned text, or '' when `text` is the empty string.
    """
    if text == '':
        return ''

    text = text.lower()
    for word in HINDI_NEGATIVE_WORDS:
        text = text.replace(word, 'worst')

    # Build the lemmatizer once per call instead of once per token, and
    # lemmatise the token list directly rather than joining and re-splitting it.
    lemmatize = WordNetLemmatizer().lemmatize
    tokens = TreebankWordTokenizer().tokenize(text)
    return ' '.join(lemmatize(token) for token in tokens)

In [6]:
# Hold out 20% of the labelled reviews for validation; the fixed random_state
# keeps the split reproducible between runs.
feature_columns = ['Review Text', 'App Version Code']
X_train_df, X_val_df, Y_train, Y_val = train_test_split(
    train_df[feature_columns],
    train_df['Star Rating'],
    test_size=0.2,
    random_state=0,
)

In [7]:
def _clean_review_text(df):
    """Fill missing 'Review Text' values with '' and run tokenizer over the column.

    The column is overwritten on `df` itself and the same object is returned.
    """
    df['Review Text'] = df['Review Text'].fillna('').apply(tokenizer)
    return df


def train_feature_extractor(X_train):
    """Clean the 'Review Text' column of the training frame.

    The frame is modified in place and returned; later cells read the cleaned
    text back from the original frame, so this must not operate on a copy.
    """
    return _clean_review_text(X_train)


def test_feature_extractor(X_val):
    """Clean the 'Review Text' column of the validation/test frame.

    Same processing as train_feature_extractor: the frame is modified in place
    and returned.
    """
    return _clean_review_text(X_val)

In [8]:
# Clean the review text of both splits. The extractors modify the frame they are
# given and return it, so X_train / X_val refer to the same objects as
# X_train_df / X_val_df.
X_train=train_feature_extractor(X_train_df)
X_val=test_feature_extractor(X_val_df)

In [9]:
# Preview the cleaned training reviews.
X_train.head()

Unnamed: 0,Review Text,App Version Code
4854,it 's awesome,78.0
2590,it 's awesome,75.0
2466,hey niki i want my cash back not credited to m...,62.0
1875,dont install app and waste your time the app i...,
5628,use refral coad [ mahaveer4 ] to get r 20 join...,37.0


In [10]:
from keras.preprocessing.text import Tokenizer
# Build the Keras word index from the cleaned training reviews only; the
# validation reviews are not seen when the index is fitted.
token=Tokenizer()
token.fit_on_texts(X_train['Review Text'])

Using TensorFlow backend.


In [11]:
# Convert each cleaned review into a sequence using the fitted tokenizer.
# NOTE: X_train / X_val are rebound here from DataFrames to the tokenizer's
# output; the DataFrames stay reachable as X_train_df / X_val_df.
X_train=token.texts_to_sequences(X_train['Review Text'])
X_val=token.texts_to_sequences(X_val['Review Text'])

In [12]:
def mlen(row):
    """Return the number of single-space-separated pieces in row['Review Text'].

    An empty string counts as one piece, and consecutive spaces produce empty
    pieces that are counted too (plain str.split(' ') semantics).
    """
    return len(row['Review Text'].split(' '))

# Length of every cleaned training review, as counted by mlen, stored in a new
# 'len' column.
X_train_df['len']=X_train_df.apply(mlen,axis=1)
X_train_df.head()

Unnamed: 0,Review Text,App Version Code,len
4854,it 's awesome,78.0,3
2590,it 's awesome,75.0,3
2466,hey niki i want my cash back not credited to m...,62.0,17
1875,dont install app and waste your time the app i...,,27
5628,use refral coad [ mahaveer4 ] to get r 20 join...,37.0,18


In [13]:
# Longest cleaned training review, as counted by mlen.
X_train_df['len'].max()

395

In [14]:
# Same length statistic for the validation split.
X_val_df['len'] = X_val_df.apply(mlen, axis=1)
X_val_df['len'].max()

174

In [15]:
from keras.preprocessing.sequence import pad_sequences
# Pad both splits to the longest training review. The length is read from the
# 'len' column computed above instead of being hard-coded (it was 395 for this
# data), so the cell keeps working when the data changes.
max_len = int(X_train_df['len'].max())
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
X_val = pad_sequences(X_val, maxlen=max_len, padding='post')

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Shared preprocessing objects: both are fitted inside
# append_version_code_feature_train and only applied (transform) inside
# append_version_code_feature_test.
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
scaler = StandardScaler()
def append_version_code_feature_train(df,text_features):
    """Fit the imputer and scaler on the training version codes and prepend them.

    Parameters
    ----------
    df : DataFrame
        Frame holding an 'App Version Code' column, one row per row of
        `text_features`.
    text_features : 2-D array
        Text features for the same rows.

    Returns
    -------
    ndarray
        `text_features` with the imputed, standardised version code inserted
        as the first column.
    """
    # Series.as_matrix() was removed in pandas 1.0; .values works on both old
    # and new pandas versions.
    version_codes = df['App Version Code'].values.reshape(-1, 1)
    version_codes = imp_mode.fit_transform(version_codes)
    version_codes = scaler.fit_transform(version_codes)
    return np.concatenate((version_codes, text_features), axis=1)
def append_version_code_feature_test(df,text_features):
    """Prepend the version code using the already fitted imputer and scaler.

    Must be called after append_version_code_feature_train, which fits
    `imp_mode` and `scaler`; here they are only applied.

    Parameters
    ----------
    df : DataFrame
        Frame holding an 'App Version Code' column, one row per row of
        `text_features`.
    text_features : 2-D array
        Text features for the same rows.

    Returns
    -------
    ndarray
        `text_features` with the imputed, standardised version code inserted
        as the first column.
    """
    # Series.as_matrix() was removed in pandas 1.0; .values works on both old
    # and new pandas versions.
    version_codes = df['App Version Code'].values.reshape(-1, 1)
    version_codes = imp_mode.transform(version_codes)
    version_codes = scaler.transform(version_codes)
    return np.concatenate((version_codes, text_features), axis=1)


In [17]:
# Prepend the standardised app version code to the padded sequences.
# NOTE(review): column 0 of these matrices is now a scaled float while the
# remaining columns are word indices. The whole matrix is later passed to
# model.fit, whose first layer is an Embedding, so column 0 would be consumed
# as if it were a word index -- confirm this is intended.
X_train=append_version_code_feature_train(X_train_df,X_train)
X_val=append_version_code_feature_test(X_val_df,X_val)

In [18]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the star ratings. The encoder is fitted on the training labels
# and only applied to the validation labels.
# Series.as_matrix() was removed in pandas 1.0; .values works on both old and
# new pandas versions.
ohe = OneHotEncoder()
Y_tr = ohe.fit_transform(Y_train.values.reshape(-1, 1))
Y_v = ohe.transform(Y_val.values.reshape(-1, 1))

In [19]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Bidirectional


In [20]:
# Number of distinct words in the fitted tokenizer's word index.
len(token.word_index)

4632

In [21]:
# Embedding -> bidirectional LSTM -> 5-way softmax.
# The embedding input size is derived from the fitted tokenizer rather than
# hard-coded: one more than the number of entries in the word index (4632 + 1 =
# 4633 for this data), which leaves room for the 0 used as padding.
vocab_size = len(token.word_index) + 1
model = Sequential()
model.add(Embedding(vocab_size, 500))
model.add(Bidirectional(LSTM(1024), merge_mode='concat', weights=None))
model.add(Dense(5, activation='softmax'))

In [22]:
# Print the layer-by-layer shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 500)         2316500   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 2048)              12492800  
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 10245     
Total params: 14,819,545
Trainable params: 14,819,545
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Categorical cross-entropy matches the one-hot encoded targets and the softmax
# output layer.
model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [24]:
# Train for 5 epochs in batches of 64, passing the validation split as
# validation_data.
model.fit(X_train,Y_tr,batch_size=64,epochs=5,validation_data=(X_val,Y_v))

Train on 4554 samples, validate on 1139 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fcfca0e39e8>

In [25]:
from sklearn.metrics import f1_score
def _predicted_ratings(features):
    """Return the highest-scoring class for each row of `features`, plus one."""
    # argmax gives a 0-based column index; +1 presumably maps it back onto the
    # 1-based star ratings in the encoder's column order -- TODO confirm.
    return np.argmax(model.predict(features), axis=1) + 1


print('BI-LSTM')
train_score = f1_score(Y_train, _predicted_ratings(X_train), average='weighted')
print('training weighted fscore: {}'.format(train_score))
# Keep the validation predictions in `s`: the failure-analysis cell reads it.
s = _predicted_ratings(X_val)
print('test weighted fscore: {}'.format(f1_score(Y_val, s, average='weighted')))

BI-LSTM
training weighted fscore: 0.8278954431162214
test weighted fscore: 0.7051888422338611


In [26]:
def failed_examples(predictions):
    """Return the validation rows whose predicted rating differs from the truth.

    Reads the notebook-level `X_val_df` (features) and `Y_val` (true ratings).

    Parameters
    ----------
    predictions : array-like
        Predicted ratings, one per validation row, in the row order of `Y_val`.

    Returns
    -------
    DataFrame
        The misclassified rows of `X_val_df` with two extra columns,
        'Star Rating' (true) and 'Predicted Star Rating', re-indexed from 0.

    Raises
    ------
    ValueError
        If `predictions` does not hold exactly one value per validation row.
    """
    predictions = np.asarray(predictions)
    actual = Y_val.reset_index(drop=True)
    if predictions.shape != (len(actual),):
        raise ValueError(
            'expected {} predictions, got an array of shape {}'.format(
                len(actual), predictions.shape))

    # Boolean mask over positions; replaces np.equal(..., dtype=int) + np.where.
    wrong = actual.values != predictions

    # .copy() so the new columns are added to an independent frame rather than
    # to a filtered slice of the re-indexed validation frame.
    fail_df = X_val_df.reset_index(drop=True)[wrong].copy()
    fail_df['Star Rating'] = actual[wrong]
    fail_df['Predicted Star Rating'] = predictions[wrong]
    return fail_df.reset_index(drop=True)

In [27]:
# Show the worst misses: reviews rated 1-2 stars that the model scored 4-5.
df = failed_examples(s)
badly_overrated = (df['Star Rating'] <= 2) & (df['Predicted Star Rating'] >= 4)
print(df.loc[badly_overrated, 'Review Text'].tolist())

['unresponsive app .', 'expected more .', 'hopeless app. the interface is fine but credit or cashback are retained by app itself .', 'not❌', 'need to improve more', 'keep worsting', 'very good app for recharge and bill pay', 'thought of trying out niki to pay bills.. thought it would be fast ... but guess what niki just kept typing whole sunday afternoon ... sometimes it gave me the menu option again and again ... quite disappointing..', 'please add gescom in electricity bill payment', 'roobing customer', 'i want customer support contact number right now', 'hehehe ... ... ... .', "this is a promising app but sadly it just doe n't get the job done most of the time.it 's more fancy than useful .", 'why doe the app restricate the complete use of cashback credit ?', 'amazon payment is declining while booking the bus ticket', 'thnks issue sorted but account not yet credited i will give full star once refund is successful.😊😊', 'human mind is faster than your ai , so it of kinda worst every t