In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Shared NLP resources, created once so that later cells can reuse them.
# WordNet lemmatizer: pre_process() calls wordnet.lemmatize() on every kept word.
wordnet = WordNetLemmatizer()
# spaCy English pipeline: create_tokens() uses it to split cleaned text into tokens.
nlp = spacy.load('en_core_web_sm')

In [2]:
#Pre-processing steps.
# English stop words as a set, so the membership test in pre_process() is O(1).
stop_words = set(stopwords.words('english'))
# Compiled once: every character that is not an ASCII letter is replaced by a space.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')

def pre_process(msg):
    """Clean one piece of raw text.

    Steps: stringify, lower-case, replace every non-letter with a space,
    tokenize with NLTK, drop English stop words, lemmatize the remaining
    words with WordNet and join them back with single spaces.

    Parameters
    ----------
    msg : object
        Raw cell value. Anything that is not a string is converted with str().
        Missing values (None, NaN, pd.NA, NaT) are treated as empty text.

    Returns
    -------
    str
        The cleaned, space-separated words ('' when nothing is left).
    """
    # A missing cell would otherwise be stringified to the literal word
    # "none" / "nan", survive the cleaning and pollute the vocabulary.
    if msg is None or (pd.api.types.is_scalar(msg) and pd.isna(msg)):
        return ''
    text = _NON_LETTER_RE.sub(' ', str(msg).lower())
    kept = [wordnet.lemmatize(word) for word in nltk.word_tokenize(text) if word not in stop_words]
    return ' '.join(kept)

In [3]:
# Load the labelled training sheet; the relative path means Training_1.xlsx
# must sit in the notebook's working directory.
df = pd.read_excel('Training_1.xlsx')

In [4]:
# Keep only the two text columns and the four label columns.
# .copy() makes df_new an independent frame: later cells assign a column into it
# and mutate it, which pandas may otherwise flag (SettingWithCopyWarning) because
# the selection was derived from df.
df_new = df[['title','sentence','Quality','Features','Purchase/interaction experience (delivery/packaging, customer care etc)','Price']].copy()
# The full sheet is no longer needed; release it.
del df

In [5]:
# Force 'Quality' to a numeric dtype; entries that cannot be parsed become NaN
# (errors='coerce') and are zero-filled by the fillna step that follows.
df_new['Quality'] = pd.to_numeric(df_new['Quality'],errors = 'coerce')

In [6]:
#fill na values with zero
# Note: this fills every column, so a missing title/sentence becomes 0 as well;
# pre_process() stringifies its input and strips non-letters, so such a cell ends
# up as empty text. Reassigning (instead of inplace=True) keeps the cell a plain
# "new value from old value" step.
df_new = df_new.fillna(value=0)

In [7]:
#a function to detect rows that don't have any classification labels
#For that we take the sum of all the label columns; if the sum is zero, that row has no labels and can be removed
def na_class(df, label_columns=None):
    """Return the positions of the rows that carry no classification label.

    A row counts as unlabelled when the sum of its label columns equals zero.
    Rows whose sum is NaN are not reported (NaN == 0 is False).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label columns.
    label_columns : sequence of str, optional
        Columns to add up. Defaults to the four label columns of the training
        sheet ('Quality', 'Features', 'Purchase/interaction experience ...',
        'Price').

    Returns
    -------
    list of int
        Zero-based row POSITIONS (not index labels) of the unlabelled rows,
        in ascending order.

    Raises
    ------
    ValueError
        If ``label_columns`` is given but empty.
    """
    if label_columns is None:
        label_columns = (
            'Quality',
            'Features',
            'Purchase/interaction experience (delivery/packaging, customer care etc)',
            'Price',
        )
    label_columns = list(label_columns)
    if not label_columns:
        raise ValueError('label_columns must name at least one column')

    # Column-wise addition does in one vectorised pass what a per-row .iloc
    # loop would do; columns are added left to right.
    total = df[label_columns[0]]
    for column in label_columns[1:]:
        total = total + df[column]

    return [position for position, unlabelled in enumerate(total == 0) if unlabelled]

In [8]:
# call the function
# idx_lst: row positions in df_new whose four label columns add up to zero
idx_lst = na_class(df_new)

In [9]:
#we can drop these rows
# NOTE: idx_lst holds row positions, whereas drop() matches index labels. The two
# coincide only while df_new still has the default 0..n-1 index (as it does when it
# comes straight from read_excel followed by a column selection). Matching on labels
# also means an accidental re-run of this cell fails with KeyError instead of
# silently removing different rows.
df_new = df_new.drop(index=idx_lst)

In [10]:
def process(df):
    """Clean the 'title' and 'sentence' columns and glue them together.

    Each column is passed through pre_process(); the result is one Series
    holding "<clean title> <clean sentence>" per row, on df's index.

    Note: a later cell defines process() again (the variant that also
    tokenizes), so that later definition is the one in effect afterwards.
    """
    clean_title = df['title'].apply(pre_process)
    clean_sentence = df['sentence'].apply(pre_process)
    return clean_title + ' ' + clean_sentence

In [11]:
#function to create tokens
def create_tokens(doc):
    """Tokenize a text with spaCy and return its distinct tokens as one string.

    Parameters
    ----------
    doc : str
        Text to tokenize (here: the output of pre_process()).

    Returns
    -------
    str
        The token texts joined by single spaces, each distinct token kept once,
        in order of first appearance.
    """
    # Only token.text is read, so run the tokenizer alone (make_doc) instead of
    # nlp(doc), which would also run every remaining pipeline component on each
    # document. This assumes no component in `nlp` merges or splits tokens.
    tokens = [token.text for token in nlp.make_doc(doc)]
    # dict.fromkeys removes repeats while preserving first-seen order.
    return ' '.join(dict.fromkeys(tokens))

In [12]:
# Clean the text columns of a frame and turn every row into a token string
def process(df):
    """Return one token string per row of ``df``.

    'title' and 'sentence' are each cleaned with pre_process(), joined as
    "<title> <sentence>", and the joined text is passed to create_tokens().
    The result is a plain list in row order.
    """
    clean_title = df['title'].apply(pre_process)
    clean_sentence = df['sentence'].apply(pre_process)
    combined = clean_title + ' ' + clean_sentence
    return [create_tokens(row_text) for row_text in combined]

In [13]:
# Model inputs: one cleaned, tokenized string per remaining row of df_new.
X_data = process(df_new)

In [14]:
# Model targets: the four label columns, kept in the same row order as X_data.
Y_data = df_new[['Quality','Features','Purchase/interaction experience (delivery/packaging, customer care etc)','Price']]

#### Part 2: GloVe embeddings + LSTM multi-label classifier

In [15]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [16]:
#split dataset into train and validation
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows as a test set; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.20, random_state=42)

In [18]:
# Every sequence is padded (at the end) to this many word indices.
maxlen = 200

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# One row per known word plus one extra row, since word_index starts at 1.
vocab_size = len(tokenizer.word_index) + 1

# Texts -> integer sequences -> fixed-length arrays, for both splits.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), padding='post', maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), padding='post', maxlen=maxlen)

In [19]:
from numpy import array
from numpy import asarray
from numpy import zeros

# word -> embedding vector; filled by the loop that reads glove_file.
embeddings_dictionary = {}

# Handle on the pre-trained GloVe text file. It is deliberately left open here:
# the loading loop that follows iterates over it and closes it.
glove_file = open('glove.6B.100d.txt', encoding="utf8")

In [20]:
# Parse every line of the GloVe file ("word v1 v2 ... vN") into embeddings_dictionary.
# try/finally guarantees the handle opened earlier is closed even when a malformed
# line makes the parsing raise.
try:
    for line in glove_file:
        records = line.split()
        if not records:
            # Tolerate blank lines instead of failing with IndexError on records[0].
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
finally:
    glove_file.close()

In [21]:
# Row i holds the GloVe vector of the word whose tokenizer index is i;
# words without a GloVe entry keep an all-zero row.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    if embeddings_dictionary.get(word) is None:
        continue
    embedding_matrix[index] = embeddings_dictionary[word]

In [22]:
#define model
# Input: one padded sequence of `maxlen` word indices per sample.
deep_inputs = Input(shape=(maxlen,))
# Embedding rows are initialised from embedding_matrix (the GloVe vectors) and
# frozen with trainable=False.
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
# A single LSTM layer with 128 units reads the embedded sequence.
LSTM_Layer_1 = LSTM(128)(embedding_layer)
# Four sigmoid outputs: one independent score per label column of Y_data.
dense_layer_1 = Dense(4, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

# binary_crossentropy scores each of the four sigmoid outputs as its own yes/no
# decision (multi-label setup); 'acc' is reported as the training metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

Instructions for updating:
Colocations handled automatically by placer.


In [23]:
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 100)          481500    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 516       
Total params: 599,264
Trainable params: 117,764
Non-trainable params: 481,500
_________________________________________________________________
None


In [24]:
# Train for 20 epochs in mini-batches of 10. validation_split=0.2 holds back part of
# X_train/y_train for per-epoch validation; X_test/y_test are not used in this call.
# The returned History object is kept in `history`.
history = model.fit(X_train, y_train, batch_size=10, epochs=20, verbose=1, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Train on 6338 samples, validate on 1585 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
