In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')

In [2]:
#Pre-processing steps. 
stop_words = set(stopwords.words('english'))
def pre_process(msg):
    msg = str(msg)
    msg = msg.lower()
    msg = re.sub('[^a-zA-Z]',' ', msg)
    msg = nltk.word_tokenize(msg)
    msg = [wordnet.lemmatize(word) for word in msg if word not in stop_words]
    msg = ' '.join(msg)
    return msg

In [3]:
df = pd.read_excel('Training_1.xlsx')

In [4]:
df.head()

Unnamed: 0,Name,category,date,author,title,rating,sentence,Sentiment (1 for positive and 0 for negative),Quality,Features,"Purchase/interaction experience (delivery/packaging, customer care etc)",Price,Rand
0,Ansh,audio,2019-03-14,Shekhar Kumar,Amazing earphones with value for money,5,Amazing earphones with value for money,1.0,,,,1.0,88.0
1,Ansh,audio,2019-05-29,Pravin velumani,worst quality,2,"in the beginning the sound quantity, bass are ...",0.0,,,1.0,,89.0
2,Ansh,audio,2019-09-11,Soubhagya Biswal,Best buy in the budget,5,Using it for more than a month. Sound quality ...,1.0,1.0,,,,84.0
3,Ansh,audio,2019-09-11,Soubhagya Biswal,Best buy in the budget,5,It has magnetic hold system also. Cable is ver...,1.0,,1.0,,,89.0
4,Ansh,audio,2019-09-11,Soubhagya Biswal,Best buy in the budget,5,3.5mm jack is solid to take heavy strain. All ...,1.0,,1.0,,,87.0


In [5]:
df.columns

Index(['Name', 'category', 'date', 'author', 'title', 'rating', 'sentence',
       'Sentiment (1 for positive and 0 for negative)', 'Quality', 'Features',
       'Purchase/interaction experience (delivery/packaging, customer care etc)',
       'Price', 'Rand'],
      dtype='object')

In [6]:
df_new = df[['title','sentence','Quality','Features','Purchase/interaction experience (delivery/packaging, customer care etc)','Price']]
del df

In [7]:
df_new['Quality'] = pd.to_numeric(df_new['Quality'],errors = 'coerce')

In [8]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11174 entries, 0 to 11173
Data columns (total 6 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   title                                                                    11174 non-null  object 
 1   sentence                                                                 11174 non-null  object 
 2   Quality                                                                  2742 non-null   float64
 3   Features                                                                 3465 non-null   float64
 4   Purchase/interaction experience (delivery/packaging, customer care etc)  2824 non-null   float64
 5   Price                                                                    1425 non-null   float64
dtypes: float64(4), object(2)
memory usage: 523.9+ KB


In [9]:
#fill na values with zero 
df_new.fillna(value=0,inplace=True)

In [10]:
#a function to detect rows that don't have any classification labels
#For that we will take sum of all the classes, if the sum is zero, that row has no classes and can be removed
def na_class(df):
    idx_lst = []
    for i in range(0,len(df)):
        sum = df['Quality'].iloc[i] + df['Features'].iloc[i] + df['Purchase/interaction experience (delivery/packaging, customer care etc)'].iloc[i] + df['Price'].iloc[i]
        if sum == 0 :
            idx_lst.append(i)
    return idx_lst

In [11]:
# call the function
idx_lst = na_class(df_new)

In [12]:
print("Percentage of rows with no class:{}".format(len(idx_lst)/len(df_new)*100))

Percentage of rows with no class:11.365670306067658


In [13]:
#we can drop these rows
df_new.drop(idx_lst, inplace=True)

In [14]:
#function to create tokens 
def create_tokens(doc):
    doc = nlp(doc)
    tokens = [token.text for token in doc]
    tokens = list(dict.fromkeys(tokens))
    tokens = ' '.join(tokens)
    return tokens

In [15]:
# function to pre_process and get tokens for all the text files
def process(df):
    df_new1 = pd.DataFrame()
    df_new1['title'] = df['title'].apply(lambda x: pre_process(x))
    df_new1['sentence'] = df['sentence'].apply(lambda x: pre_process(x))
    text = df_new1['title'] + ' ' + df_new1['sentence']
    document = list()
    for i in range(0,len(text)):
        tokens = create_tokens(text.iloc[i])
        document.append(tokens)
    return document

In [16]:
X_data = process(df_new)

In [17]:
Y_data = df_new[['Quality','Features','Purchase/interaction experience (delivery/packaging, customer care etc)','Price']]

#### Part 2 

In [18]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [19]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [20]:
# calculate the maximum document length
def max_length(lines):
    return max([len(s.split()) for s in lines])

In [21]:
# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

In [22]:
#split dataset into train and validation
from sklearn.model_selection import train_test_split

In [23]:
x_train,x_test,y_train,y_test = train_test_split(X_data,Y_data,test_size = 0.25,random_state = 1)

In [24]:
# create tokenizer
tokenizer = create_tokenizer(x_train)
# calculate max document length
length = max_length(x_train)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)

Max document length: 66
Vocabulary size: 4700


In [25]:
# encode data
trainX = encode_text(tokenizer, x_train, length)
print(trainX.shape)

(7428, 66)


In [26]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open('glove.6B.100d.txt', encoding="utf8")

In [27]:
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [28]:
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [29]:
# define the model
def define_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(4, activation='softmax')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize
    print(model.summary())
    #plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

In [30]:
# define model
model = define_model(length, vocab_size)
# fit model
model.fit([trainX,trainX,trainX], y_train.values, epochs=50, batch_size=10)
# save the model

Instructions for updating:
Colocations handled automatically by placer.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 66)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 66)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 66)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 66, 100)      470000      input_1[0][0]                    
____________________

Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x232214f1bc8>

In [31]:
testX = encode_text(tokenizer, x_test, length)
print(testX.shape)

(2476, 66)


In [32]:
y_preds = model.predict([testX,testX,testX])

In [33]:
y_predf = []
for i in y_preds:
    lst1 = [] 
    for j in i:
        if j > 0.5:
            j =1 
        else:
            j=0
        lst1.append(j)
    y_predf.append(lst1)

In [34]:
from sklearn.metrics import classification_report

In [35]:
print(classification_report(y_test, y_predf))

              precision    recall  f1-score   support

           0       0.78      0.63      0.70       679
           1       0.75      0.71      0.73       889
           2       0.69      0.65      0.67       686
           3       0.77      0.72      0.75       353

   micro avg       0.74      0.67      0.71      2607
   macro avg       0.75      0.68      0.71      2607
weighted avg       0.74      0.67      0.71      2607
 samples avg       0.71      0.69      0.69      2607



  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
model.save('model4.h5')