***Import***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.optimizers import RMSprop

***Data Preprocessing***

In [None]:
# Read the raw dataset (comments and their MOS labels) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/Dataset.csv')

In [None]:
# Model input is the free-text 'Comment' column; the label is the 'MOS' column.
train, target = data['Comment'], data['MOS']

We need to one-hot (categorically) encode the labels.

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the MOS labels for use with a categorical cross-entropy loss.
# The encoding is derived from the raw `data['MOS']` column rather than from
# `target` itself, so re-running this cell always yields the same result
# instead of one-hot encoding an already encoded array.
# NOTE(review): the number of classes is inferred from the labels present in
# the data; it must match the width of the model's output layer - confirm.
target = np.array(to_categorical(data['MOS'])).astype('float32')

Preparing our Data 

We will turn our text into lists of integer indices

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

samples = train

# Tokenizer whose vocabulary is capped by frequency (num_words=1000).
tokenizer = Tokenizer(num_words=1000)

# Learn the vocabulary from the comments, then keep the word -> index mapping.
tokenizer.fit_on_texts(samples)
word_index = tokenizer.word_index

# Each comment becomes a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# Binary bag-of-words (one-hot) representation of each comment.
one_hot_results = tokenizer.texts_to_matrix(samples, mode="binary")

Data Splitting

Training Data: 130

Testing Data: 21

In [None]:
# Positional hold-out split: the first 130 samples train the model,
# everything after them is kept for testing.
x_train, x_test = sequences[:130], sequences[130:]
y_train, y_test = target[:130], target[130:]

In [None]:
# max_features: number of rows in the embedding matrix / Embedding layer.
# max_len: length every sequence is padded or truncated to.
# NOTE(review): the Tokenizer in this notebook is created with num_words=1000,
# which is smaller than max_features - confirm the mismatch is intended.
max_features, max_len = 10000, 500

We will **pad** our data so that all sequences have the same length, which **turns them into an integer tensor of shape (samples, word_indices)**. The first layer of the network must then be able to handle such integer tensors (an **Embedding layer** in our case).

In [None]:
# Pad (or truncate) every sequence in both splits to exactly max_len entries.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=max_len)
    for split in (x_train, x_test)
)

Loading & Preparing GloVe for word representation

In [None]:
import os 

# Parse the GloVe word-embeddings file into a {word: vector} lookup.
# Each line holds a token followed by its vector components.

embeddings_index = {}

# `with` guarantees the file is closed even if parsing a line raises.
# The encoding is given explicitly because the GloVe file is UTF-8 and the
# default text encoding is platform-dependent.
with open('/content/drive/MyDrive/glove.6B.100d.txt', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Prepare the GloVe word-embeddings matrix: row i holds the vector of the
# word whose tokenizer index is i.
embedding_dim = 100

embedding_matrix = np.zeros((max_features, embedding_dim))

for word, i in word_index.items():
  # Indices at or beyond max_features have no row in the matrix.
  if i < max_features:
    embedding_vector = embeddings_index.get(word)
    # Words missing from GloVe keep their all-zero row.
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


***Building the Model***

In [None]:
# Import from tensorflow.keras, the same package the optimizer and the
# preprocessing utilities come from; mixing it with the standalone `keras`
# package can produce incompatible layer/optimizer objects.
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

model = Sequential()

# Integer word indices -> dense vectors. The output width is embedding_dim so
# the layer matches the (max_features, embedding_dim) GloVe matrix that is
# loaded into it later.
model.add(layers.Embedding(max_features, embedding_dim, input_length=max_len))

# Convolutional front end: extract local patterns and shorten the sequence
# before it reaches the recurrent layer.
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Dropout(0.2))
model.add(layers.Conv1D(32, 7, activation='relu'))

# The GRU reads the downsampled feature sequence and emits one vector per sample.
model.add(layers.GRU(32,
                     dropout=0.2,
                     recurrent_dropout=0.2))

# Softmax over 6 output classes.
# NOTE(review): 6 must equal the width of the one-hot encoded target - confirm.
model.add(layers.Dense(6, activation="softmax"))



Our model is ready. Let's load the pretrained GloVe weights into the Embedding layer and freeze it to avoid losing the learned information during training.

In [None]:
# The Embedding layer is the first layer of the model.
embedding_layer = model.layers[0]

# Load the GloVe matrix into it, then exclude it from training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

***Training***

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

# RMSprop with a small learning rate; the loss matches the one-hot labels.
optimizer = RMSprop(learning_rate=1e-4)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

# Train on the training split only; `history` keeps the per-epoch metrics.
history = model.fit(x=x_train,
                    y=y_train,
                    batch_size=16,
                    epochs=20)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 100)          1000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 494, 32)           22432     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 98, 32)            0         
_________________________________________________________________
dropout (Dropout)            (None, 98, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 92, 32)            7200      
_________________________________________________________________
gru (GRU)                    (None, 32)                6336      
_________________________________________________________________
dense (Dense)                (None, 6)                 198

In [None]:
# Score the trained model on the held-out test split.
# The model is compiled with metrics=['acc'], so this is expected to hold
# the loss followed by the accuracy.
resultat = model.evaluate(x=x_test, y=y_test)



***We reached ~62% accuracy on the testing data***