In [None]:
import pickle

import numpy as np
import tensorflow as tf
from keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Bidirectional, GRU, concatenate, Conv1D
from keras.models import Model
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split

import data
import preprocess_data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Build the project-local preprocessing objects and obtain the word2vec model.
# NOTE(review): Preprocess / VectorRepresentationModels live in preprocess_data;
# their behaviour is not visible from this notebook.
preprocess = preprocess_data.Preprocess(data.Data())
vrm = preprocess_data.VectorRepresentationModels(preprocess)
# min_word_count=1 presumably keeps every token in the vocabulary — TODO confirm.
w2v_model, w2v_Dict = vrm.word2vec(min_word_count=1)
# Targets used by every train/test split below.
y = preprocess.y




In [None]:
# Load the pickled `clean_sents` object (iterated sentence-by-sentence below).
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(security): pickle.load can execute arbitrary code — only load a file
# that was produced by this project.
with open('clean_sents.pkl', 'rb') as file:
  clean_sents = pickle.load(file)

In [None]:
# Map every token of every cleaned sentence to its integer index in the
# word2vec vocabulary; one list of indices per sentence.
S = [
  [w2v_model.wv.vocab[token].index for token in sentence]
  for sentence in clean_sents
]

In [None]:
# Zero-pad (at the end) every index sequence to the length of the longest one.
S = tf.keras.utils.pad_sequences(S, padding="post")

# 75/25 train/test split, stratified on the target.
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    S, y, test_size=0.25, random_state=41, stratify=y)

# Carve a validation set (20% of the training part) out of the training data.
# random_state is fixed so the split — and therefore the reported scores —
# is reproducible across kernel restarts.
X_train_w2v, X_val_w2v, y_train_w2v, y_val_w2v = train_test_split(
    X_train_w2v, y_train_w2v, test_size=0.2, random_state=41)

In [None]:
# Embedding matrix taken directly from the word2vec model; it is passed as the
# fixed weights of the Embedding layers below.
# NOTE(review): sequences are zero-padded and the Embedding layers use
# mask_zero=True, while the indices come straight from wv.vocab — so row 0
# presumably belongs to a real word that will be treated as padding. Confirm.
embed_mat = w2v_model.wv.vectors

In [None]:
# --- CNN + stacked bidirectional GRU regressor -------------------------------
file_path = "gru_model.hdf5"

# Keep only the weights with the lowest validation MAE; stop after 5 epochs
# without improvement.
check_point = ModelCheckpoint(file_path, monitor = "val_mae", verbose = 1,
                              save_best_only = True, mode = "min")
early_stop = EarlyStopping(monitor = "val_mae", mode = "min", patience = 5)

# Derive the dimensions from the data instead of hard-coding 524 / 300, so the
# model stays consistent with the padded sequences and the embedding matrix.
seq_len = X_train_w2v.shape[1]
vocab_size, embed_dim = embed_mat.shape

inp = Input(shape = (seq_len,),)
# Frozen pretrained embeddings.
embd = Embedding(vocab_size, embed_dim, weights = [embed_mat], trainable = False, mask_zero=True)(inp)
conv = Conv1D(filters=32, kernel_size=4, activation='relu')(embd)
pool1 = layers.AveragePooling1D(pool_size=2,name='pool_1')(conv)
bi_gr1 = Bidirectional(GRU(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(pool1)
bi_gr2 = Bidirectional(GRU(32, recurrent_dropout=0.4))(bi_gr1)
drp = Dropout(0.5)(bi_gr2)
# Single non-negative output (ReLU) trained as a regression target.
out = Dense(1, activation='relu')(drp)
model = Model(inputs=inp, outputs=out)
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 524)]             0         
                                                                 
 embedding_6 (Embedding)     (None, 524, 300)          7134000   
                                                                 
 conv1d_6 (Conv1D)           (None, 521, 32)           38432     
                                                                 
 pool_1 (AveragePooling1D)   (None, 260, 32)           0         
                                                                 
 bidirectional_12 (Bidirecti  (None, 260, 128)         37632     
 onal)                                                           
                                                                 
 bidirectional_13 (Bidirecti  (None, 64)               31104     
 onal)                                                     

In [None]:
# --- CNN + stacked bidirectional LSTM regressor ------------------------------
file_path_lstm = "lstm_model.hdf5"

# Checkpoint to the LSTM file. The previous version passed `file_path` (the GRU
# checkpoint), so "lstm_model.hdf5" was never written on a clean top-to-bottom
# run and the later load_model(file_path_lstm) could not succeed.
check_point = ModelCheckpoint(file_path_lstm, monitor = "val_mae", verbose = 1,
                              save_best_only = True, mode = "min")
early_stop = EarlyStopping(monitor = "val_mae", mode = "min", patience = 5)

# Derive the dimensions from the data instead of hard-coding 524 / 300.
seq_len = X_train_w2v.shape[1]
vocab_size, embed_dim = embed_mat.shape

inp = Input(shape = (seq_len,),)
# Frozen pretrained embeddings.
embd = Embedding(vocab_size, embed_dim, weights = [embed_mat], trainable = False, mask_zero=True)(inp)
conv = Conv1D(filters=32, kernel_size=4, activation='relu')(embd)
pool1 = layers.AveragePooling1D(pool_size=2,name='pool_1')(conv)
bi_lstm1 = Bidirectional(LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(pool1)
bi_lstm2 = Bidirectional(LSTM(32, recurrent_dropout=0.4))(bi_lstm1)
drp = Dropout(0.5)(bi_lstm2)
# Single non-negative output (ReLU) trained as a regression target.
out = Dense(1, activation='relu')(drp)
model_lstm = Model(inputs=inp, outputs=out)
model_lstm.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
model_lstm.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 524)]             0         
                                                                 
 embedding_9 (Embedding)     (None, 524, 300)          7134000   
                                                                 
 conv1d_9 (Conv1D)           (None, 521, 32)           38432     
                                                                 
 pool_1 (AveragePooling1D)   (None, 260, 32)           0         
                                                                 
 bidirectional_18 (Bidirecti  (None, 260, 128)         49664     
 onal)                                                           
                                                                 
 bidirectional_19 (Bidirecti  (None, 64)               41216     
 onal)                                                     

In [None]:
# Train the LSTM model with callbacks created for this run only:
#  * the checkpoint must write to the LSTM file, not whatever path the shared
#    `check_point` variable currently holds;
#  * ModelCheckpoint keeps its best-so-far value between fit() calls, so
#    reusing one instance across models would suppress legitimate saves.
check_point_lstm = ModelCheckpoint(file_path_lstm, monitor="val_mae", verbose=1,
                                   save_best_only=True, mode="min")
early_stop_lstm = EarlyStopping(monitor="val_mae", mode="min", patience=5)

model_lstm.fit(X_train_w2v, y_train_w2v, batch_size=64, epochs=40,
               validation_data=(X_val_w2v, y_val_w2v),
               callbacks=[check_point_lstm, early_stop_lstm])


Epoch 1/40
Epoch 1: val_mae improved from inf to 0.61031, saving model to lstm_model.hdf5
Epoch 2/40
Epoch 2: val_mae improved from 0.61031 to 0.56280, saving model to lstm_model.hdf5
Epoch 3/40
Epoch 3: val_mae improved from 0.56280 to 0.54671, saving model to lstm_model.hdf5
Epoch 4/40
Epoch 4: val_mae improved from 0.54671 to 0.53312, saving model to lstm_model.hdf5
Epoch 5/40
Epoch 5: val_mae did not improve from 0.53312
Epoch 6/40
Epoch 6: val_mae did not improve from 0.53312
Epoch 7/40
Epoch 7: val_mae improved from 0.53312 to 0.52806, saving model to lstm_model.hdf5
Epoch 8/40
Epoch 8: val_mae did not improve from 0.52806
Epoch 9/40
Epoch 9: val_mae did not improve from 0.52806
Epoch 10/40
Epoch 10: val_mae did not improve from 0.52806
Epoch 11/40
Epoch 11: val_mae did not improve from 0.52806
Epoch 12/40
Epoch 12: val_mae did not improve from 0.52806


<keras.callbacks.History at 0x7f8b4bbd30d0>

In [None]:
# Train the GRU model with callbacks created for this run only:
#  * the checkpoint must write to the GRU file (`file_path`), independent of
#    what the shared `check_point` variable was last rebound to;
#  * ModelCheckpoint keeps its best-so-far value between fit() calls, so
#    reusing the instance already used for another model would suppress saves.
check_point_gru = ModelCheckpoint(file_path, monitor="val_mae", verbose=1,
                                  save_best_only=True, mode="min")
early_stop_gru = EarlyStopping(monitor="val_mae", mode="min", patience=5)

model.fit(X_train_w2v, y_train_w2v, batch_size=64, epochs=40,
          validation_data=(X_val_w2v, y_val_w2v),
          callbacks=[check_point_gru, early_stop_gru])


Epoch 1/40
Epoch 1: val_mae improved from inf to 0.72224, saving model to our_test_model.hdf5
Epoch 2/40
Epoch 2: val_mae did not improve from 0.72224
Epoch 3/40
Epoch 3: val_mae improved from 0.72224 to 0.54775, saving model to our_test_model.hdf5
Epoch 4/40
Epoch 4: val_mae did not improve from 0.54775
Epoch 5/40
Epoch 5: val_mae improved from 0.54775 to 0.54391, saving model to our_test_model.hdf5
Epoch 6/40
Epoch 6: val_mae did not improve from 0.54391
Epoch 7/40
Epoch 7: val_mae did not improve from 0.54391
Epoch 8/40
Epoch 8: val_mae did not improve from 0.54391
Epoch 9/40
Epoch 9: val_mae did not improve from 0.54391
Epoch 10/40
Epoch 10: val_mae did not improve from 0.54391


<keras.callbacks.History at 0x7f8be1051520>

In [None]:
from keras.models import load_model


In [None]:
# Rebind `model` to whatever was last checkpointed at `file_path`
# (the best-val_mae weights saved by ModelCheckpoint), replacing the in-memory model.
model = load_model(file_path)

In [None]:
# Reload the checkpointed LSTM model and score it on the held-out test split.
model_lstm = load_model(file_path_lstm)
p = model_lstm.predict(X_test_w2v)
# Round the continuous predictions so they can be compared as discrete grades.
y_pred = np.around(p)
# Quadratic-weighted Cohen's kappa between true and rounded predicted values.
result_aug = cohen_kappa_score(y_test_w2v,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result_aug))

Kappa Score: 0.7076978468604643


In [None]:
# GRU predictions on the held-out test split (scored in the next cell).
p = model.predict(X_test_w2v)



In [None]:
# Score the GRU predictions `p` from the previous cell.
# numpy and cohen_kappa_score are imported in the notebook's first cell, so
# this cell no longer carries its own imports (or dead commented-out code).
y_pred = np.around(p)  # round continuous outputs to discrete grades

# Quadratic-weighted Cohen's kappa between true and rounded predicted values.
result_aug = cohen_kappa_score(y_test_w2v,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result_aug))

Kappa Score: 0.6801462912265145


#### Generate new data (gen_new_data)

In [None]:
# For each vocabulary word, the top hit of most_similar(word) is used as its
# "synonym".
w2v_syn = {
  vocab_word: w2v_model.wv.most_similar(vocab_word)[0][0]
  for vocab_word in w2v_Dict.keys()
}

# For each vocabulary word, the top hit when the word is given as a *negative*
# example is used as its "antonym".
w2v_ant = {
  vocab_word: w2v_model.wv.most_similar(negative=[vocab_word])[0][0]
  for vocab_word in w2v_Dict.keys()
}

In [None]:
from generation import GEN_SAMPLES

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Generate new samples using the w2v model and the synonym/antonym maps.
# NOTE(review): GEN_SAMPLES is project-local (generation.py); what the two
# returned values contain is not visible here.
gen_samples_w2v = GEN_SAMPLES(vrm, w2v_model, w2v_Dict, w2v_syn, w2v_ant)
new_samples_w2v, new_grad_w2v = gen_samples_w2v.generate_samples()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Vector representation of the cleaned sentences, passed to generate_grades below.
# NOTE(review): cleanSent_vec is project-local; output shape not visible here.
w2v_vec = vrm.cleanSent_vec(w2v_model, w2v_Dict)

In [None]:
from generation import GEN_VECTORS

In [None]:
# Create vectors and grades for the generated samples using the w2v model.
# NOTE(review): GEN_VECTORS is project-local (generation.py); semantics of
# generate_vectors / generate_grades are not visible here.
gen_vec_w2v = GEN_VECTORS(new_samples_w2v, new_grad_w2v, vrm, w2v_model, w2v_Dict)
new_x_w2v = gen_vec_w2v.generate_vectors(y)
new_y_w2v = gen_vec_w2v.generate_grades(new_x_w2v, w2v_vec, y)

In [None]:
# Sanity check: generated vectors and generated grades should have equal length.
len(new_x_w2v), len(new_y_w2v)

(12372, 12372)

In [None]:
# Number of generated samples.
len(new_samples_w2v)

12372

In [None]:
from gensim.models import KeyedVectors
from gensim import models

# Load word vectors stored in the binary word2vec format.
# This rebinds `w2v_model` from the model built earlier to a KeyedVectors
# object.
# NOTE(review): absolute Colab path — consider a configurable data directory.
# NOTE(review): the token indices computed below are only meaningful for the
# trained models if this file matches the embedding matrix they were built
# with — confirm.
word2vec_path = '/content/word2VecModel.bin'
w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# `w2v_model` is a KeyedVectors object at this point (loaded in the previous
# cell), so the vocabulary is read from it directly. Going through `.wv` on a
# KeyedVectors only hits the deprecated self-alias and emits a warning.
w2v_vocab = w2v_model.vocab

  w2v_vocab = w2v_model.wv.vocab


In [None]:
# Index sequences for the generated samples; tokens that are not in the
# vocabulary are dropped.
new_S = [
  [w2v_vocab[token].index for token in sample if token in w2v_vocab]
  for sample in new_samples_w2v
]

In [None]:
# Sanity check: one index sequence per generated grade.
len(new_S), len(new_y_w2v)

(12372, 12372)

In [None]:
# Pad/truncate the generated sequences to the same length as the original
# padded data `S` (the length the models were built for) instead of repeating
# the magic number 524.
new_S = tf.keras.utils.pad_sequences(new_S, padding="post", maxlen=S.shape[1])


In [None]:
# 75/25 split of the generated samples only (seeded for reproducibility).
new_X_train_w2v, new_X_test_w2v, new_y_train_w2v, new_y_test_w2v = train_test_split(new_S, new_y_w2v, test_size=0.25, random_state=41)
# No validation split is made here: the cells below only run predictions on these samples.

In [None]:
# Score the LSTM model (trained on original data) on the generated samples.
y_pred = model_lstm.predict(new_X_train_w2v)
y_pred = np.around(y_pred)  # round continuous outputs to discrete grades
# Quadratic-weighted Cohen's kappa against the generated grades.
result_aug = cohen_kappa_score(new_y_train_w2v,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result_aug))

Kappa Score: 0.5251998847408686


In [None]:
# GRU predictions on the generated samples, rounded to discrete grades
# (scored in the next cell).
y_pred = model.predict(new_X_train_w2v)
y_pred = np.around(y_pred)



In [None]:
# Quadratic-weighted Cohen's kappa of the GRU predictions from the previous cell.
result_aug = cohen_kappa_score(new_y_train_w2v,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result_aug))

Kappa Score: 0.5144093914665215


#### Augmented data (aug_data)

In [None]:
from generation import AUGSAM

In [None]:
# Build the augmented dataset from the original (S, y) and generated (new_S, new_y_w2v) data.
# NOTE(review): AUGSAM is project-local; presumably it combines the two sets — confirm.
aug_vec_w2v, aug_y_w2v = AUGSAM(S, y, new_S, new_y_w2v)()

In [None]:
# 75/25 train/test split of the augmented dataset.
new_X_train_w2v, new_X_test_w2v, new_y_train_w2v, new_y_test_w2v = train_test_split(
    aug_vec_w2v, aug_y_w2v, test_size=0.25, random_state=41)

# Carve the validation set out of the *augmented* training part. The previous
# version re-split the original X_train_w2v / y_train_w2v here, which silently
# discarded the augmented training data, so the "augmented" models were
# retrained on original data only.
new_X_train_w2v, new_X_val_w2v, new_y_train_w2v, new_y_val_w2v = train_test_split(
    new_X_train_w2v, new_y_train_w2v, test_size=0.2, random_state=41)

# NOTE(review): aug_vec_w2v is built from S, which also contains the rows held
# out in X_test_w2v — verify that later evaluation on X_test_w2v is not
# affected by train/test leakage.

In [None]:
# Continue training the LSTM model on the augmented split, with callbacks
# created for this run only:
#  * the checkpoint must write to the LSTM file (`file_path_lstm`);
#  * a reused ModelCheckpoint would remember the best val_mae of the earlier
#    run and could refuse to save anything from this one.
check_point_lstm_aug = ModelCheckpoint(file_path_lstm, monitor="val_mae", verbose=1,
                                       save_best_only=True, mode="min")
early_stop_lstm_aug = EarlyStopping(monitor="val_mae", mode="min", patience=5)

model_lstm.fit(new_X_train_w2v, new_y_train_w2v, batch_size=64, epochs=40,
               validation_data=(new_X_val_w2v, new_y_val_w2v),
               callbacks=[check_point_lstm_aug, early_stop_lstm_aug])

Epoch 1/40
Epoch 1: val_mae improved from inf to 0.57961, saving model to lstm_model.hdf5
Epoch 2/40
Epoch 2: val_mae did not improve from 0.57961
Epoch 3/40
Epoch 3: val_mae improved from 0.57961 to 0.57567, saving model to lstm_model.hdf5
Epoch 4/40
Epoch 4: val_mae improved from 0.57567 to 0.57357, saving model to lstm_model.hdf5
Epoch 5/40
Epoch 5: val_mae improved from 0.57357 to 0.56040, saving model to lstm_model.hdf5
Epoch 6/40
Epoch 6: val_mae improved from 0.56040 to 0.56001, saving model to lstm_model.hdf5
Epoch 7/40
Epoch 7: val_mae improved from 0.56001 to 0.54451, saving model to lstm_model.hdf5
Epoch 8/40
Epoch 8: val_mae did not improve from 0.54451
Epoch 9/40
Epoch 9: val_mae did not improve from 0.54451
Epoch 10/40
Epoch 10: val_mae did not improve from 0.54451
Epoch 11/40
Epoch 11: val_mae did not improve from 0.54451
Epoch 12/40
Epoch 12: val_mae did not improve from 0.54451


<keras.callbacks.History at 0x7f8b564f6520>

In [None]:
# Continue training the GRU model on the augmented split, with callbacks
# created for this run only:
#  * the checkpoint must write to the GRU file (`file_path`);
#  * a reused ModelCheckpoint would remember the best val_mae of the earlier
#    run and could refuse to save anything from this one.
check_point_gru_aug = ModelCheckpoint(file_path, monitor="val_mae", verbose=1,
                                      save_best_only=True, mode="min")
early_stop_gru_aug = EarlyStopping(monitor="val_mae", mode="min", patience=5)

model.fit(new_X_train_w2v, new_y_train_w2v, batch_size=64, epochs=40,
          validation_data=(new_X_val_w2v, new_y_val_w2v),
          callbacks=[check_point_gru_aug, early_stop_gru_aug])

Epoch 1/40
Epoch 1: val_mae improved from inf to 0.60680, saving model to our_test_model.hdf5
Epoch 2/40
Epoch 2: val_mae improved from 0.60680 to 0.58671, saving model to our_test_model.hdf5
Epoch 3/40
Epoch 3: val_mae did not improve from 0.58671
Epoch 4/40
Epoch 4: val_mae improved from 0.58671 to 0.57634, saving model to our_test_model.hdf5
Epoch 5/40
Epoch 5: val_mae did not improve from 0.57634
Epoch 6/40
Epoch 6: val_mae improved from 0.57634 to 0.56375, saving model to our_test_model.hdf5
Epoch 7/40
Epoch 7: val_mae did not improve from 0.56375
Epoch 8/40
Epoch 8: val_mae improved from 0.56375 to 0.55548, saving model to our_test_model.hdf5
Epoch 9/40
Epoch 9: val_mae did not improve from 0.55548
Epoch 10/40
Epoch 10: val_mae did not improve from 0.55548
Epoch 11/40
Epoch 11: val_mae did not improve from 0.55548
Epoch 12/40
Epoch 12: val_mae did not improve from 0.55548
Epoch 13/40
Epoch 13: val_mae did not improve from 0.55548


<keras.callbacks.History at 0x7f8b50a07a60>

In [None]:
# Load whatever is currently checkpointed at `file_path` as the augmented GRU model.
model_aug = load_model(file_path)

In [None]:
# Load whatever is currently checkpointed at `file_path_lstm` as the augmented LSTM model.
model_lstm_aug = load_model(file_path_lstm)

In [None]:
# 1) Augmented LSTM model on the augmented test split.
y_pred = model_lstm_aug.predict(new_X_test_w2v)
y_pred = np.around(y_pred)  # round continuous outputs to discrete grades
result_aug = cohen_kappa_score(new_y_test_w2v,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result_aug))

# 2) Same model on the original held-out test split, for comparison with the pre-augmentation score.
p = model_lstm_aug.predict(X_test_w2v)
y_pred = np.around(p)

result_aug = cohen_kappa_score(y_test_w2v,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result_aug))

Kappa Score: 0.6413153000406977
Kappa Score: 0.7164432235340807


In [None]:
# Augmented GRU model on the augmented test split, rounded to discrete grades
# (scored in the next cell).
y_pred = model_aug.predict(new_X_test_w2v)
y_pred = np.around(y_pred)



In [None]:
# Quadratic-weighted Cohen's kappa of the predictions from the previous cell.
result_aug = cohen_kappa_score(new_y_test_w2v,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result_aug))

Kappa Score: 0.6345474549280177


In [None]:
# Augmented GRU model on the original held-out test split, for comparison with the pre-augmentation score.
p = model_aug.predict(X_test_w2v)
y_pred = np.around(p)  # round continuous outputs to discrete grades

result_aug = cohen_kappa_score(y_test_w2v,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result_aug))

Kappa Score: 0.7030067313045353
