In [1]:
import os
import numpy as np
import re

In [2]:
## Download glove embedding (reference .. https://edumunozsala.github.io/BlogEms/jupyter/nlp/classification/embeddings/python/2020/08/15/Intro_NLP_WordEmbeddings_Classification.html)
embedding_file = "../embedding/glove.6B/glove.6B.300d.txt"

In [3]:
# load the whole embedding into memory
embeddings_index = dict()
f = open(embedding_file)
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [4]:
import pandas as pd

In [14]:
def enhance_df(df):
    df['edited'] = df.apply(get_edited_headline,axis=1)
    df['edited_len'] = df.apply(lambda row:len(row.edited.split(' ')),axis=1)
    return df

In [15]:
train_df = pd.read_csv("./data/semeval_train.csv")

In [16]:
def get_edited_headline(row):
    expr = re.compile("<(.*?)/>")
    edited_headline = re.sub(expr,row.edit,row.original)
    return edited_headline

In [17]:
train_df = enhance_df(train_df)


In [18]:
from keras.preprocessing.text import Tokenizer

In [19]:
docs = train_df.edited
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

In [20]:
print(vocab_size)

9925


In [21]:
train_df['edited_len'].describe()

count    7721.000000
mean       12.406165
std         3.632329
min         4.000000
25%        10.000000
50%        12.000000
75%        15.000000
max        26.000000
Name: edited_len, dtype: float64

In [22]:
max_len = 20

In [23]:
from keras.preprocessing.sequence import pad_sequences
encoded_docs = t.texts_to_sequences(docs)
padded_train_docs = pad_sequences(encoded_docs, maxlen=max_len, padding='post')

In [24]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [26]:
labels = train_df.meanGrade.values

In [27]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [29]:
# define model
model = Sequential()
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=20, trainable=False)
model.add(e)
model.add(Dense(32,activation='relu'))
model.add(Flatten())
model.add(Dense(1))
# compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 300)           2977500   
_________________________________________________________________
dense (Dense)                (None, 20, 32)            9632      
_________________________________________________________________
flatten (Flatten)            (None, 640)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 641       
Total params: 2,987,773
Trainable params: 10,273
Non-trainable params: 2,977,500
_________________________________________________________________
None


In [30]:
# fit the model
model.fit(padded_train_docs, labels, epochs=50, verbose=1)




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f91dc259f90>

In [31]:
test_df = pd.read_csv("./data/semeval_test.csv")
test_df = enhance_df(test_df)

In [32]:
test_docs =test_df.edited
encoded_test = t.texts_to_sequences(test_docs)
padded_test = pad_sequences(encoded_test, maxlen=max_len, padding='post')

In [33]:
test_gt = test_df.meanGrade.values

In [34]:
test_preds=model.predict(padded_test)

In [35]:
test_df['pred'] = test_preds

In [36]:
def evaluate_predictions(pred_df):
    '''RMSE is the metric used to compare predicted score and actual score'''
    diff = (pred_df.meanGrade - pred_df.pred).values
    mean_squared_diff = np.sum(diff*diff)/diff.shape[0]
    rmse = np.sqrt(mean_squared_diff)
    return rmse

In [37]:
evaluate_predictions(test_df)

0.7387709270806264

In [39]:
train_preds = model.predict(padded_train_docs)
train_df['pred']=train_preds
evaluate_predictions(train_preds)

AttributeError: 'numpy.ndarray' object has no attribute 'meanGrade'