In [1]:
import os
import numpy as np
import re

In [2]:
## Download glove embedding (reference .. https://edumunozsala.github.io/BlogEms/jupyter/nlp/classification/embeddings/python/2020/08/15/Intro_NLP_WordEmbeddings_Classification.html)
embedding_file = "../embedding/glove.6B/glove.6B.300d.txt"

In [3]:
# load the whole embedding into memory
embeddings_index = dict()
f = open(embedding_file)
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [4]:
import pandas as pd

In [67]:
def get_edited_headline(row):
    expr = re.compile("<(.*?)/>")
    edited_headline = (re.sub(expr,row.edit,row.original)).lower()
    #print((edited_headline))
    return edited_headline

In [68]:
def enhance_df(df):
    df['edited'] = df.apply(get_edited_headline,axis=1)
    df['edited_len'] = df.apply(lambda row:len(row.edited.split(' ')),axis=1)
    return df

In [69]:
train_df = pd.read_csv("./data/semeval_train.csv")

In [70]:
train_df = enhance_df(train_df)


In [71]:
from keras.preprocessing.text import Tokenizer

In [72]:
docs = train_df.edited
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

In [73]:
print(vocab_size)

9925


In [74]:
train_df['edited_len'].describe()

count    7721.000000
mean       12.406165
std         3.632329
min         4.000000
25%        10.000000
50%        12.000000
75%        15.000000
max        26.000000
Name: edited_len, dtype: float64

In [75]:
max_len = 20

In [76]:
train_df

Unnamed: 0,id,original,edit,grades,meanGrade,edited,edited_len
0,5515,Trump <Lawyers/> Want A Second Special Counsel,toupees,32110,1.4,trump toupees want a second special counsel,7
1,12469,One industry suddenly has ' unfettered access ...,pie,21110,1.0,one industry suddenly has ' unfettered access ...,22
2,10947,"After healthcare vote , California Rep. Jeff D...",wrote,10000,0.2,"after healthcare vote , california rep. jeff d...",19
3,5518,Since when was there such a low bar for <defam...,planetary,10000,0.2,since when was there such a low bar for planet...,18
4,11453,"Donald Trump Unfollowed Reince Priebus , The U...",Compliment,32221,2.0,"donald trump unfollowed reince priebus , the u...",13
...,...,...,...,...,...,...,...
7716,10954,If Trump wants to use this memo to fire Rosens...,tweeting,32111,1.6,if trump wants to use this memo to fire rosens...,20
7717,10932,Russia Will Test ' Unstoppable ' Satan <Missil...,Prayer,33210,1.8,russia will test ' unstoppable ' satan prayer ...,15
7718,8303,Trump campaign had contact with Russian <intel...,vodka,21111,1.2,trump campaign had contact with russian vodka ...,9
7719,9277,How Trump 's Twitter account is fueling a GOP ...,electricity,21100,0.8,how trump 's twitter account is fueling a gop ...,11


In [77]:
docs = train_df.edited

In [78]:
sample = train_df.iloc[12]

In [79]:
sample_sentence = sample.edited

In [80]:
sample_sentence

"trump tie : why it 's misunderstood and what to do about it"

In [81]:
sample_sentence_words = sample_sentence.split(' ')

In [82]:
embeddings = []
for word in sample_sentence_words:
    if word in embeddings_index:
        embeddings.append(embeddings_index[word])


In [83]:
len(embeddings)

13

In [84]:
len(sample_sentence_words)

13

In [85]:
sample.original

"Trump <Rally/> : Why it 's misunderstood and what to do about it"

In [86]:
np.mean(embeddings,axis=0).shape

(300,)

In [96]:
def compute_average_embedding(sentence):
'Average of all word embedding'
    embeddings = []
    for word in sentence:
        if word in embeddings_index:
            embeddings.append(np.asarray(embeddings_index[word]))
    if len(embeddings) == 0:
        print(sentence)
    mean_embedding = np.mean(embeddings,axis=0)
    #print(mean_embedding.shape)
    return mean_embedding
    

In [97]:
train_sentences = train_df.edited

train_embedding = [compute_average_embedding(sentence) for sentence in train_sentences]

train_embeddings = np.asarray(train_embedding)

In [106]:
labels = train_df.meanGrade.values

In [107]:
train_embeddings.shape

(7721, 300)

In [133]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Input
import keras

In [150]:
# define model
model = Sequential()
 
model.add(Input(shape=(300)))
model.add(Dense(32,activation='relu'))
model.add(Dense(32,activation='relu'))
#model.add(Flatten())
model.add(Dense(1))
# compile the model
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='mean_squared_error')
# summarize the model
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 32)                9632      
_________________________________________________________________
dense_19 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 33        
Total params: 10,721
Trainable params: 10,721
Non-trainable params: 0
_________________________________________________________________
None


In [162]:
model.fit(train_embeddings, labels, epochs=50, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f4b8c1a3090>

In [152]:
test_df = pd.read_csv("./data/semeval_test.csv")
test_df = enhance_df(test_df)

In [153]:
test_gt = test_df.meanGrade.values

In [154]:
test_sentences = test_df.edited

test_embeddings = np.asarray([compute_average_embedding(sentence) for sentence in test_sentences])

In [155]:
test_pred = model.predict(test_embeddings)

In [156]:
def evaluate_predictions(pred_df):
    '''RMSE is the metric used to compare predicted score and actual score'''
    diff = (pred_df.meanGrade - pred_df.pred).values
    mean_squared_diff = np.sum(diff*diff)/diff.shape[0]
    rmse = np.sqrt(mean_squared_diff)
    return rmse

In [157]:
test_df['pred']= test_pred

In [158]:
evaluate_predictions(test_df)

0.5796133298950806

In [159]:
train_pred = model.predict(train_embeddings)

In [160]:
train_df['pred'] = train_pred

In [161]:
evaluate_predictions(train_df)

0.5722618082490456