In [55]:
import os
import numpy as np
import re

In [56]:
## Download glove embedding (reference .. https://edumunozsala.github.io/BlogEms/jupyter/nlp/classification/embeddings/python/2020/08/15/Intro_NLP_WordEmbeddings_Classification.html)
embedding_file = "../embedding/glove.6B/glove.6B.300d.txt"

In [57]:
# load the whole embedding into memory
embeddings_index = dict()
f = open(embedding_file)
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [58]:
import pandas as pd

In [59]:
def get_edited_headline(row):
    expr = re.compile("<(.*?)/>")
    edited_headline = (re.sub(expr,row.edit,row.original)).lower()
    #print((edited_headline))
    return edited_headline

def get_context(row):
    
    expr = re.compile("<(.*?)/>")
    context = (re.sub(expr,' ',row.original)).lower()
    #print((edited_headline))
    return context

def make_original_string(row):
    s=row.original
    expr = re.compile("<")
    s=re.sub(expr,'',s).lower()
    expr = re.compile(">")
    s=re.sub(expr,'',s).lower()
    return s

def enhance_df(df):
    df['edited'] = df.apply(get_edited_headline,axis=1)
    df['edited_len'] = df.apply(lambda row:len(row.edited.split(' ')),axis=1)
    df['context'] = df.apply(get_context,axis=1)
    df['original_string']=df.apply(make_original_string,axis=1)
    return df

In [60]:
train_df = pd.read_csv("./data/semeval_train.csv")

train_df = enhance_df(train_df)
train_df.head()


In [62]:
from keras.preprocessing.text import Tokenizer

In [63]:
docs = train_df.edited
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

In [64]:
print(vocab_size)

9925


In [65]:
train_df['edited_len'].describe()

count    7721.000000
mean       12.406165
std         3.632329
min         4.000000
25%        10.000000
50%        12.000000
75%        15.000000
max        26.000000
Name: edited_len, dtype: float64

In [66]:
max_len = 20

In [67]:
train_df

Unnamed: 0,id,original,edit,grades,meanGrade,edited,edited_len,context,original_string
0,5515,Trump <Lawyers/> Want A Second Special Counsel,toupees,32110,1.4,trump toupees want a second special counsel,7,trump want a second special counsel,trump lawyers/ want a second special counsel
1,12469,One industry suddenly has ' unfettered access ...,pie,21110,1.0,one industry suddenly has ' unfettered access ...,22,one industry suddenly has ' unfettered access ...,one industry suddenly has ' unfettered access ...
2,10947,"After healthcare vote , California Rep. Jeff D...",wrote,10000,0.2,"after healthcare vote , california rep. jeff d...",19,"after healthcare vote , california rep. jeff d...","after healthcare vote , california rep. jeff d..."
3,5518,Since when was there such a low bar for <defam...,planetary,10000,0.2,since when was there such a low bar for planet...,18,since when was there such a low bar for sett...,since when was there such a low bar for defama...
4,11453,"Donald Trump Unfollowed Reince Priebus , The U...",Compliment,32221,2.0,"donald trump unfollowed reince priebus , the u...",13,"donald trump unfollowed reince priebus , the u...","donald trump unfollowed reince priebus , the u..."
...,...,...,...,...,...,...,...,...,...
7716,10954,If Trump wants to use this memo to fire Rosens...,tweeting,32111,1.6,if trump wants to use this memo to fire rosens...,20,if trump wants to use this memo to fire rosens...,if trump wants to use this memo to fire rosens...
7717,10932,Russia Will Test ' Unstoppable ' Satan <Missil...,Prayer,33210,1.8,russia will test ' unstoppable ' satan prayer ...,15,russia will test ' unstoppable ' satan by en...,russia will test ' unstoppable ' satan missile...
7718,8303,Trump campaign had contact with Russian <intel...,vodka,21111,1.2,trump campaign had contact with russian vodka ...,9,trump campaign had contact with russian : nyt,trump campaign had contact with russian intell...
7719,9277,How Trump 's Twitter account is fueling a GOP ...,electricity,21100,0.8,how trump 's twitter account is fueling a gop ...,11,how trump 's twitter account is fueling a gop ...,how trump 's twitter account is fueling a gop ...


In [78]:
def compute_average_embedding(sentence):
    embeddings = []
    for word in sentence:
        if word in embeddings_index:
            embeddings.append(np.asarray(embeddings_index[word]))
    if len(embeddings) == 0:
        print(sentence)
    mean_embedding = np.mean(embeddings,axis=0)
    #print(mean_embedding.shape)
    return mean_embedding
    

In [130]:
def compute_embedding_1(df):
'Dot product of both sentence averages'
    edited_sentences = df.edited
    edited_embeddings = np.asarray([compute_average_embedding(sentence) for sentence in edited_sentences])
    
    original_sentences = df.original_string
    original_embeddings = np.asarray([compute_average_embedding(sentence) for sentence in original_sentences])
    
    return (edited_embeddings * original_embeddings)

    
    

In [131]:
train_embeddings = compute_embedding_1(train_df)

In [132]:
train_emebddings = train_edited_embeddings - train_original_embeddings

In [133]:
labels = train_df.meanGrade.values

In [134]:
train_embeddings.shape

(7721, 300)

In [135]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Input
import keras

In [136]:
# define model
model = Sequential()
 
model.add(Input(shape=(300)))
model.add(Dense(32,activation='relu'))
model.add(Dense(32,activation='relu'))
#model.add(Flatten())
model.add(Dense(1))
# compile the model
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='mean_squared_error')
# summarize the model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 32)                9632      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 10,721
Trainable params: 10,721
Non-trainable params: 0
_________________________________________________________________
None


In [137]:
model.fit(train_embeddings, labels, epochs=5, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe4dc3d3450>

In [138]:
test_df = pd.read_csv("./data/semeval_test.csv")
test_df = enhance_df(test_df)

In [139]:
test_gt = test_df.meanGrade.values

In [140]:
test_embeddings = compute_embedding_1(test_df)

In [141]:
test_pred = model.predict(test_embeddings)

In [142]:
def evaluate_predictions(pred_df):
    '''RMSE is the metric used to compare predicted score and actual score'''
    diff = (pred_df.meanGrade - pred_df.pred).values
    mean_squared_diff = np.sum(diff*diff)/diff.shape[0]
    rmse = np.sqrt(mean_squared_diff)
    return rmse

In [143]:
test_df['pred']= test_pred

In [144]:
evaluate_predictions(test_df)

0.5828596518150597

In [145]:
train_pred = model.predict(train_embeddings)

In [146]:
train_df['pred'] = train_pred

In [147]:
evaluate_predictions(train_df)

0.5813737406323164

In [148]:
train_df = pd.read_csv("./data/semeval_train.csv")

train_df = enhance_df(train_df)
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,edited,edited_len,context,original_string
0,5515,Trump <Lawyers/> Want A Second Special Counsel,toupees,32110,1.4,trump toupees want a second special counsel,7,trump want a second special counsel,trump lawyers/ want a second special counsel
1,12469,One industry suddenly has ' unfettered access ...,pie,21110,1.0,one industry suddenly has ' unfettered access ...,22,one industry suddenly has ' unfettered access ...,one industry suddenly has ' unfettered access ...
2,10947,"After healthcare vote , California Rep. Jeff D...",wrote,10000,0.2,"after healthcare vote , california rep. jeff d...",19,"after healthcare vote , california rep. jeff d...","after healthcare vote , california rep. jeff d..."
3,5518,Since when was there such a low bar for <defam...,planetary,10000,0.2,since when was there such a low bar for planet...,18,since when was there such a low bar for sett...,since when was there such a low bar for defama...
4,11453,"Donald Trump Unfollowed Reince Priebus , The U...",Compliment,32221,2.0,"donald trump unfollowed reince priebus , the u...",13,"donald trump unfollowed reince priebus , the u...","donald trump unfollowed reince priebus , the u..."
