In [1]:
import os
import numpy as np
import re

import pandas as pd

In [27]:
def get_edited_headline(row):
    expr = re.compile("<(.*?)/>")
    edited_headline = (re.sub(expr,row.edit,row.original)).lower()
    #print((edited_headline))
    return edited_headline

def get_context(row):
    
    expr = re.compile("<(.*?)/>")
    context = (re.sub(expr,' ',row.original)).lower()
    #print((edited_headline))
    return context

def mask_context(row):
    expr = re.compile("<(.*?)/>")
    edited_headline = (re.sub(expr,'[MASK]',row.original)).lower()
    #print((edited_headline))
    return edited_headline

def make_original_string(row):
    s=row.original
    expr = re.compile("<")
    s=re.sub(expr,'',s).lower()
    expr = re.compile(">")
    s=re.sub(expr,'',s).lower()
    return s

def enhance_df(df):
    df['edited'] = df.apply(get_edited_headline,axis=1)
    df['edited_len'] = df.apply(lambda row:len(row.edited.split(' ')),axis=1)
    df['context'] = df.apply(get_context,axis=1)
    df['original_string']=df.apply(make_original_string,axis=1)
    df['masked_context'] = df.apply(mask_context,axis=1)
    return df

In [28]:
train_df = pd.read_csv("./data/semeval_train.csv")

train_df = enhance_df(train_df)
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,edited,edited_len,context,original_string,masked_context
0,5515,Trump <Lawyers/> Want A Second Special Counsel,toupees,32110,1.4,trump toupees want a second special counsel,7,trump want a second special counsel,trump lawyers/ want a second special counsel,trump [mask] want a second special counsel
1,12469,One industry suddenly has ' unfettered access ...,pie,21110,1.0,one industry suddenly has ' unfettered access ...,22,one industry suddenly has ' unfettered access ...,one industry suddenly has ' unfettered access ...,one industry suddenly has ' unfettered access ...
2,10947,"After healthcare vote , California Rep. Jeff D...",wrote,10000,0.2,"after healthcare vote , california rep. jeff d...",19,"after healthcare vote , california rep. jeff d...","after healthcare vote , california rep. jeff d...","after healthcare vote , california rep. jeff d..."
3,5518,Since when was there such a low bar for <defam...,planetary,10000,0.2,since when was there such a low bar for planet...,18,since when was there such a low bar for sett...,since when was there such a low bar for defama...,since when was there such a low bar for [mask]...
4,11453,"Donald Trump Unfollowed Reince Priebus , The U...",Compliment,32221,2.0,"donald trump unfollowed reince priebus , the u...",13,"donald trump unfollowed reince priebus , the u...","donald trump unfollowed reince priebus , the u...","donald trump unfollowed reince priebus , the u..."


In [30]:
train_texts = train_df.masked_context.values.tolist()
train_labels = train_df.meanGrade.values

In [31]:
test_df = pd.read_csv("./data/semeval_test.csv")

test_df = enhance_df(test_df)
test_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,pred,edited,edited_len,context,original_string,masked_context
0,7848,The Daily 202 : Loyalty is a one-way street fo...,cars,21100,0.8,0.938505,the daily 202 : loyalty is a one-way street fo...,12,the daily 202 : loyalty is a one-way street fo...,the daily 202 : loyalty is a one-way street fo...,the daily 202 : loyalty is a one-way street fo...
1,9500,Trump Bodyguard Keith Schiller Testifies Russi...,Tacos,32211,1.8,0.938505,trump bodyguard keith schiller testifies russi...,14,trump bodyguard keith schiller testifies russi...,trump bodyguard keith schiller testifies russi...,trump bodyguard keith schiller testifies russi...
2,10518,Trump Jr. says missing out on India <deals/> b...,food,32111,1.6,0.938505,trump jr. says missing out on india food becau...,14,trump jr. says missing out on india because ...,trump jr. says missing out on india deals/ bec...,trump jr. says missing out on india [mask] bec...
3,11882,WHCD Comedian Michelle Wolf : Trump a ‘ Pussy ...,Hear,10000,0.2,0.938505,whcd comedian michelle wolf : trump a ‘ pussy ...,23,whcd comedian michelle wolf : trump a ‘ pussy ...,whcd comedian michelle wolf : trump a ‘ pussy ...,whcd comedian michelle wolf : trump a ‘ pussy ...
4,1239,US calls Russia 's decision to <cut/> its dipl...,vellicate,0,0.0,0.938505,us calls russia 's decision to vellicate its d...,21,us calls russia 's decision to its diplomati...,us calls russia 's decision to cut/ its diplom...,us calls russia 's decision to [mask] its dipl...


In [32]:
test_texts = test_df.masked_context.values.tolist()
test_labels = test_df.meanGrade.values

In [33]:
from transformers import AutoTokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [34]:
import torch
from transformers import AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = AutoModel.from_pretrained(model_name).to(device)

In [35]:
def tokenize(batch):
    return tokenizer(batch, padding='max_length',truncation=True,max_length=20)

In [45]:
def extract_bert_embedding(texts):

    tokens = tokenize(texts)
    input_ids_tensor = torch.tensor(tokens['input_ids']).to(device)
    attention_mask_tensor = torch.tensor(tokens['attention_mask']).to(device)
    
    with torch.no_grad():
        last_hidden_state = bert_model(input_ids_tensor, attention_mask_tensor).last_hidden_state
        last_hidden_state = last_hidden_state.cpu().numpy()
    
    embeddings = last_hidden_state[:,0,:]
    return embeddings
    

In [70]:
df = train_df
masked_context_text = df.masked_context.values.tolist()
edited_text = df.edited.values.tolist()
context_text = df.context.values.tolist()
original_text = df.original_string.values.tolist()
edit_text = df.edit.values.tolist()


masked_embedding= extract_bert_embedding(masked_context_text)
edited_embedding = extract_bert_embedding(edited_text)
context_embedding = extract_bert_embedding(context_text)
original_embedding= extract_bert_embedding(original_text)
edit_embedding = extract_bert_embedding(edit_text)

In [71]:
df = test_df
masked_context_text = df.masked_context.values.tolist()
edited_text = df.edited.values.tolist()
context_text = df.context.values.tolist()
original_text = df.original_string.values.tolist()
edit_text = df.edit.values.tolist()
test_masked_embedding= extract_bert_embedding(masked_context_text)
test_edited_embedding = extract_bert_embedding(edited_text)
test_context_embedding = extract_bert_embedding(context_text)
test_original_embedding= extract_bert_embedding(original_text)
test_edit_embedding = extract_bert_embedding(edit_text)

In [72]:
train_embedding = masked_embedding + edit_embedding
test_embeddings = test_masked_embedding + test_edit_embedding

In [85]:
test_embeddings = np.concatenate([test_masked_embedding,test_edit_embedding],axis=1)
train_embeddings = np.concatenate([masked_embedding,edit_embedding],axis=1)

In [100]:
def make_mixed_embeddings(a1,a2):
    e1 = np.concatenate([a1,a2],axis=1)
    e2 = a1 - a2
    e3 = a1 * a2
    return np.concatenate([e1,e2,e3],axis=1)

In [52]:
def extract_diff_embedding(df):
    '''difference between masked context and edit'''
    masked_context_text = df.masked_context.values.tolist()
    edited_text = df.edited.values.tolist()
    masked_embedding= extract_bert_embedding(masked_context_text)
    edited_embedding = extract_bert_embedding(edited_text)
    embedding = masked_embedding - edited_embedding
    return embedding

In [99]:
np.concatenate([masked_embedding,edit_embedding],axis=1)

(7721, 3072)

In [249]:
train_embeddings = make_mixed_embeddings(original_embedding,edited_embedding)
train_embeddings.shape

(7721, 3072)

In [250]:
test_embeddings = make_mixed_embeddings(test_original_embedding,test_edited_embedding)
test_embeddings.shape

(1931, 3072)

In [264]:
train_embeddings = extract_diff_embedding(train_df)
train_embeddings.shape

(7721, 768)

In [266]:
test_embeddings =extract_diff_embedding(test_df)
test_embeddings.shape

(1931, 768)

In [310]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Input
from keras import regularizers

In [318]:
#l2r = regularizers.l1(l1=0.00001)
l2r=None

In [324]:
model = Sequential()
 
model.add(Input(shape=(768)))

model.add(Dense(32,activation='relu',kernel_regularizer=l2r))
#model.add(Dense(256,activation='relu',kernel_regularizer=l2r))
model.add(Dense(1))
# compile the model
opt = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='mean_squared_error')
# summarize the model
print(model.summary())

Model: "sequential_32"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_78 (Dense)             (None, 32)                24608     
_________________________________________________________________
dense_79 (Dense)             (None, 1)                 33        
Total params: 24,641
Trainable params: 24,641
Non-trainable params: 0
_________________________________________________________________
None


In [325]:
model.fit(train_embeddings, train_labels, epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc7054a54d0>

In [326]:
def evaluate_predictions(pred_df):
    '''RMSE is the metric used to compare predicted score and actual score'''
    diff = (pred_df.meanGrade - pred_df.pred).values
    mean_squared_diff = np.sum(diff*diff)/diff.shape[0]
    rmse = np.sqrt(mean_squared_diff)
    return rmse

In [327]:
pred = model.predict(train_embeddings)
train_df['pred'] = pred
evaluate_predictions(train_df)

0.5541081562271556

In [328]:

pred = model.predict(test_embeddings)
test_df['pred'] = pred
evaluate_predictions(test_df)

0.5747233143664005

In [242]:
pred = model.predict(train_embeddings)
train_df['pred'] = pred
evaluate_predictions(train_df)

0.511209222045681

In [144]:

pred = model.predict(test_embeddings)
test_df['pred'] = pred
evaluate_predictions(test_df)

0.5867240416307906