In [1]:
import os
import numpy as np
import re

import pandas as pd

In [2]:
def get_edited_headline(row):
    expr = re.compile("<(.*?)/>")
    edited_headline = (re.sub(expr,row.edit,row.original)).lower()
    #print((edited_headline))
    return edited_headline

def get_context(row):
    
    expr = re.compile("<(.*?)/>")
    context = (re.sub(expr,' ',row.original)).lower()
    #print((edited_headline))
    return context

def make_original_string(row):
    s=row.original
    expr = re.compile("<")
    s=re.sub(expr,'',s).lower()
    expr = re.compile(">")
    s=re.sub(expr,'',s).lower()
    return s

def enhance_df(df):
    df['edited'] = df.apply(get_edited_headline,axis=1)
    df['edited_len'] = df.apply(lambda row:len(row.edited.split(' ')),axis=1)
    df['context'] = df.apply(get_context,axis=1)
    df['original_string']=df.apply(make_original_string,axis=1)
    return df

In [3]:
train_df = pd.read_csv("./data/semeval_train.csv")

train_df = enhance_df(train_df)
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,edited,edited_len,context,original_string
0,5515,Trump <Lawyers/> Want A Second Special Counsel,toupees,32110,1.4,trump toupees want a second special counsel,7,trump want a second special counsel,trump lawyers/ want a second special counsel
1,12469,One industry suddenly has ' unfettered access ...,pie,21110,1.0,one industry suddenly has ' unfettered access ...,22,one industry suddenly has ' unfettered access ...,one industry suddenly has ' unfettered access ...
2,10947,"After healthcare vote , California Rep. Jeff D...",wrote,10000,0.2,"after healthcare vote , california rep. jeff d...",19,"after healthcare vote , california rep. jeff d...","after healthcare vote , california rep. jeff d..."
3,5518,Since when was there such a low bar for <defam...,planetary,10000,0.2,since when was there such a low bar for planet...,18,since when was there such a low bar for sett...,since when was there such a low bar for defama...
4,11453,"Donald Trump Unfollowed Reince Priebus , The U...",Compliment,32221,2.0,"donald trump unfollowed reince priebus , the u...",13,"donald trump unfollowed reince priebus , the u...","donald trump unfollowed reince priebus , the u..."


In [4]:
train_texts = train_df.edit.values.tolist()
train_labels = train_df.meanGrade.values

In [5]:
test_df = pd.read_csv("./data/semeval_test.csv")

test_df = enhance_df(test_df)
test_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,pred,edited,edited_len,context,original_string
0,7848,The Daily 202 : Loyalty is a one-way street fo...,cars,21100,0.8,0.938505,the daily 202 : loyalty is a one-way street fo...,12,the daily 202 : loyalty is a one-way street fo...,the daily 202 : loyalty is a one-way street fo...
1,9500,Trump Bodyguard Keith Schiller Testifies Russi...,Tacos,32211,1.8,0.938505,trump bodyguard keith schiller testifies russi...,14,trump bodyguard keith schiller testifies russi...,trump bodyguard keith schiller testifies russi...
2,10518,Trump Jr. says missing out on India <deals/> b...,food,32111,1.6,0.938505,trump jr. says missing out on india food becau...,14,trump jr. says missing out on india because ...,trump jr. says missing out on india deals/ bec...
3,11882,WHCD Comedian Michelle Wolf : Trump a ‘ Pussy ...,Hear,10000,0.2,0.938505,whcd comedian michelle wolf : trump a ‘ pussy ...,23,whcd comedian michelle wolf : trump a ‘ pussy ...,whcd comedian michelle wolf : trump a ‘ pussy ...
4,1239,US calls Russia 's decision to <cut/> its dipl...,vellicate,0,0.0,0.938505,us calls russia 's decision to vellicate its d...,21,us calls russia 's decision to its diplomati...,us calls russia 's decision to cut/ its diplom...


In [6]:
test_texts = test_df.edit.values.tolist()
test_labels = test_df.meanGrade.values

In [7]:
from transformers import AutoTokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
import torch
from transformers import AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = AutoModel.from_pretrained(model_name).to(device)

  return torch._C._cuda_getDeviceCount() > 0


In [9]:
def tokenize(batch):
    return tokenizer(batch, padding='max_length',truncation=True,max_length=5)

In [10]:
def extract_bert_embedding(texts):

    tokens = tokenize(texts)
    input_ids_tensor = torch.tensor(tokens['input_ids']).to(device)
    attention_mask_tensor = torch.tensor(tokens['attention_mask']).to(device)
    
    with torch.no_grad():
        last_hidden_state = bert_model(input_ids_tensor, attention_mask_tensor).last_hidden_state
        last_hidden_state = last_hidden_state.cpu().numpy()
    
    embeddings = last_hidden_state[:,0,:]
    return embeddings
    

In [11]:
train_embedding = extract_bert_embedding(train_texts)
test_embeddings = extract_bert_embedding(test_texts)

In [12]:
train_embedding.shape

(7721, 768)

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Input
import keras

In [14]:
model = Sequential()
 
model.add(Input(shape=(768)))
model.add(Dense(32,activation='relu'))
model.add(Dense(32,activation='relu'))
model.add(Dense(1))
# compile the model
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss='mean_squared_error')
# summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                24608     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 25,697
Trainable params: 25,697
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
model.fit(train_embedding, train_labels, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7feee4070090>

In [16]:
def evaluate_predictions(pred_df):
    '''RMSE is the metric used to compare predicted score and actual score'''
    diff = (pred_df.meanGrade - pred_df.pred).values
    mean_squared_diff = np.sum(diff*diff)/diff.shape[0]
    rmse = np.sqrt(mean_squared_diff)
    return rmse

In [17]:
pred = model.predict(train_embedding)
train_df['pred'] = pred
evaluate_predictions(train_df)

0.531665657575536

In [18]:

pred = model.predict(test_embeddings)
test_df['pred'] = pred
evaluate_predictions(test_df)

0.5574142387130734