# NOTE: Run the following in terminal in the virtual environment and restart the kernel before running this notebook:
python3 -m spacy download en_core_web_md

In [1]:
import os 
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Imports from root dir are now possible:
from src import util

In [2]:
import numpy as np
import pandas as pd
import nltk
import spacy
import gensim.models.keyedvectors as word2vec
import re

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm
tqdm.pandas()


# Data source:
# https://www.kaggle.com/c/inls690-270-funny-news-headline/data

# Load training data

In [3]:
train_path = f'{util.DATA_DIR}/train.csv'
test_path = f'{util.DATA_DIR}/test.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

print('training set size:', len(df_train))
print('test set size:', len(df_test))

print(df_train.head)
print(df_test.head)

training set size: 7239
test set size: 2413
<bound method NDFrame.head of          id                                           original           edit  \
0     10070  Lawmaker Who Assaulted Reporter Fights Court-O...        Shaving   
1      1062  Trump rolls back Obama 's rule requiring emplo...           pets   
2     12796  ' Who the hell is <Dana Rohrabacher/> ? ' Seth...         batman   
3      1745  House Republicans just voted to gut the indepe...        laundry   
4     13366  The Coca-Cola invasion is causing Mexico ’s sl...           mail   
...     ...                                                ...            ...   
7234  12642       Trump <looms/> over Georgia special election         stands   
7235    100  Trump lawful group shake-up clears way for con...  disappearance   
7236   3310  Trump will <pardon/> conservative pundit Dines...           date   
7237   1518  Ancient ‘ frozen ’ tomb of Scythian <Prince/> ...         Scythe   
7238  14471  Theresa May orders big

# Preprocess headlines

In [4]:
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))

def preprocess(text):
    text = text.strip()
    text = text.replace("<", "").replace("/>", "") # remove hyphens
    for w in text.split(" "):
        if not w.isalpha():
            text = text.replace(w, "")
    text = " ".join(text.split())
    if all([w[0].isupper() for w in text.split(" ") if w not in STOP_WORDS]):
        text = text.lower()
        text = text[0].upper() + text[1:]
    text = text.replace("'", "") # remove apostrophe (would cause problems later on) 
    return text

# apply
df_train["headline_preprocessed"] = df_train["original"].progress_apply(preprocess)
df_test["headline_preprocessed"] = df_test["original"].progress_apply(preprocess)

HBox(children=(FloatProgress(value=0.0, max=7239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2413.0), HTML(value='')))




# Tokenize replaced word

In [5]:
def get_replaced_token(headline):
    start = "<"
    end = "/>"
    replaced_token = headline[(headline.index(start)+len(start)):headline.index(end)].strip().lower()
    return replaced_token

df_train["replaced_token"] = df_train["original"].progress_apply(get_replaced_token)
df_test["replaced_token"] = df_test["original"].progress_apply(get_replaced_token)


HBox(children=(FloatProgress(value=0.0, max=7239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2413.0), HTML(value='')))




# Create SpaCy pipeline with added custom component for recasting multi-word entites as single tokens


In [6]:
# treat multi-word entities as individual tokens instead of multiple tokens (e.g. "New York" instead of "New" + "York")
class EntityRetokenizeComponent:
    def __init__(self, pipeline):
        pass
    
    def __call__(self, doc):
        with doc.retokenize() as retokenizer:
            for ent in doc.ents:
                retokenizer.merge(doc[ent.start:ent.end], attrs={"LEMMA": str(doc[ent.start:ent.end])})
        return doc

# create SpaCy pipeline
spacy_pipeline = spacy.load("en_core_web_md")
retokenizer = EntityRetokenizeComponent(spacy_pipeline) 
spacy_pipeline.add_pipe(retokenizer, name='merge_enitities', last=True)

# apply
df_train["headline_spacy_obj"] = df_train["headline_preprocessed"].progress_apply(spacy_pipeline)
df_test["headline_spacy_obj"] = df_test["headline_preprocessed"].progress_apply(spacy_pipeline)


HBox(children=(FloatProgress(value=0.0, max=7239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2413.0), HTML(value='')))




# Load pre-trained Word2Vec model (GoogleNews, 300-dim)
## Download here: https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download
## Unzip downloaded file and place in data directory  

In [7]:
w2v_path = f"{util.DATA_DIR}/GoogleNews-vectors-negative300.bin"
w2v = word2vec.KeyedVectors.load_word2vec_format(w2v_path, binary=True)


In [9]:
vocab = set(w2v.vocab)

# Tokenize headlines using SpaCy

In [10]:
def tokenize(spacy_obj):
    tokens = []
    for word in spacy_obj.doc:
        w = str(word)
        if spacy_pipeline.vocab[word.text.lower()].is_stop: continue
        if w in vocab:
            tokens.append(w)
        else:
            capitalized = " ".join([x.capitalize() for x in w.split(" ")])
            if capitalized in vocab:
                tokens.append(capitalized)
            else:
                w = w.lower()
                if w in vocab:
                    tokens.append(w)
            
    return tokens

df_train["headline_tokens"] = df_train["headline_spacy_obj"].progress_apply(tokenize)
df_test["headline_tokens"] = df_test["headline_spacy_obj"].progress_apply(tokenize)


HBox(children=(FloatProgress(value=0.0, max=7239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2413.0), HTML(value='')))




# Tokenize edit word

In [11]:
df_train["edit_token"] = df_train["edit"].progress_apply(lambda x: x.lower())
df_test["edit_token"] = df_test["edit"].progress_apply(lambda x: x.lower())


HBox(children=(FloatProgress(value=0.0, max=7239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2413.0), HTML(value='')))




# Select features for supervised training 

In [12]:
y_train = df_train["meanGrade"]

In [13]:
# Get the average word vector of all the words in all headlines

vecs = []
for i, tokens in enumerate(df_train["headline_tokens"]):
    for token in tokens:
        if token in w2v.vocab:
            vec = w2v[token]
            vecs.append(vec)
avg_vec = np.nanmean(vecs, axis=0)


In [14]:
# Method: Subtract the word vector of the 

def get_X_diff(df):
    X = np.zeros((len(df), 300))
    for i, edit_token in enumerate(df["edit_token"]):
        replaced_token = df["replaced_token"][i]
        
        if edit_token in w2v.vocab:
            edit_vec = w2v[token]
        else:
            edit_vec = avg_vec
            
        if replaced_token in w2v.vocab:
            replaced_vec = w2v[token]
        else:
            replaced_vec = avg_vec
            
        X[i,:] = edit_vec - replaced_vec
        
    return X


In [15]:
# Method: concatenate the three vectors: 
# (1) the average of the word vectors of the headline tokens
# (2) the word vector of the replaced token
# (3) the word vector of the edit token

def get_X_concat(df):
    feat_1 = np.zeros((len(df), 300))
    feat_2 = np.zeros((len(df), 300))
    feat_3 = np.zeros((len(df), 300))
    
    for i, tokens in enumerate(df["headline_tokens"]):
        vecs = []
        for token in tokens:
            if token in w2v.vocab:
                vec = w2v[token]
                vecs.append(vec)
        if len(vecs) == 0:
            vecs.append(np.zeros(300))
        feat_1[i,:] = np.mean(vecs, axis=0)
    
    for i, token in enumerate(df["replaced_token"]):
        if token in w2v.vocab:
            feat_2[i,:] = w2v[token]
        else:
            feat_2[i,:] = avg_vec
        
    for i, token in enumerate(df["edit_token"]):
        if token in w2v.vocab:
            feat_3[i,:] = w2v[token]
        else:
            feat_3[i,:] = avg_vec
            
    X = np.concatenate((feat_1, feat_2, feat_3), axis=1)     
    
    return X


# Compare validation errors of feature methods

In [16]:
X_train_diff = get_X_diff(df_train)
X_test_diff = get_X_diff(df_test)

regressor = LinearRegression()
regressor.fit(X_train_diff, y_train)
print(f"Validation error of diff method: {mean_squared_error(y_train, regressor.predict(X_train_diff))}")


Validation error of diff method: 0.34375664102877135


In [17]:
X_train_concat = get_X_concat(df_train)
X_test_concat = get_X_concat(df_test)

regressor = LinearRegression()
regressor.fit(X_train_concat, y_train)
print(f"Validation error of concat method: {mean_squared_error(y_train, regressor.predict(X_train_concat))}")


Validation error of concat method: 0.24446346473112185


# Using the feature vector produced by the concatenation method seems to provide a much lower validation error, even without parameter tuning, so we use it for as our feature vector for our model

In [18]:
X_train = X_train_concat
X_test = X_test_concat

# Run grid search on various regressor models and their respective pools of parameters

In [19]:
def run_grid_search(regressor, params_dict, save_name):
    gs = GridSearchCV(regressor, params_dict, cv=5)
    gs.fit(X_train, y_train)
    print(f"Validation error: {mean_squared_error(y_train, gs.predict(X_train))}")
    print("Best parameters: ")
    for key, value in gs.best_params_.items():
        print(f"\t{key}: {value}")
        
    y_pred = gs.predict(X_test)
    df_pred = pd.DataFrame({
        "id": df_test["id"],
        "pred": y_pred
    })
    df_pred.to_csv(f"{util.DATA_DIR}/{save_name}", index=False)
    

# Simple Linear Regression

In [20]:
regressor = LinearRegression()
params_dict = {
    "normalize": (False, True),
}
run_grid_search(regressor, params_dict, save_name="df_pred_lin_reg.csv")


Validation error: 0.24446346473112185
Best parameters: 
	normalize: False


# Lasso Regression

In [21]:
regressor = Lasso()
params_dict = {
    "alpha": (0.1, 0.5, 1.0, 2.0, 5.0),
    "normalize": (False, True),
    "tol": (1e-3, 1e-4, 1e-5),
}
run_grid_search(regressor, params_dict, save_name="df_pred_lasso.csv")


Validation error: 0.34373150819691745
Best parameters: 
	alpha: 0.1
	normalize: False
	tol: 0.001


# Ridge Regression

In [22]:
regressor = Ridge()
params_dict = {
    "alpha": (0.1, 0.5, 1.0, 2.0, 5.0),
    "normalize": (False, True),
    "tol": (1e-3, 1e-4, 1e-5),
}
run_grid_search(regressor, params_dict, save_name="df_pred_ridge.csv")


Validation error: 0.26516220346339964
Best parameters: 
	alpha: 1.0
	normalize: True
	tol: 0.001


# Multilayer Perceptron (MLP)

In [25]:
regressor = MLPRegressor(early_stopping=True)
params_dict = {
    "hidden_layer_sizes": ((1000,500,200,100,100,50,10),),
    "alpha": (1.0e-4,),
    "tol": (1.0e-4,)
}
run_grid_search(regressor, params_dict, save_name="df_pred_mlp.csv")

Validation error: 0.24401159035048253
Best parameters: 
	alpha: 0.0001
	hidden_layer_sizes: (1000, 500, 200, 100, 100, 50, 10)
	tol: 0.0001
