# Sem Eval Task - Detecting Humor in News Headlines
By: Kat Young, Varsha, Humera


### Data Preprocessing - written by Kat, Varsha, and Humera


In [1]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/kat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from string import digits
import nltk
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer

################ Read in Data Set ################
dataset = pd.read_csv("train.csv")


######### Remove entries with 0 grades ##########
dataset = dataset[dataset['grades'] != 0]


##### Drop unnecessary feature cols "id" and "grades" #####
dataset = dataset.drop(['id','grades'], axis=1)


#### Replace word in news headline and save to new col ####
dataset['replaced_sentence'] = ""
storage_array = []
for index, row in dataset.iterrows():
    new = re.sub('<.*/>', row['edit'], row['original'], flags=re.DOTALL)
    storage_array.append(new)
dataset['replaced_sentence'] = storage_array


############ Convert all characters to lowercase ############
dataset['replaced_sentence'] = dataset['replaced_sentence'].apply(lambda x: " ".join(x.lower() for x in x.split()))


##################### Remove Stop Words #####################
stop = stopwords.words('english')
dataset['replaced_sentence'] = dataset['replaced_sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


##################### Remove punctuation ##################### 
dataset['replaced_sentence'] = dataset['replaced_sentence'].str.replace('[^\w\s]','')


############ Remove common words along with 's' and 'nt' ############
frequent_words = pd.Series(' '.join(dataset['replaced_sentence']).split()).value_counts()[:10]
words_to_remove = ['s', 'nt']
dataset['replaced_sentence'] = dataset['replaced_sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in words_to_remove))


##################### Remove rare words ##################### 
rare = pd.Series(' '.join(dataset['replaced_sentence']).split()).value_counts()[-10:]
rare = list(rare.index)
dataset['replaced_sentence'] = dataset['replaced_sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))


####################### Remove Digits #######################
dataset['replaced_sentence'] = dataset['replaced_sentence'].str.replace('\d+', '')


############ Create new column with tokenized words ############
tokenized = [nltk.word_tokenize(sent) for sent in dataset['replaced_sentence']]
dataset['tokenized'] = tokenized


####################### Lemmatize Words #######################
lemm = WordNetLemmatizer()
lemm_each_row = []
for index, row in dataset.iterrows():
    lemmatized_output = ' '.join([lemm.lemmatize(w) for w in row['tokenized']])
    lemmatized_output = lemmatized_output.split(' ')
    lemm_each_row.append(lemmatized_output)
dataset['tokenized'] = lemm_each_row


In [3]:
############# Load vectors directly from the file #############
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

vectors_array = []

####### Get the word embeddings for each token in the model, store in column #######
for index, row in dataset.iterrows():
    each_sentence = []
    for x in row['tokenized']:
        if x in model:
            vectors = model[x]
            each_sentence.append(vectors)
    vectors_array.append(each_sentence)

dataset['word embeddings'] = vectors_array

## Written by Kat, Varsha, and Humera

### Calculate Sentence Embeddings - by Kat

Sentence embeddings here are in the form of a one 300-dimension vector that represents the whole sentence. 

In [4]:
import numpy as geek

## This array holds all final sentence embeddings
sentence_embeddings_array = []

word_embedding_dim = len(dataset['word embeddings'][0][0])

for index, row in dataset.iterrows():
    base = np.zeros(word_embedding_dim)
    count = 0
    for x in row['word embeddings']:
        base = geek.add(base, x)
        count += 1
    for i in range(0, len(base)):
        base[i] = base[i] / count
    sentence_embeddings_array.append(base)
    
dataset['sentence embeddings'] = sentence_embeddings_array

dataset.head()

## Written by Kat

Unnamed: 0,original,edit,meanGrade,replaced_sentence,tokenized,word embeddings,sentence embeddings
0,France is ‘ hunting down its citizens who join...,twins,0.2,france hunting citizens joined twins without t...,"[france, hunting, citizen, joined, twin, witho...","[[-0.20605469, -0.16699219, 0.19238281, 0.2490...","[-0.01800537109375, 0.071014404296875, -0.0436..."
1,"Pentagon claims 2,000 % increase in Russian tr...",bowling,1.6,pentagon claims increase russian trolls bowli...,"[pentagon, claim, increase, russian, troll, bo...","[[-0.15722656, 0.095214844, 0.203125, 0.242187...","[-0.081390380859375, -0.002872467041015625, 0...."
2,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,1.0,iceland pm calls snap vote pedophile furor cra...,"[iceland, pm, call, snap, vote, pedophile, fur...","[[0.06298828, -0.012268066, 0.060302734, 0.185...","[0.04075792100694445, -0.04292127821180555, -0..."
3,"In an apparent first , Iran and Israel <engage...",slap,0.4,apparent first iran israel slap militarily,"[apparent, first, iran, israel, slap, militarily]","[[0.079589844, 0.10498047, -0.33398438, 0.1894...","[0.01416015625, -0.026163736979166668, 0.00514..."
5,All 22 <promises/> Trump made in his speech to...,sounds,1.2,sounds trump made speech congress one chart,"[sound, trump, made, speech, congress, one, ch...","[[-0.02746582, -0.0065612793, -0.122558594, -0...","[0.004237583705357143, -0.013179234095982142, ..."


### Split into X_train and y_train - by Kat

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset['sentence embeddings'],
                                                    dataset['meanGrade'],
                                                    test_size=1/4, random_state=0)

X_train.head()

9092    [0.107666015625, 0.0622711181640625, 0.1437683...
2893    [-0.023319244384765625, 0.03894233703613281, 0...
2787    [-0.013985373757102272, 0.08225319602272728, 0...
8002    [0.08993094308035714, 0.10745675223214286, 0.1...
5057    [-0.05480194091796875, -0.02362060546875, -0.0...
Name: sentence embeddings, dtype: object

# Models

## LSTM Model - by Kat

In [6]:
from keras.preprocessing.sequence import pad_sequences

max_len = 300

X_train = pad_sequences(X_train,maxlen=max_len,padding='post' )
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')

Using TensorFlow backend.


In [7]:
from keras import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout

model = Sequential()

#input dimension is length of word embedding
#output dimension is how many nodes on output layer
model.add(Embedding(input_dim=300, output_dim=1, input_length=max_len))

model.add(LSTM(units=300, dropout=0.3, recurrent_dropout=0.2))

model.add(Dense(1, activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [8]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 1)            300       
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               362400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 363,001
Trainable params: 363,001
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.compile(optimizer='adam', loss='mse', metrics=['accuracy', 'mse'])

In [10]:
result= model.fit(X_train, y_train, batch_size=32, 
                  epochs=1, validation_data=(X_test, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 6846 samples, validate on 2283 samples
Epoch 1/1


In [15]:
import math
LSTM_RMSE = math.sqrt(result.history['mean_squared_error'][0])
print("LSTM RMSE base: {}".format(LSTM_RMSE))

LSTM RMSE base: 0.5576421632088139


## Bidirectional LSTM - by Kat and Humera

In [17]:
from keras import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional

biLSTM_model = Sequential()

biLSTM_model.add(Embedding(input_dim=300, output_dim=1, input_length=max_len))

biLSTM_model.add(Bidirectional(LSTM(units=300, dropout=0.3, recurrent_dropout=0.2)))

biLSTM_model.add(Dense(1, activation='sigmoid'))


In [18]:
biLSTM_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 300, 1)            300       
_________________________________________________________________
bidirectional_2 (Bidirection (None, 600)               724800    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 601       
Total params: 725,701
Trainable params: 725,701
Non-trainable params: 0
_________________________________________________________________


In [19]:
biLSTM_model.compile(optimizer='adam', loss='mse', metrics=['accuracy', 'mse'])

biLSTM_result = biLSTM_model.fit(X_train, y_train, batch_size=32, 
                  epochs=1, validation_data=(X_test, y_test))

Train on 6846 samples, validate on 2283 samples
Epoch 1/1


In [20]:
biLSTM_RMSE = math.sqrt(biLSTM_result.history['mean_squared_error'][0])
print("biLSTM RMSE base: {}".format(biLSTM_RMSE))

biLSTM RMSE base: 0.5575103067837445


## biLSTM - more layers - by Kat and Humera

In [21]:
from keras import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional

biLSTM_model_b = Sequential()
biLSTM_model_b.add(Embedding(input_dim=300, output_dim=1, input_length=max_len))
#input dimension is length of word embedding
#output dimension is how many nodes on output layer
biLSTM_model_b.add(Bidirectional(LSTM(units=300, dropout=0.3, recurrent_dropout=0.2)))

biLSTM_model_b.add(Dense(64, activation='sigmoid'))

biLSTM_model_b.add(Dense(1, activation='sigmoid'))

In [22]:
biLSTM_model_b.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 300, 1)            300       
_________________________________________________________________
bidirectional_3 (Bidirection (None, 600)               724800    
_________________________________________________________________
dense_4 (Dense)              (None, 64)                38464     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 763,629
Trainable params: 763,629
Non-trainable params: 0
_________________________________________________________________


In [23]:
biLSTM_model_b.compile(optimizer='adam', loss='mse', metrics=['accuracy', 'mse'])

biLSTM_result_b = biLSTM_model_b.fit(X_train, y_train, batch_size=32, 
                  epochs=1, validation_data=(X_test, y_test))

Train on 6846 samples, validate on 2283 samples
Epoch 1/1


In [24]:
biLSTM_RMSE_b = math.sqrt(biLSTM_result_b.history['mean_squared_error'][0])
print("biLSTM RMSE (additional layer): {}".format(biLSTM_RMSE_b))

biLSTM RMSE (additional layer): 0.5626690928067789


## biLSTM - RELU - by Kat, Varsha, Humera

In [25]:
from keras import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional

biLSTM_model_c = Sequential()

biLSTM_model_c.add(Embedding(input_dim=300, output_dim=1, input_length=max_len))

biLSTM_model_c.add(Bidirectional(LSTM(units=300, dropout=0.3, recurrent_dropout=0.2)))

biLSTM_model_c.add(Dense(1, activation='relu'))

In [26]:
biLSTM_model_c.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 300, 1)            300       
_________________________________________________________________
bidirectional_4 (Bidirection (None, 600)               724800    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 601       
Total params: 725,701
Trainable params: 725,701
Non-trainable params: 0
_________________________________________________________________


In [27]:
biLSTM_model_c.compile(optimizer='adam', loss='mse', metrics=['accuracy', 'mse'])

biLSTM_result_c = biLSTM_model_c.fit(X_train, y_train, batch_size=32, 
                  epochs=1, validation_data=(X_test, y_test))

Train on 6846 samples, validate on 2283 samples
Epoch 1/1


In [28]:
biLSTM_RMSE_c = math.sqrt(biLSTM_result_c.history['mean_squared_error'][0])
print("biLSTM RMSE (relu): {}".format(biLSTM_RMSE_c))

biLSTM RMSE (relu): 0.5817875898138096


## Linear Regression - by Kat

In [29]:
# Fitting Linear Regression to the dataset 
from sklearn.linear_model import LinearRegression 
lin = LinearRegression() 
  
lin.fit(X_train, y_train) 
Y_pred = lin.predict(X_test)  # make predictions

In [30]:
from sklearn.metrics import mean_squared_error

lin_reg_MSE = mean_squared_error(y_test, Y_pred)

In [31]:
lin_reg_RMSE = math.sqrt(lin_reg_MSE)
print("Linear Regression RMSE: {}".format(lin_reg_RMSE))

Linear Regression RMSE: 0.5571047393476176


## Comparison of Methods - by Kat

In [32]:
print("Linear Regression RMSE: {}".format(lin_reg_RMSE))
print("biLSTM RMSE (relu): {}".format(biLSTM_RMSE_c))
print("biLSTM RMSE (additional layer): {}".format(biLSTM_RMSE_b))
print("biLSTM RMSE base: {}".format(biLSTM_RMSE))
print("LSTM RMSE base: {}".format(LSTM_RMSE))

Linear Regression RMSE: 0.5571047393476176
biLSTM RMSE (relu): 0.5817875898138096
biLSTM RMSE (additional layer): 0.5626690928067789
biLSTM RMSE base: 0.5575103067837445
LSTM RMSE base: 0.5576421632088139


# Chosen Model - biLSTM - by Kat

We chose biLSTM to be our best model since its results are second best (by a small margin) to Linear Regression RMSE. While the results for the Linear Regression are slightly better, we believe there is more potential for hyperparameter tuning (past what we have done already), which is why we explore the biLSTM model more. 

Now we run on 5 epochs to improve results with some improved hyperparamters we found. Running on one epoch doesn't allow the model to reach toward its potential, but was what we viewed as most efficient for comparing initial results. The main hyper parameters we changed from our base biLSTM model were batch size (from 32 to 16) and epochs (for the reason mentioned above). Other hyperparameters were tested out on various machines (which is why they aren't run in this jupyter notebook), such as changing the optimizer (i.e. from adam optimizer to SGD), having a higher batch size (which gave higher RMSE), changing activation function (sigmoid vs tanh vs relu), and adding another layer. For the most part, our base model performed better than new models with the edits. 

In [35]:
from keras import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional

biLSTM_model_final = Sequential()

biLSTM_model_final.add(Embedding(input_dim=300, output_dim=1, input_length=max_len))

biLSTM_model_final.add(Bidirectional(LSTM(units=300, dropout=0.3, recurrent_dropout=0.2)))

biLSTM_model_final.add(Dense(1, activation='sigmoid'))

biLSTM_model_final.compile(optimizer='adam', loss='mse', metrics=['accuracy', 'mse'])

biLSTM_model_final.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 300, 1)            300       
_________________________________________________________________
bidirectional_6 (Bidirection (None, 600)               724800    
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 601       
Total params: 725,701
Trainable params: 725,701
Non-trainable params: 0
_________________________________________________________________


In [36]:
biLSTM_result_final = biLSTM_model_final.fit(X_train, y_train, batch_size=16, 
                  epochs=5, validation_data=(X_test, y_test))

Train on 6846 samples, validate on 2283 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Below are the scores from each epoch. Running each additional epoch does improve results, but only by an extremely small margin, which makes the additional resources and time past the second epoch not very beneficial while being very costly. Two epochs appears to be a good number for our purposes. 

In [37]:
print(biLSTM_result_final.history['mean_squared_error'])

[0.3100949605149273, 0.30615509604192287, 0.3061550952888113, 0.3061550942353258, 0.3061550947664218]


#### Cross Validation - biLSTM (2 epochs) - by Kat

In [38]:
data = {'X': dataset["sentence embeddings"], 'Y': dataset["meanGrade"]}

df_test = pd.DataFrame(data) 

In [46]:
## split data into 10 parts
## run for loop for each part

def biLSTM_ten_fold_cross_val(data):
    val_scores = []                            # array for validation scores
    
    shuffled = data.sample(frac=1)             # shuffle the data (to keep randomness)
    the_split = np.array_split(data, 10)       # split into ten parts. access each part through the_split[x]
    
    for i in range(0,10):
        print("biLSTM Validation {} of 10".format(i + 1))
        validation_set = the_split[i]
        train_frames = []
        for p in range(0,10):
            if p != i:
                train_frames.append(the_split[p])

        train_set = pd.concat(train_frames)    # this is the training set (with x and y) for this round.
        
        X_train = train_set["X"]
        Y_train = train_set["Y"]
        X_test = validation_set["X"]
        Y_test = validation_set["Y"]
        
        X_train = pad_sequences(X_train,maxlen=max_len,padding='post' )
        X_test = pad_sequences(X_test, maxlen=max_len, padding='post')
        
        model = Sequential()
        model.add(Embedding(input_dim=300, output_dim=1, input_length=max_len))
        model.add(Bidirectional(LSTM(units=300, dropout=0.3, recurrent_dropout=0.2)))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer='adam', loss='mse', metrics=['accuracy', 'mse'])

        result = model.fit(X_train, Y_train, batch_size=16, 
                      epochs=2, validation_data=(X_test, Y_test))
        
        val_scores.append(result.history['mean_squared_error'])
        
    return val_scores

In [47]:
biLSTM_cross_val = biLSTM_ten_fold_cross_val(df_test)

biLSTM Validation 1 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 2 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 3 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 4 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 5 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 6 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 7 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 8 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 9 of 10
Train on 8216 samples, validate on 913 samples
Epoch 1/2
Epoch 2/2
biLSTM Validation 10 of 10
Train on 8217 samples, validate on 912 samples
Epoch 1/2
Epoch 2/2


In [59]:
def printRMSE(mse):
    for i in range(0, len(mse)):
        print("Validation {}: {}, {}".format(i + 1, math.sqrt(mse[i][0]), math.sqrt(mse[i][1])))
            
            
printRMSE(biLSTM_cross_val)

Validation 1: 0.5573987624169929, 0.5557729861873885
Validation 2: 0.5547900375522811, 0.5531956939460426
Validation 3: 0.5581081808398746, 0.5547156774391875
Validation 4: 0.5569560534305721, 0.5550539641347731
Validation 5: 0.5573316405305709, 0.5544757990175536
Validation 6: 0.5575315204725305, 0.5551254392946691
Validation 7: 0.5574718322690859, 0.5544982908250331
Validation 8: 0.5555450724982951, 0.5529569491632386
Validation 9: 0.5547703419501918, 0.5526916471404962
Validation 10: 0.5564491083948483, 0.5543730390260023
