In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tqdm import tqdm_notebook as tqdm
from nltk import tokenize
from keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')
from utils import *
from sklearn.neural_network import MLPClassifier

### Merge stances with article bodies

In [2]:
# Raw inputs: one stance file (headline, Body ID, label) and one bodies file
# (Body ID, article text) for each of the training and competition-test sets.
train_bodies = pd.read_csv('./data/train_bodies.csv')
train = pd.read_csv('./data/train_stances.csv')
test_bodies = pd.read_csv('./data/competition_test_bodies.csv')
test = pd.read_csv('./data/competition_test_stances.csv')

# Left joins keep every stance row and attach the article text that shares
# its 'Body ID'.
df_train = pd.merge(train, train_bodies, how='left', on=['Body ID'])
df_test = pd.merge(test, test_bodies, how='left', on=['Body ID'])
df_test.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated,A RESPECTED senior French police officer inves...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated,Dave Morin's social networking company Path is...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated,A bereaved Afghan mother took revenge on the T...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated,Hewlett-Packard is officially splitting in two...
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated,An airline passenger headed to Dallas was remo...


In [3]:
def _read_body_ids(path):
    """Return the integer ids stored one per line in the text file at `path`.

    Blank lines are skipped, so an empty file gives an empty list instead of
    failing on int('').
    """
    # The context manager closes the handle deterministically; a bare
    # open(path).read() leaves that to the garbage collector.
    with open(path) as id_file:
        return [int(line) for line in id_file if line.strip()]


# Body IDs assigned to the training and hold-out (validation) partitions.
train_ids = _read_body_ids('./data/training_ids.txt')
val_ids = _read_body_ids('./data/hold_out_ids.txt')

# Split the merged training frame by Body ID membership.
df_train_a = df_train[df_train['Body ID'].isin(train_ids)]
df_train_b = df_train[df_train['Body ID'].isin(val_ids)]
print(df_train_a.shape)
print(df_train_b.shape)
print(df_train.shape)

(40350, 4)
(9622, 4)
(49972, 4)


In [4]:
# load_word2vec comes from utils and returns two values; word2vec is later
# used as a token -> vector lookup (`w in word2vec`, `word2vec[w]`).
# NOTE(review): the '100d' in the file name suggests 100-dimensional vectors - confirm.
glove_path = './data/glove.6B.100d.txt'
word2vec, vecs = load_word2vec(glove_path)

In [5]:
def get_word2vec_mean(sent, word2vec, dim=100):
    """Embed a text as the mean of the vectors of its known tokens.

    Parameters
    ----------
    sent : str
        Raw text (headline or article body).
    word2vec : mapping
        Lookup supporting `token in word2vec` and `word2vec[token]`.
    dim : int, optional
        Length of the zero vector returned when no token is found in
        `word2vec`. Must match the embedding size; defaults to 100, the
        value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the matched word vectors, or `dim` float
        zeros when nothing matched.
    """
    tokens = [w.strip().lower() for w in tokenize.word_tokenize(sent)]
    # get_head_and_tail is provided by utils.
    # NOTE(review): presumably keeps only leading/trailing tokens - confirm.
    tokens = get_head_and_tail(tokens)
    known = [word2vec[w] for w in tokens if w in word2vec]
    if not known:
        # Float zeros of the configured size keep every feature row the same
        # length and dtype family as the averaged vectors.
        return np.zeros(dim)
    return np.mean(np.array(known), axis=0)

In [24]:
def _embed_column(frame, column):
    """Return one get_word2vec_mean vector per row of frame[column]."""
    return [get_word2vec_mean(text, word2vec) for text in frame[column].tolist()]


# Mean-embedding features for headline and body text of each split.
train_headline_vec = _embed_column(df_train_a, 'Headline')
train_body_vec = _embed_column(df_train_a, 'articleBody')

val_headline_vec = _embed_column(df_train_b, 'Headline')
val_body_vec = _embed_column(df_train_b, 'articleBody')

test_headline_vec = _embed_column(df_test, 'Headline')
test_body_vec = _embed_column(df_test, 'articleBody')

### Map the stance labels (y) from strings to integer ids

In [9]:
# Integer id assigned to each stance label.
label_2_id = {
    'agree': 0,
    'disagree': 1,
    'discuss': 2,
    'unrelated': 3,
}

# Direct dict indexing raises KeyError on any label missing from the table.
train_stance = [label_2_id[stance] for stance in df_train_a['Stance']]
val_stance = [label_2_id[stance] for stance in df_train_b['Stance']]
test_stance = [label_2_id[stance] for stance in df_test['Stance']]

In [26]:
def _side_by_side(headline_vecs, body_vecs):
    """Join headline and body feature rows column-wise into one 2-D array."""
    return np.concatenate([np.array(headline_vecs), np.array(body_vecs)], axis=1)


# Features are [headline embedding | body embedding]; targets are the integer stance ids.
X_train = _side_by_side(train_headline_vec, train_body_vec)
X_val = _side_by_side(val_headline_vec, val_body_vec)
X_test = _side_by_side(test_headline_vec, test_body_vec)

y_train = np.array(train_stance)
y_val = np.array(val_stance)
y_test = np.array(test_stance)
X_train.shape

(40350, 200)

In [27]:
# Two hidden layers (100 and 4 units), capped at 30 iterations, with a fixed
# seed; fit() returns the estimator itself, so it can be chained.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 4),
    max_iter=30,
    random_state=42,
    verbose=True,
).fit(X_train, y_train)

# Evaluate on the hold-out (validation) split.
y_pred = mlp.predict(X_val)

print(classification_report(y_val, y_pred))
val_accuracy = accuracy_score(y_val, y_pred)
print('accuracy: %.3f' % val_accuracy)
val_macro_f1 = f1_score(y_val, y_pred, average='macro')
print('macro f1: %.3f' % val_macro_f1)
# get_score (from utils) is reported relative to its value for perfect predictions.
val_points = get_score(y_val, y_pred)
val_points_max = get_score(y_val, y_val)
print('score: %.3f' % (val_points / val_points_max))

Iteration 1, loss = 0.89974985
Iteration 2, loss = 0.65912192
Iteration 3, loss = 0.54831656
Iteration 4, loss = 0.47778836
Iteration 5, loss = 0.43165866
Iteration 6, loss = 0.39681980
Iteration 7, loss = 0.36789052
Iteration 8, loss = 0.35103896
Iteration 9, loss = 0.33033510
Iteration 10, loss = 0.31437489
Iteration 11, loss = 0.30085063
Iteration 12, loss = 0.28764652
Iteration 13, loss = 0.27635357
Iteration 14, loss = 0.26687505
Iteration 15, loss = 0.25787818
Iteration 16, loss = 0.24887153
Iteration 17, loss = 0.24186295
Iteration 18, loss = 0.23452927
Iteration 19, loss = 0.22461413
Iteration 20, loss = 0.21855029
Iteration 21, loss = 0.21260348
Iteration 22, loss = 0.20686145
Iteration 23, loss = 0.20789744
Iteration 24, loss = 0.19760742
Iteration 25, loss = 0.19143424
Iteration 26, loss = 0.18763388
Iteration 27, loss = 0.18550603
Iteration 28, loss = 0.17753694
Iteration 29, loss = 0.17316150
Iteration 30, loss = 0.17095040
             precision    recall  f1-score   supp

In [28]:
# Evaluate the fitted model on the competition test split.
y_pred = mlp.predict(X_test)

print(classification_report(y_test, y_pred))
test_accuracy = accuracy_score(y_test, y_pred)
print('accuracy: %.3f' % test_accuracy)
test_macro_f1 = f1_score(y_test, y_pred, average='macro')
print('macro fscore: %.3f' % test_macro_f1)
# get_score (from utils) is reported relative to its value for perfect predictions.
test_points = get_score(y_test, y_pred)
test_points_max = get_score(y_test, y_test)
print('score: %.3f' % (test_points / test_points_max))

             precision    recall  f1-score   support

          0       0.38      0.42      0.40      1903
          1       0.17      0.02      0.03       697
          2       0.55      0.56      0.56      4464
          3       0.87      0.89      0.88     18349

avg / total       0.76      0.77      0.77     25413

accuracy: 0.774
macro fscore: 0.468
score: 0.666
