In [1]:
import pandas as pd
import numpy as np
from nltk import tokenize
from nltk.corpus import stopwords
import pickle
from scipy.sparse import hstack
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from utils import * 
import warnings
warnings.filterwarnings('ignore')

### merge data

In [2]:
train_bodies = pd.read_csv('./data/train_bodies.csv')
train = pd.read_csv('./data/train_stances.csv')

df_train = train.merge(train_bodies, on=['Body ID'], how='left')
df_train.head()

test_bodies = pd.read_csv('./data/competition_test_bodies.csv')
test = pd.read_csv('./data/competition_test_stances.csv')

df_test = test.merge(test_bodies, on=['Body ID'], how='left')
df_test.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated,A RESPECTED senior French police officer inves...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated,Dave Morin's social networking company Path is...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated,A bereaved Afghan mother took revenge on the T...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated,Hewlett-Packard is officially splitting in two...
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated,An airline passenger headed to Dallas was remo...


### split data

In [3]:
train_ids = [int(v) for v in open('./data/training_ids.txt').read().strip().split('\n')]
val_ids = [int(v) for v in open('./data/hold_out_ids.txt').read().strip().split('\n')]
        
df_train_a = df_train[df_train['Body ID'].isin(train_ids)]
df_train_b = df_train[df_train['Body ID'].isin(val_ids)]
print(df_train_a.shape)
print(df_train_b.shape)
print(df_train.shape)

(40350, 4)
(9622, 4)
(49972, 4)


### train a tfidf and serialize it

In [None]:
tfidf = TfidfVectorizer(tokenizer=tokenize.word_tokenize, max_df=0.8, min_df=5, max_features=10000, sublinear_tf=True, ngram_range=(1,3))
tfidf.fit(df_train['Headline'].tolist() + df_train['articleBody'].tolist() + df_test['Headline'].tolist() + df_test['articleBody'].tolist())
joblib.dump(tfidf, './tmp/tfidf_gram_1_3.pkl') 

In [None]:
tfidf = joblib.load('./tmp/tfidf_gram_1_3.pkl')
train_headline = tfidf.transform(df_train_a['Headline'].tolist())
train_body = tfidf.transform(df_train_a['articleBody'].tolist())

val_headline = tfidf.transform(df_train_b['Headline'].tolist())
val_body = tfidf.transform(df_train_b['articleBody'].tolist())

test_headline = tfidf.transform(df_test['Headline'].tolist())
test_body = tfidf.transform(df_test['articleBody'].tolist())

data = {'train_headline': train_headline, 'train_body':train_body, 'val_headline': val_headline, 'val_body': val_body, 
       'test_headline': test_headline, 'test_body': test_body}
pickle.dump(data, open('./tmp/headline_body_tfidf.pkl', 'wb'))

In [5]:
data = pickle.load(open('./tmp/headline_body_tfidf.pkl', 'rb'))
train_headline = data['train_headline']
train_body = data['train_body']

val_headline = data['val_headline']
val_body = data['val_body']

test_headline = data['test_headline']
test_body = data['test_body']

### map y from string to number

In [6]:
label_2_id = {'agree': 0, 'disagree': 1, 'discuss':2, 'unrelated': 3}
train_stance = df_train_a['Stance'].map(lambda x: label_2_id[x]).tolist()
val_stance = df_train_b['Stance'].map(lambda x: label_2_id[x]).tolist()
test_stance = df_test['Stance'].map(lambda x: label_2_id[x]).tolist()

### calcuate tfidf similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def get_sim_score(a, b):
    assert a.shape == b.shape
    res = []
    for row in range(a.shape[0]):
        cos = cosine_similarity(a[row], b[row])[0, 0]
        res.append(cos)
    return np.array(res).reshape(-1, 1)

train_sim = get_sim_score(train_headline, train_body)
val_sim = get_sim_score(val_headline, val_body)
test_sim = get_sim_score(test_headline, test_body)

tfidf_sim = {'train_sim': train_sim, 'val_sim': val_sim, 'test_sim': test_sim}
pickle.dump(tfidf_sim, open('./tmp/tfidf_sim.pkl', 'wb'))

In [7]:
tfidf_sim = pickle.load(open('./tmp/tfidf_sim.pkl', 'rb'))
train_sim = tfidf_sim['train_sim']
val_sim = tfidf_sim['val_sim']
test_sim = tfidf_sim['test_sim']

### features and labels

In [8]:
X_train = hstack((train_headline, train_body, train_sim))
y_train = np.array(train_stance)

X_val = hstack((val_headline, val_body, val_sim))
y_val = np.array(val_stance)

X_test = hstack((test_headline, test_body, test_sim))
y_test = np.array(test_stance)

### evaluate on the val set

In [10]:
clfs = [LinearSVC(), LogisticRegression(), MultinomialNB(), MLPClassifier(hidden_layer_sizes=(100, 4), early_stopping=True, max_iter=10)]

for clf in clfs:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    
    print('-' * 50)
    print(clf)
    print('classification report:')
    print(classification_report(y_val, y_pred))
    print('accuracy: %.3f' % accuracy_score(y_val, y_pred))
    print('macro f1: %.3f' % f1_score(y_val, y_pred, average='macro'))
    print('score: %.3f' % (get_score(y_val, y_pred) / get_score(y_val, y_val)))

--------------------------------------------------
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
classification report:
             precision    recall  f1-score   support

          0       0.73      0.58      0.65       762
          1       0.59      0.31      0.41       162
          2       0.80      0.84      0.82      1800
          3       0.97      0.99      0.98      6898

avg / total       0.91      0.92      0.91      9622

accuracy: 0.916
macro f1: 0.715
score: 0.862
--------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
classification report:
      

### add RandomForest and LightGBM

In [14]:
clfs = [RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1, verbose=10, class_weight='balanced')]
clfs += [LGBMClassifier(n_estimators=100, learning_rate=0.1, silent=False, random_state=42)]

for clf in clfs:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    
    print('-' * 50)
    print(clf)
    print('classification report:')
    print(classification_report(y_val, y_pred))
    print('accuracy: %.3f' % accuracy_score(y_val, y_pred))
    print('macro f1: %.3f' % f1_score(y_val, y_pred, average='macro'))
    print('score: %.3f' % (get_score(y_val, y_pred) / get_score(y_val, y_val)))

building tree 1 of 50building tree 2 of 50building tree 3 of 50


building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   18.6s


building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   30.4s


building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   42.4s


building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   58.5s


building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min


building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min


building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.3s finished


--------------------------------------------------
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=42, verbose=10, warm_start=False)
classification report:
             precision    recall  f1-score   support

          0       0.53      0.41      0.46       762
          1       0.79      0.16      0.27       162
          2       0.72      0.77      0.74      1800
          3       0.92      0.95      0.93      6898

avg / total       0.85      0.86      0.85      9622

accuracy: 0.858
macro f1: 0.602
score: 0.780
--------------------------------------------------
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, m

### evaluate on the test set

In [12]:
clfs = [LinearSVC(), LogisticRegression(), MultinomialNB(), MLPClassifier(hidden_layer_sizes=(100, 4), early_stopping=True, max_iter=10)]
for clf in clfs:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print('-' * 50)
    print(clf)
    print('classification report: ')
    print(classification_report(y_test, y_pred))
    print('accuracy: %.3f' % accuracy_score(y_test, y_pred))
    print('macro fscore: %.3f' % f1_score(y_test, y_pred, average='macro'))
    print('score: %.3f' % (get_score(y_test, y_pred) / get_score(y_test, y_test)))  

--------------------------------------------------
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
classification report: 
             precision    recall  f1-score   support

          0       0.49      0.46      0.47      1903
          1       0.43      0.00      0.01       697
          2       0.74      0.73      0.73      4464
          3       0.94      0.99      0.96     18349

avg / total       0.86      0.88      0.86     25413

accuracy: 0.875
macro fscore: 0.544
score: 0.781
--------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
classification report: 


### add RandomForest and LightGBM

In [15]:
clfs = [RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1, verbose=10, class_weight='balanced')]
clfs += [LGBMClassifier(n_estimators=100, learning_rate=0.1, silent=False, random_state=42)]

for clf in clfs:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print('-' * 50)
    print(clf)
    print('classification report: ')
    print(classification_report(y_test, y_pred))
    print('accuracy: %.3f' % accuracy_score(y_test, y_pred))
    print('macro fscore: %.3f' % f1_score(y_test, y_pred, average='macro'))
    print('score: %.3f' % (get_score(y_test, y_pred) / get_score(y_test, y_test)))

building tree 1 of 50building tree 3 of 50building tree 2 of 50building tree 4 of 50



building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   15.4s


building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   25.5s


building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   36.2s


building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   51.7s


building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min


building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min


building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.8min finished
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.0s finished


--------------------------------------------------
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=42, verbose=10, warm_start=False)
classification report: 
             precision    recall  f1-score   support

          0       0.50      0.27      0.35      1903
          1       0.00      0.00      0.00       697
          2       0.69      0.50      0.58      4464
          3       0.85      0.98      0.91     18349

avg / total       0.77      0.81      0.78     25413

accuracy: 0.813
macro fscore: 0.461
score: 0.644
--------------------------------------------------
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=