In [14]:
import pandas as pd
import numpy as np
from nltk import tokenize
from nltk.corpus import stopwords
import pickle
from scipy.sparse import hstack
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

### merge data

In [2]:
train_bodies = pd.read_csv('train_bodies.csv')
train = pd.read_csv('train_stances.csv')

df_train = train.merge(train_bodies, on=['Body ID'], how='left')
df_train.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\r\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


In [3]:
test_bodies = pd.read_csv('competition_test_bodies.csv')
test = pd.read_csv('competition_test_stances.csv')

df_test = test.merge(test_bodies, on=['Body ID'], how='left')
df_test.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated,A RESPECTED senior French police officer inves...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated,Dave Morin's social networking company Path is...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated,A bereaved Afghan mother took revenge on the T...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated,Hewlett-Packard is officially splitting in two...
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated,An airline passenger headed to Dallas was remo...


### split data

In [4]:
train_ids = [int(v) for v in open('training_ids.txt').read().strip().split('\n')]
val_ids = [int(v) for v in open('hold_out_ids.txt').read().strip().split('\n')]
        
df_train_a = df_train[df_train['Body ID'].isin(train_ids)]
df_train_b = df_train[df_train['Body ID'].isin(val_ids)]
print(df_train_a.shape)
print(df_train_b.shape)
print(df_train.shape)

(40350, 4)
(9622, 4)
(49972, 4)


### train a tfidf and serialize it

In [5]:
tfidf = TfidfVectorizer(tokenizer=tokenize.word_tokenize, max_df=0.8, min_df=5, max_features=10000, sublinear_tf=True, ngram_range=(1,3))
tfidf.fit(df_train['Headline'].tolist() + df_train['articleBody'].tolist() + df_test['Headline'].tolist() + df_test['articleBody'].tolist())
joblib.dump(tfidf, 'tfidf_gram_1_3.pkl') 

['tfidf_gram_1_3.pkl']

In [6]:
tfidf = joblib.load('tfidf_gram_1_3.pkl')
train_headline = tfidf.transform(df_train_a['Headline'].tolist())
train_body = tfidf.transform(df_train_a['articleBody'].tolist())

val_headline = tfidf.transform(df_train_b['Headline'].tolist())
val_body = tfidf.transform(df_train_b['articleBody'].tolist())

test_headline = tfidf.transform(df_test['Headline'].tolist())
test_body = tfidf.transform(df_test['articleBody'].tolist())

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


### map y from string to number

In [7]:
label_2_id = {'agree': 0, 'disagree': 1, 'discuss':2, 'unrelated': 3}
train_stance = df_train_a['Stance'].map(lambda x: label_2_id[x]).tolist()
val_stance = df_train_b['Stance'].map(lambda x: label_2_id[x]).tolist()
test_stance = df_test['Stance'].map(lambda x: label_2_id[x]).tolist()

### calcuate tfidf similarity

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
def get_sim_score(a, b):
    assert a.shape == b.shape
    res = []
    for row in range(a.shape[0]):
        cos = cosine_similarity(a[row], b[row])[0, 0]
        res.append(cos)
    return np.array(res).reshape(-1, 1)

train_sim = get_sim_score(train_headline, train_body)
val_sim = get_sim_score(val_headline, val_body)
test_sim = get_sim_score(test_headline, test_body)

In [9]:
X_train = hstack((train_headline, train_body, train_sim))
y_train = np.array(train_stance)

X_val = hstack((val_headline, val_body, val_sim))
y_val = np.array(val_stance)

X_test = hstack((test_headline, test_body, test_sim))
y_test = np.array(test_stance)

### evaluate on the val set

In [21]:
clfs = [LinearSVC(), LogisticRegression(), MultinomialNB(), MLPClassifier(hidden_layer_sizes=(100, 4), early_stopping=True, max_iter=10)]

for clf in clfs:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    
    print('-' * 50)
    print(clf)
    print('classification report:')
    print(classification_report(y_val, y_pred))
    print('accuracy:', accuracy_score(y_val, y_pred))
    print('macro f1:', f1_score(y_val, y_pred, average='macro'))

--------------------------------------------------
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
classification report:
             precision    recall  f1-score   support

          0       0.73      0.58      0.65       762
          1       0.59      0.31      0.41       162
          2       0.80      0.84      0.82      1800
          3       0.97      0.99      0.98      6898

avg / total       0.91      0.92      0.91      9622

accuracy: 0.9163375597588859
macro f1: 0.7146122848263636
--------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
classification r



--------------------------------------------------
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=10, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
classification report:
             precision    recall  f1-score   support

          0       0.67      0.59      0.63       762
          1       0.00      0.00      0.00       162
          2       0.76      0.91      0.83      1800
          3       0.98      0.97      0.98      6898

avg / total       0.90      0.91      0.90      9622

accuracy: 0.9112450633963833
macro f1: 0.6080725937389633


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### evaluate on the test set

In [22]:
for clf in clfs:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print('-' * 50)
    print(clf)
    print('classification report: ')
    print(classification_report(y_test, y_pred))
    print('accuracy: %.6f' % accuracy_score(y_test, y_pred))
    print('macro fscore: %.6f' % f1_score(y_test, y_pred, average='macro'))

--------------------------------------------------
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
classification report: 
             precision    recall  f1-score   support

          0       0.49      0.46      0.47      1903
          1       0.43      0.00      0.01       697
          2       0.74      0.73      0.73      4464
          3       0.94      0.99      0.96     18349

avg / total       0.86      0.88      0.86     25413

accuracy: 0.875221
macro fscore: 0.544167
--------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
classification report: 
       

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


--------------------------------------------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
classification report: 
             precision    recall  f1-score   support

          0       0.12      0.56      0.20      1903
          1       0.06      0.10      0.07       697
          2       0.29      0.31      0.30      4464
          3       0.80      0.45      0.57     18349

avg / total       0.64      0.42      0.48     25413

accuracy: 0.421949
macro fscore: 0.285055




--------------------------------------------------
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=10, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
classification report: 
             precision    recall  f1-score   support

          0       0.44      0.45      0.45      1903
          1       0.00      0.00      0.00       697
          2       0.62      0.70      0.66      4464
          3       0.93      0.93      0.93     18349

avg / total       0.81      0.83      0.82     25413

accuracy: 0.829812
macro fscore: 0.508533


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
