In [1]:
import pandas as pd
import numpy as np
from nltk import tokenize
from nltk.corpus import stopwords
import pickle
from scipy.sparse import hstack
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib

### merge data

In [2]:
train_bodies = pd.read_csv('train_bodies.csv')
train = pd.read_csv('train_stances.csv')

df_train = train.merge(train_bodies, on=['Body ID'], how='left')
df_train.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\r\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


In [3]:
test_bodies = pd.read_csv('competition_test_bodies.csv')
test = pd.read_csv('competition_test_stances.csv')

df_test = test.merge(test_bodies, on=['Body ID'], how='left')
df_test.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated,A RESPECTED senior French police officer inves...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated,Dave Morin's social networking company Path is...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated,A bereaved Afghan mother took revenge on the T...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated,Hewlett-Packard is officially splitting in two...
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated,An airline passenger headed to Dallas was remo...


### a better way to split data into train and val (based on Body ID)

In [4]:
# train_val_split = np.random.rand(max(df_train['Body ID'].tolist()) + 1)
# ratio = 0.3
# train_ids = []
# val_ids = []
# for idx, val in enumerate(train_val_split):
#     if val < ratio:
#         val_ids.append(idx)
#     else:
#         train_ids.append(idx)
train_ids = [int(v) for v in open('training_ids.txt').read().strip().split('\n')]
val_ids = [int(v) for v in open('hold_out_ids.txt').read().strip().split('\n')]
        
df_train_a = df_train[df_train['Body ID'].isin(train_ids)]
df_train_b = df_train[df_train['Body ID'].isin(val_ids)]
print(df_train_a.shape)
print(df_train_b.shape)
print(df_train.shape)

(40350, 4)
(9622, 4)
(49972, 4)


### train a tfidf and serialize it

In [7]:
tfidf = TfidfVectorizer(tokenizer=tokenize.word_tokenize, max_df=0.7, min_df=5, sublinear_tf=True)
#tfidf = TfidfVectorizer(tokenizer=tokenize.word_tokenize, max_df=0.8, min_df=5, max_features=10000, sublinear_tf=True, ngram_range=(1,3))
tfidf.fit(df_train['Headline'].tolist() + df_train['articleBody'].tolist() + df_test['Headline'].tolist() + df_test['articleBody'].tolist())
joblib.dump(tfidf, 'tfidf.pkl') 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function word_tokenize at 0x000001DB42A8F488>,
        use_idf=True, vocabulary=None)

In [5]:
tfidf = joblib.load('tfidf.pkl')
train_headline = tfidf.transform(df_train_a['Headline'].tolist())
train_body = tfidf.transform(df_train_a['articleBody'].tolist())

val_headline = tfidf.transform(df_train_b['Headline'].tolist())
val_body = tfidf.transform(df_train_b['articleBody'].tolist())

test_headline = tfidf.transform(df_test['Headline'].tolist())
test_body = tfidf.transform(df_test['articleBody'].tolist())

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


### map y from string to number

In [6]:
label_2_id = {'agree': 0, 'disagree': 1, 'discuss':2, 'unrelated': 3}
train_stance = df_train_a['Stance'].map(lambda x: label_2_id[x]).tolist()
val_stance = df_train_b['Stance'].map(lambda x: label_2_id[x]).tolist()
test_stance = df_test['Stance'].map(lambda x: label_2_id[x]).tolist()

### train with the following features will generate a big gap between val score and test score

In [21]:
X_train = hstack((np.abs(train_headline - train_body), train_headline.multiply(train_body)))
y_train = np.array(train_stance)

X_val = hstack((np.abs(val_headline - val_body), val_headline.multiply(val_body)))
y_val = np.array(val_stance)

X_test = hstack((np.abs(test_headline - test_body), test_headline.multiply(test_body)))
y_test = np.array(test_stance)

### calcuate tfidf similarity

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
def get_sim_score(a, b):
    assert a.shape == b.shape
    res = []
    for row in range(a.shape[0]):
        cos = cosine_similarity(a[row], b[row])[0, 0]
        res.append(cos)
    return np.array(res).reshape(-1, 1)

train_sim = get_sim_score(train_headline, train_body)
val_sim = get_sim_score(val_headline, val_body)
test_sim = get_sim_score(test_headline, test_body)

In [12]:
train_sim

array([[0.31014083],
       [0.02982301],
       [0.01764494],
       ...,
       [0.00415531],
       [0.48229947],
       [0.00675724]])

In [13]:
X_train = hstack((train_headline, train_body, train_sim))
y_train = np.array(train_stance)

X_val = hstack((val_headline, val_body, val_sim))
y_val = np.array(val_stance)

X_test = hstack((test_headline, test_body, test_sim))
y_test = np.array(test_stance)

### evaluate on the val set

In [14]:
clf = LinearSVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print('classification report:')
print(classification_report(y_val, y_pred))
print('acc:', accuracy_score(y_val, y_pred))
print('macro f1:', f1_score(y_val, y_pred, average='macro'))

classification report:
             precision    recall  f1-score   support

          0       0.74      0.63      0.68       762
          1       0.65      0.36      0.47       162
          2       0.83      0.85      0.84      1800
          3       0.97      0.99      0.98      6898

avg / total       0.92      0.92      0.92      9622

acc: 0.9240282685512368
macro f1: 0.7402175501165529


### evaluate on the test set

In [15]:
final_clf = LinearSVC()
final_clf.fit(X_train, y_train)
y_pred = final_clf.predict(X_test)

print('classification report: ')
print(classification_report(y_test, y_pred))

print('-' * 50)
print('macro fscore: %.6f' % f1_score(y_test, y_pred, average='macro'))

print('-' * 50)
print('accuracy: %.6f' % accuracy_score(y_test, y_pred))

classification report: 
             precision    recall  f1-score   support

          0       0.54      0.51      0.53      1903
          1       0.50      0.01      0.01       697
          2       0.77      0.74      0.75      4464
          3       0.94      0.99      0.97     18349

avg / total       0.87      0.88      0.87     25413

--------------------------------------------------
macro fscore: 0.564919
--------------------------------------------------
accuracy: 0.884744
