In [1]:
import pandas as pd
import numpy as np
from nltk import tokenize
from nltk.corpus import stopwords
import pickle
from scipy.sparse import hstack
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC

### merge to generate data for training

In [2]:
train_bodies = pd.read_csv('train_bodies.csv')
train = pd.read_csv('train_stances.csv')

df_train = train.merge(train_bodies, on=['Body ID'], how='left')
df_train.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\r\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


In [3]:
test_bodies = pd.read_csv('competition_test_bodies.csv')
test = pd.read_csv('competition_test_stances.csv')

df_test = test.merge(test_bodies, on=['Body ID'], how='left')
df_test.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated,A RESPECTED senior French police officer inves...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated,Dave Morin's social networking company Path is...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated,A bereaved Afghan mother took revenge on the T...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated,Hewlett-Packard is officially splitting in two...
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated,An airline passenger headed to Dallas was remo...


### tfidf 

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(tokenizer=tokenize.word_tokenize, max_df=0.7, min_df=5, sublinear_tf=True)
#tfidf = TfidfVectorizer(tokenizer=tokenize.word_tokenize, max_df=0.8, min_df=5, max_features=10000, sublinear_tf=True, ngram_range=(1,3))
tfidf.fit(df_train['Headline'].tolist() + df_train['articleBody'].tolist() + df_test['Headline'].tolist() + df_test['articleBody'].tolist())

train_headline = tfidf.transform(df_train['Headline'].tolist())
train_body = tfidf.transform(df_train['articleBody'].tolist())

test_headline = tfidf.transform(df_test['Headline'].tolist())
test_body = tfidf.transform(df_test['articleBody'].tolist())

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


### save for future use

In [5]:
pickle.dump({'train_body': train_body, 'train_headline': train_headline, 'test_body': test_body, 'test_headline': test_headline}, 
            open('tfidf.pkl', 'wb'))

In [4]:
data = pickle.load(open('tfidf.pkl', 'rb'))
train_body = data['train_body']
train_headline = data['train_headline']
test_body = data['test_body']
test_headline = data['test_headline']

In [18]:
label_2_id = {'agree': 0, 'disagree': 1, 'discuss':2, 'unrelated': 3}
train_stance = df_train['Stance'].map(lambda x: label_2_id[x]).tolist()
test_stance = df_test['Stance'].map(lambda x: label_2_id[x]).tolist()

### generate features

#### simply concat tfidf vectors

In [None]:
X = hstack((headline, body))
y = np.array(stance)

#### the difference of two tfidf vectors

In [41]:
X = np.abs(headline - body)
y = np.array(stance)

In [19]:
X_train = hstack((np.abs(train_headline - train_body), train_headline.multiply(train_body)))
y_train = np.array(train_stance)

X_test = hstack((np.abs(test_headline - test_body), test_headline.multiply(test_body)))
y_test = np.array(test_stance)

### 10 fold cv

In [11]:
def my_scoring(estimator, X_test, y_true):
    y_pred = estimator.predict(X_test)
    assert len(y_true) == len(y_pred)
    num = len(y_true)
    score = 0
    for idx in range(num):
        true = y_true[idx]
        pred = y_pred[idx]
        if (true == 3) and (pred == 3):
            score += 1
        if (true in [1, 2, 0]) and (pred in [1, 2, 0]):
            score += 0.25
        if (true in [1, 2, 0]) and (pred in [1, 2, 0]) and (true == pred):
            score += 0.75
    return score/num

X_train_d = X_train.todense()
skf = StratifiedKFold(n_splits=10)
for train_index, val_index in skf.split(X_train_d, y_train):
    clf = LinearSVC()
    X_train1, y_train1 = X_train_d[train_index], y_train[train_index]
    X_val, y_val = X_train_d[val_index], y_train[val_index]
    clf.fit(X_train1, y_train1)
    y_pred = clf.predict(X_val)
    print('classification report:')
    print(classification_report(y_val, y_pred))
    print('acc:', accuracy_score(y_val, y_pred))
    print('customized score:', my_scoring(clf, X_val, y_val))
    print('macro f1:', f1_score(y_val, y_pred, average='macro'))

classification report:
             precision    recall  f1-score   support

          0       0.85      0.80      0.82       368
          1       0.71      0.49      0.58        84
          2       0.92      0.91      0.92       891
          3       0.98      0.99      0.98      3655

avg / total       0.95      0.96      0.95      4998

acc: 0.9551820728291317
customized score: 0.9606842737094838
macro f1: 0.8254221623881453
classification report:
             precision    recall  f1-score   support

          0       0.84      0.79      0.82       368
          1       0.64      0.45      0.53        84
          2       0.94      0.92      0.93       891
          3       0.98      0.99      0.99      3655

avg / total       0.95      0.96      0.96      4998

acc: 0.9569827931172469
customized score: 0.9625350140056023
macro f1: 0.8151340750997385
classification report:
             precision    recall  f1-score   support

          0       0.86      0.81      0.84       368
  

KeyboardInterrupt: 

### 10 fold cv using cross_val_score function

In [23]:
def my_scoring(estimator, X_test, y_true):
    y_pred = estimator.predict(X_test)
    assert len(y_true) == len(y_pred)
    num = len(y_true)
    score = 0
    for idx in range(num):
        true = y_true[idx]
        pred = y_pred[idx]
        if (true == 3) and (pred == 3):
            score += 0.25
        if (true in [1, 2, 0]) and (pred in [1, 2, 0]):
            score += 0.25
        if (true in [1, 2, 0]) and (pred in [1, 2, 0]) and (true == pred):
            score += 0.75
    return score/num

clf = LinearSVC()
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_macro', n_jobs=-1)
print(scores)
np.mean(scores)

[0.84141316 0.83110061 0.84845409 0.81232033 0.845517   0.83318983
 0.8346585  0.84996062 0.82040778 0.83773723]


0.8354759160067877

### evaluate the performance on the test set

In [21]:
final_clf = LinearSVC()
final_clf.fit(X_train, y_train)
y_pred = final_clf.predict(X_test)

print('classification report: ')
print(classification_report(y_test, y_pred))

print('-' * 50)
print('macro fscore: %.6f' % f1_score(y_test, y_pred, average='macro'))

print('-' * 50)
print('accuracy: %.6f' % accuracy_score(y_test, y_pred))

classification report: 
             precision    recall  f1-score   support

          0       0.15      0.44      0.23      1903
          1       0.16      0.03      0.05       697
          2       0.23      0.86      0.36      4464
          3       0.95      0.14      0.24     18349

avg / total       0.74      0.29      0.26     25413

--------------------------------------------------
macro fscore: 0.218089
--------------------------------------------------
accuracy: 0.285720
