In [45]:
import sys

#For parsing and visualizing data
from pandas import DataFrame, read_csv
import pandas as pd

#For visualizing data
import matplotlib.pyplot as plt

#For processing data
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

#Feature Engineering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
# import baseline_features
from sklearn.decomposition import TruncatedSVD

#Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

#For scoring
from sklearn.metrics import accuracy_score
# import score #Score used in competition

#Progress Bar
from tqdm import tqdm

import re

In [46]:
import nltk

nltk.download("stopwords")
nltk.download("punkt")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Read Files

In [47]:
train_df = pd.read_csv('/content/lemmatized_dataset_final_balanced_train.csv')

In [48]:
validate_df = pd.read_csv('/content/lemmatized_dataset_final_balanced_validation.csv')

In [49]:
test_df = pd.read_csv('/content/lemmatized_dataset_final_balanced_test.csv')

In [50]:
train_df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,stance_cat
0,dna test confirm lebanon is holding isi leader...,2042,unrelated,there is a story currently making the round ab...,3
1,somalia shebab chief ahmed abdi godane likely ...,1610,discuss,ahmed abdi godane the leader of al shabab the ...,2
2,dna test prove lebanon is holding isi chief al...,1468,disagree,an iraqi official denied that a woman detained...,1
3,the pumpkinspice condom is just a figment of y...,1253,unrelated,the united state department of defense said on...,3
4,u probing claim isi fighter seized airdropped ...,465,discuss,the pentagon admitted on wednesday that isi di...,2


## Feature Engineering





### TF-IDF Features

In [51]:
#Apply Scikit Learn TFIDF Feature Extraction Algorithm
body_text_vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english',max_features=1024)
headline_vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english',max_features=1024)

#Create vocabulary based on training data
train_body_tfidf = body_text_vectorizer.fit_transform(train_df['articleBody'])
train_headline_tfidf = headline_vectorizer.fit_transform(train_df['Headline'])

#Create vocabulary based on validation data
validate_body_tfidf = body_text_vectorizer.transform(validate_df['articleBody'])
validate_headline_tfidf = headline_vectorizer.transform(validate_df['Headline'])

#Use vocabulary for testing data
test_body_tfidf = body_text_vectorizer.transform(test_df['articleBody'])
test_headline_tfidf = headline_vectorizer.transform(test_df['Headline'])

### Cosine Similarity Features

In [52]:
#Cosine Similarity
def get_cosine_similarity(body_tfidf,headline_tfidf):
    cosine_features = []
    #len body_tfidf = len headline_tfidf
    for i in tqdm(range(body_tfidf.shape[0])):
        cosine_features.append(cosine_similarity((body_tfidf.A[0].reshape(1,-1)),(headline_tfidf.A[0].reshape(1,-1)))[0][0])
    return np.array(cosine_features).reshape(body_tfidf.shape[0],1)

In [53]:
#Leave this commented out unless you are re-calculating the cosine similarity
#which can be found in the pickle files labeled:
#train_cosine_features.p and test_cosine_features.p

train_cosine_features = get_cosine_similarity(train_body_tfidf,train_headline_tfidf)

validate_cosine_features = get_cosine_similarity(validate_body_tfidf,validate_headline_tfidf)

test_cosine_features = get_cosine_similarity(test_body_tfidf,test_headline_tfidf)

pickle.dump(train_cosine_features,open('train_cosine_features.p','wb'))
pickle.dump(validate_cosine_features,open('validate_cosine_features.p','wb'))
pickle.dump(test_cosine_features,open('test_cosine_features.p','wb'))

100%|██████████| 21483/21483 [2:10:58<00:00,  2.73it/s]
100%|██████████| 2686/2686 [00:17<00:00, 154.82it/s]
100%|██████████| 2685/2685 [00:17<00:00, 154.91it/s]


In [54]:
train_cosine_features = pickle.load(open('train_cosine_features.p','rb'))
validate_cosine_features = pickle.load(open('validate_cosine_features.p','rb'))
test_cosine_features = pickle.load(open('test_cosine_features.p','rb'))

### Classification Feature Vectors


In [75]:
train_features = hstack([
                            train_body_tfidf,
                            train_headline_tfidf,
                            train_cosine_features
                        ])
validate_features = hstack([
                            validate_body_tfidf,
                            validate_headline_tfidf,
                            validate_cosine_features
                        ])
test_features = hstack([
                            test_body_tfidf,
                            test_headline_tfidf,
                            test_cosine_features
                        ])

## Classification

In [76]:
test_labels = list(test_df['Stance'])

In [77]:
train_labels = list(train_df['Stance'])

### Run Classifiers and Score Validation Output

In [78]:

LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
LABELS_RELATED = ['unrelated','related']
RELATED = LABELS[0:3]


def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]

    for i, (g, t) in enumerate(zip(gold_labels, test_labels)):
        g_stance, t_stance = g, t
        if g_stance == t_stance:
            score += 0.25
            if g_stance != 'unrelated':
                score += 0.50
        if g_stance in RELATED and t_stance in RELATED:
            score += 0.25

        cm[LABELS.index(g_stance)][LABELS.index(t_stance)] += 1

    return score, cm

def print_confusion_matrix(cm):
    lines = []
    header = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format('', *LABELS)
    line_len = len(header)
    lines.append("-"*line_len)
    lines.append(header)
    lines.append("-"*line_len)

    hit = 0
    total = 0
    for i, row in enumerate(cm):
        hit += row[i]
        total += sum(row)
        lines.append("|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format(LABELS[i],
                                                                   *row))
        lines.append("-"*line_len)
    print('\n'.join(lines))

def report_score(actual,predicted):
    score,cm = score_submission(actual,predicted)
    best_score, _ = score_submission(actual,actual)

    print_confusion_matrix(cm)
    print("Score: " +str(score) + " out of " + str(best_score) + "\t("+str(score*100/best_score) + "%)")
    return score*100/best_score

In [79]:
names = ["Random Forest", "Multinomial Naive Bayes", "Gradient Boosting","Linear SVM", "Decision Tree", "Logistic Regression"]
# "K Nearest Neighbors",
classifiers = [
    RandomForestClassifier(n_estimators=10),
    MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
    GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5),
    LogisticRegression(C=1e5)
]
# KNeighborsClassifier(4, algorithm='kd_tree'),
for n, clf in zip(names, classifiers):
    print(n)
    y_pred = clf.fit(train_features,train_labels).predict(validate_features)
    print(report_score(test_labels, y_pred))
    print('\n')

Random Forest
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    47     |     4     |    128    |    189    |
-------------------------------------------------------------
| disagree  |    14     |     0     |    39     |    31     |
-------------------------------------------------------------
|  discuss  |    135    |    20     |    300    |    436    |
-------------------------------------------------------------
| unrelated |    202    |    36     |    415    |    689    |
-------------------------------------------------------------
Score: 604.25 out of 1678.5	(35.99940422996723%)
35.99940422996723


Multinomial Naive Bayes
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    47     |

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Run Classifiers and Score Test Output

In [80]:
#This is how well we would have scored in the actual competition
names = ["Random Forest", "Multinomial Naive Bayes", "Gradient Boosting","K Nearest Neighbors","Linear SVM", "Decision Tree", "Logistic Regression"]

classifiers = [
    RandomForestClassifier(n_estimators=10),
    MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
    GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True),
    KNeighborsClassifier(4),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5),
    LogisticRegression(C=1e5)
]

for n, clf in zip(names, classifiers):
    print(n)
    y_pred = clf.fit(train_features,train_labels).predict(validate_features)
    print(report_score(test_labels, y_pred))
    print('\n')

Random Forest
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    47     |     5     |    132    |    184    |
-------------------------------------------------------------
| disagree  |    13     |     1     |    36     |    34     |
-------------------------------------------------------------
|  discuss  |    151    |    17     |    290    |    433    |
-------------------------------------------------------------
| unrelated |    188    |    34     |    429    |    691    |
-------------------------------------------------------------
Score: 599.25 out of 1678.5	(35.70151921358356%)
35.70151921358356


Multinomial Naive Bayes
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    47     |

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
