In [25]:

import pandas as pd
import gensim
import gensim.downloader as api
import numpy as np

import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# `wv` is the lookup that word_average() queries with `word in wv` and `wv[word]`.
wv = api.load('word2vec-google-news-300')

In [6]:
# Root folder of the dataset; every CSV below is addressed relative to it.
data_path = './ALL_data/'


def _read_sentences(relative_csv):
    """Read a sentence CSV (first row is the header) located under `data_path`."""
    return pd.read_csv(data_path + relative_csv)


def _read_labels(relative_csv):
    """Read a header-less two-column (id, label) CSV under `data_path` and return the labels as a list."""
    answers = pd.read_csv(data_path + relative_csv, header=None, names=['id', 'label'])
    return list(answers['label'])


train = _read_sentences('Training_Data/subtaskA_data_all.csv')
train_labels = _read_labels('Training_Data/subtaskA_answers_all.csv')
dev = _read_sentences('Dev_Data/subtaskA_dev_data.csv')
dev_labels = _read_labels('Dev_Data/subtaskA_gold_answers.csv')
test = _read_sentences('Test_Data/subtaskA_test_data.csv')
test_labels = _read_labels('Test_Data/subtaskA_gold_answers.csv')


In [7]:
def word_average(sent, vectors=None):
    """
    Compute the average word vector for a single doc/sentence.

    Tokens that are not in the embedding vocabulary are skipped.

    :param sent: iterable of sentence tokens
    :param vectors: optional embedding lookup supporting ``token in vectors``,
        ``vectors[token]`` and a ``vector_size`` attribute; defaults to the
        module-level ``wv``
    :return: 1-D numpy array of length ``vectors.vector_size`` holding the
        element-wise mean of the vectors of the known tokens, or all zeros
        when none of the tokens is known (including an empty ``sent``)
    """
    if vectors is None:
        # Resolved at call time, so this function may be defined before `wv` is loaded.
        vectors = wv

    known = [vectors[word] for word in sent if word in vectors]

    if not known:
        # Size the fallback from the model rather than hard-coding 300, so that
        # swapping in embeddings of another dimensionality cannot produce
        # feature rows of mismatched length.
        return np.zeros(vectors.vector_size)
    return np.array(known).mean(axis=0)


In [22]:
def _sentence_vectors(frame):
    """Embed every 'sent0' sentence of `frame`, followed by every 'sent1' sentence, as averaged word vectors."""
    return [
        word_average(gensim.utils.simple_preprocess(text))
        for column in ('sent0', 'sent1')
        for text in frame[column]
    ]


def _mirrored_labels(labels):
    """Return `labels` (for the 'sent0' rows) followed by 1 - label (for the 'sent1' rows)."""
    return labels + [1 - value for value in labels]


# training data
X = _sentence_vectors(train)
y = _mirrored_labels(train_labels)

# dev data
X_dev = _sentence_vectors(dev)
y_dev = _mirrored_labels(dev_labels)

# test data
X_test = _sentence_vectors(test)
y_test = _mirrored_labels(test_labels)

In [27]:
def get_acc_f1(preds, acc, f1, labels=None):
    """
    Score one set of predictions and append the results to running lists.

    :param preds: predicted labels, aligned position-by-position with `labels`
    :param acc: list that receives the accuracy (a fraction); mutated in place
    :param f1: list that receives the weighted f1-score; mutated in place
    :param labels: gold labels to score against; defaults to the module-level
        `y_dev`, which keeps existing three-argument calls unchanged
    :return: the same `acc` and `f1` lists that were passed in
    :raises ZeroDivisionError: if there are no prediction/label pairs
    """
    if labels is None:
        labels = y_dev

    pairs = list(zip(preds, labels))
    correct = sum(1 for pred, label in pairs if pred == label)
    accuracy = 1.0 * correct / len(pairs)
    f1_weighted = metrics.f1_score(labels, preds, average='weighted')

    # Append only after both metrics were computed, so a failure in either one
    # cannot leave `acc` and `f1` with different lengths.
    acc.append(accuracy)
    f1.append(f1_weighted)
    return acc, f1

def print_results(classifier, parameter, param_vals, acc, f1):
    """
    Print the accuracy, f1-score and parameter value of the best run.

    The best run is the one with the highest f1-score (the first one on ties).
    `acc`, `f1` and `param_vals` are parallel lists; `acc` and `f1` hold
    fractions and are printed as percentages with two decimals.
    """
    top_f1 = max(f1)
    winner = f1.index(top_f1)  # first position holding the highest f1-score
    template = '{} has an accuracy of {:.2f}% and f1-score of {:.2f}% with an optimal {} value of {}.\n'
    print(template.format(classifier, acc[winner] * 100, top_f1 * 100, parameter, param_vals[winner]))
        
    
# function to fit Logistic Regression models with varying C values & compute sum of square of the weights
def build_lr(c_values=None):
    """
    Sweep the C parameter of a liblinear Logistic Regression.

    For every C the model is fit on the module-level `X` / `y`, used to predict
    `X_dev`, scored through get_acc_f1(), and the sum of its squared
    coefficients is printed. A summary of the run with the highest f1-score is
    printed at the end via print_results().

    :param c_values: iterable of C values to try; defaults to the grid the
        notebook has always used
    :return: the C value whose run reached the highest f1-score (first on ties)
    """
    if c_values is None:
        c_values = [1.0e-10,0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10]
    c_values = list(c_values)  # indexed below, so materialize any iterable

    acc = []
    f1 = []
    print("Effects of C regularization on sum of square weights:")
    for c in c_values:
        lr = LogisticRegression(solver='liblinear', C=c)
        lr.fit(X, y)
        preds = lr.predict(X_dev)
        get_acc_f1(preds, acc, f1)  # appends to acc / f1 in place
        # Sum of squared weights per coef_ row, computed once; the result stays
        # an array so the printed form keeps its brackets, e.g. SS=[9.00187405].
        ss = np.square(lr.coef_).sum(axis=1)
        print('C={}, SS={}'.format(c, ss))
    print('\n')
    print_results('Logistic Regression', 'C', c_values, acc, f1)
    return c_values[f1.index(max(f1))]


# Keep the selected C in a variable instead of copying it by hand from the printout.
best_c = build_lr()

Effects of C regularization on sum of square weights:
C=1e-10, SS=[1.7548612e-16]
C=0.0001, SS=[0.00017158]
C=0.001, SS=[0.01460161]
C=0.01, SS=[0.64332863]
C=0.1, SS=[9.00187405]
C=0.2, SS=[15.16480324]
C=0.3, SS=[19.30381696]
C=0.4, SS=[22.28672104]
C=0.5, SS=[24.54491173]
C=0.6, SS=[26.32012554]
C=0.7, SS=[27.75273617]
C=0.8, SS=[28.93541552]
C=0.9, SS=[29.92976753]
C=1, SS=[30.77716233]
C=10, SS=[40.16302273]


Logistic Regression has an accuracy of 56.52% and f1-score of 56.52% with an optimal C value of 0.5.



In [44]:
# LOGISTIC REGRESSION WITH PERCENTAGES - USE OPTIMAL C VALUE OF 0.5

# Build the classifier in this cell. The `lr` created inside build_lr() is local
# to that function, so without this line the cell raises NameError on a fresh
# kernel (Restart & Run All).
lr = LogisticRegression(solver='liblinear', C=0.5)
lr.fit(X, y)
preds = lr.predict(X_dev)

# Per-sentence accuracy: every row of X_dev is scored on its own.
correct = sum(1 for pred, label in zip(preds, y_dev) if pred == label)
acc = 1.0 * correct / len(y_dev)

print(f'LR accuracy = {acc}')


# Pairwise accuracy. X_dev holds all 'sent0' rows followed by all 'sent1' rows,
# so row i and row i + n_pairs belong to the same pair. The pair count is
# derived from the data instead of being hard-coded.
probs = lr.predict_proba(X_dev)
n_pairs = len(y_dev) // 2
guesses = []
for i in range(n_pairs):
    # Column 0 of predict_proba is the probability of class 0.
    p_first, p_second = probs[i][0], probs[i + n_pairs][0]
    # A pair is a hit when the sentence whose index equals the gold label has
    # the strictly higher class-0 probability; ties count as misses.
    if p_first > p_second and y_dev[i] == 0:
        guesses.append(1)
    elif p_first < p_second and y_dev[i] == 1:
        guesses.append(1)
    else:
        guesses.append(0)
print(f'model accuracy = {sum(guesses)/n_pairs}')

LR accuracy = 0.5651955867602808
model accuracy = 0.5827482447342026


In [47]:
# LOGISTIC REGRESSION WITH PERCENTAGES - USE OPTIMAL C VALUE OF 0.5

# Build the classifier in this cell. The `lr` created inside build_lr() is local
# to that function, so without this line the cell raises NameError on a fresh
# kernel (Restart & Run All).
lr = LogisticRegression(solver='liblinear', C=0.5)
lr.fit(X, y)
preds = lr.predict(X_test)

# Per-sentence accuracy: every row of X_test is scored on its own.
correct = sum(1 for pred, label in zip(preds, y_test) if pred == label)
acc = 1.0 * correct / len(y_test)

print(f'LR accuracy = {acc}')


# Pairwise accuracy. X_test holds all 'sent0' rows followed by all 'sent1' rows,
# so row i and row i + n_pairs belong to the same pair. The pair count is
# derived from the data instead of being hard-coded.
probs = lr.predict_proba(X_test)
n_pairs = len(y_test) // 2
guesses = []
for i in range(n_pairs):
    # Column 0 of predict_proba is the probability of class 0.
    p_first, p_second = probs[i][0], probs[i + n_pairs][0]
    # A pair is a hit when the sentence whose index equals the gold label has
    # the strictly higher class-0 probability; ties count as misses.
    if p_first > p_second and y_test[i] == 0:
        guesses.append(1)
    elif p_first < p_second and y_test[i] == 1:
        guesses.append(1)
    else:
        guesses.append(0)
print(f'model accuracy = {sum(guesses)/n_pairs}')

LR accuracy = 0.559
model accuracy = 0.615
