In [None]:
import numpy as np
import pandas as pd

In [None]:
''' Globals '''
# Padded sequence lengths (in tokens) for article bodies and headlines.
MAXBODYSIZE = 500
MAXHEADSIZE = 50
# Width of one word vector; matches the 300-dimensional GoogleNews embeddings.
EMBEDDINGDIM = 300
# The four stance labels (previously misspelled 'disgree').
Stances = {'agree', 'disagree', 'discuss', 'unrelated'}

In [None]:
''' Load data sets '''
# Bodies and stances (headlines) for the train and test splits, read in that order.
trainBodiesDF, trainHeadDF, testBodiesDF, testHeadDF = map(pd.read_csv, (
    './DefaultFiles/train_bodies.csv',
    './DefaultFiles/train_stances.csv',
    './DefaultFiles/test_bodies.csv',
    './DefaultFiles/test_stances_unlabeled.csv',
))


In [None]:
'''
    Cleaning
    - drop heads with no reference body (the merge keeps only matching Body IDs)
    - drop null heads
    - reset indexes to accommodate the change
'''
totalTrain = pd.merge(trainBodiesDF, trainHeadDF, on='Body ID')
# One row per body; reset_index() turns the 'Body ID' group key back into a column.
trainBodiesDF = totalTrain.groupby('Body ID').first()[['articleBody']].reset_index()
# drop=True discards the old row labels instead of keeping them as a stray
# 'index' column that would otherwise end up in the saved checkpoint.
trainHeadDF = totalTrain[['Body ID','Headline','Stance']].dropna().reset_index(drop=True)
print(trainBodiesDF.head(3))
print(trainHeadDF.head(3))
print(testBodiesDF.head(3))
print(testHeadDF.head(3))

In [None]:
'''
    Load Pretrained Word2Vec by Google
    Word2Vec is a shallow neural network to produce word embeddings
    The primary goal is to vectorize the linguistic context of the word
    You can download from here:
    https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
'''
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec

# Load the pretrained vectors from the binary word2vec file (binary=True).
# Later cells use the result for membership tests (`word in word2Vec`) and
# vector lookups (`word2Vec[word]`).
word2Vec = KeyedVectors.load_word2vec_format('GensimVectors/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
'''
    Downloading the data required by nltk
    import nltk, then
    on first use download the following packages:

    nltk.download()
    select d
    download packages ['punkt', 'wordnet', 'stopwords']
'''
import nltk

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import re

def process(text, remove_stopwords=False):
    '''
    Turn raw text into a list of normalised tokens found in the Word2Vec vocab.

    For every token produced by word_tokenize(text):
    1. strip whitespace and lower-case it
    2. optionally drop it if it is an English stop word (remove_stopwords=True)
    3. stem it (chop off endings) and lemmatise it as a verb
    4. expand contraction fragments ("n't" -> "not", "'ve" -> "have", ...)
    5. keep it only if it is purely alphabetic and present in word2Vec

    text             : str, the raw headline or article body
    remove_stopwords : bool, default False. The default keeps stop words.
                       The stop-word test is made on the raw lower-cased
                       token, i.e. before stemming and contraction expansion.

    Returns a list of str tokens.
    '''
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Only build the stop-word set when it is actually needed.
    stop_words = set(stopwords.words("english")) if remove_stopwords else frozenset()
    # Fragment -> expansion pairs. "'ve" was previously written "'ve'" (stray
    # closing quote) and therefore never matched.
    contractions = (
        ("n't", 'not'),
        ("'m", 'am'),
        ("'ve", 'have'),
        ("'d", 'would'),
        ("'ll", 'will'),
    )
    out = []
    for token in word_tokenize(text):
        token = token.strip().lower()
        if token in stop_words:
            continue
        word = lemmatizer.lemmatize(stemmer.stem(token), wordnet.VERB)
        # Expansion is applied after stemming/lemmatising, as before.
        for fragment, expansion in contractions:
            word = word.replace(fragment, expansion)
        # major speed gain only testing for letters
        # (isalpha() is False for '', so no separate empty-string check)
        if word.isalpha() and word in word2Vec:
            out.append(word)
    return out

In [None]:
'''
    Loop through all four data frames and process the text
    ~ Will take approximately 2 minutes
'''
# Rewrite each text column with its processed, space-joined tokens.
# A column-wise map avoids treating the iterrows() index *label* as a
# *position* for .iat, which is only correct while the index is exactly 0..n-1.
for frame, column in (
    (trainBodiesDF, 'articleBody'),
    (trainHeadDF, 'Headline'),
    (testBodiesDF, 'articleBody'),
    (testHeadDF, 'Headline'),
):
    frame[column] = frame[column].map(lambda text: " ".join(process(text)))

In [None]:
# Show the first three rows of every frame after text processing.
for frame in (trainBodiesDF, trainHeadDF, testBodiesDF, testHeadDF):
    print(frame.head(3))

In [None]:
''' Save a checkpoint '''
# Write each processed frame to its own CSV (row labels are not written).
for frame, csvPath in (
    (trainBodiesDF, 'ProcessedTrainBodies.csv'),
    (trainHeadDF, 'ProcessedTrainHead.csv'),
    (testBodiesDF, 'ProcessedTestBodies.csv'),
    (testHeadDF, 'ProcessedTestHead.csv'),
):
    frame.to_csv(csvPath, index=False)
# Report the (rows, columns) of all four frames on one line.
print(*(frame.shape for frame in (trainBodiesDF, trainHeadDF, testBodiesDF, testHeadDF)))

In [None]:
'''
    Create and train tokenizer
    Tokenizer is utilised to create numerical representations of the data
'''
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences


# Every processed body and headline, train first then test, in one flat list.
totalText = [
    *trainBodiesDF['articleBody'],
    *trainHeadDF['Headline'],
    *testBodiesDF['articleBody'],
    *testHeadDF['Headline'],
]

# Fit a single tokenizer on all text so train and test share one word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(totalText)
wordIndexs = tokenizer.word_index
# NOTE: despite its name this holds the tokenizer's word_counts, not a size.
vocabSize = tokenizer.word_counts
print('Vocab Size: ',len(wordIndexs))

In [None]:
'''
    utilise tokenizer and save word representations
'''
# One row per word: the word is the frame index, its integer id the only column.
wordIndexsdf = pd.DataFrame.from_dict(wordIndexs, orient='index')
# The words live in the index, so it must be written out; with index=False the
# file contained only the integer ids and the word -> id mapping was lost.
wordIndexsdf.to_csv('wordIndexs.csv', index_label='word', header=['index'])
wordIndexsdf.head(5)

In [None]:
# Map each tokenizer index to the Word2Vec vector of its word (empty word skipped).
embeddingVector = {
    index: word2Vec[word]
    for word, index in wordIndexs.items()
    if word != ''
}
# One row per index, one column per embedding dimension; saved without row labels.
embeddingdf = pd.DataFrame.from_dict(embeddingVector, orient='index')
embeddingdf.to_csv('embeddingVectors.csv',index=False)
embeddingdf.head(5)

In [None]:
# Plain numpy copy of the embedding table, rows in the same order as embeddingdf.
embeddingMatrix = embeddingdf.to_numpy()
# Preview the first five rows (the previous [5:] showed everything *except* them).
embeddingMatrix[:5]

In [None]:
def CreateNetworkData(bodydf, headdf, stance):
    '''
    Change pandas dataframes into trainable / testable numpy data.

    - texts_to_sequences: convert words into their numerical representation
    - pad_sequences: bring every sequence to the desired length
      (MAXHEADSIZE for headlines, MAXBODYSIZE for bodies, padded at the end)
    - for train data: convert stances into a one-hot numerical representation

    bodydf : DataFrame with 'Body ID' and 'articleBody' (space-joined tokens)
    headdf : DataFrame with 'Body ID' and 'Headline' (space-joined tokens),
             plus 'Stance' when stance is True
    stance : True for labelled training data -- rows with a null headline or
             without a matching body are dropped and stances are encoded.
             False for test data -- no row is ever dropped; a null headline
             or missing body becomes an empty sequence so that the output
             stays aligned with headdf row for row.

    Returns (heads, bodies, stances). heads and bodies are padded integer
    arrays; stances is a one-hot array with 4 classes when stance is True,
    otherwise an empty list.
    '''
    stancesLookup = {'unrelated': 0 , 'agree':1, 'disagree':2, 'discuss':3}

    # Split every body once, keyed by Body ID, instead of rescanning bodydf for
    # each headline. setdefault keeps the first row per id.
    bodyTokens = {}
    for bodyId, text in zip(bodydf['Body ID'], bodydf['articleBody']):
        bodyTokens.setdefault(int(bodyId), [] if pd.isna(text) else text.split(" "))

    heads = []
    bodies = []
    stances = []
    for _, row in headdf.iterrows():
        missingHead = pd.isna(row['Headline'])
        body = bodyTokens.get(int(row['Body ID']))
        if stance and (missingHead or body is None):
            # Unusable training example: skip the whole row so heads, bodies
            # and stances stay the same length.
            continue
        heads.append([] if missingHead else row['Headline'].split(" "))
        # Use the whole body text. The previous ['articleBody'][0] indexed the
        # string and kept only its first character.
        bodies.append([] if body is None else body)
        if stance:
            stances.append(stancesLookup[row['Stance'].strip()])

    heads = tokenizer.texts_to_sequences(heads)
    bodies = tokenizer.texts_to_sequences(bodies)
    heads = pad_sequences(heads,maxlen = MAXHEADSIZE,padding = 'post')
    bodies = pad_sequences(bodies,maxlen = MAXBODYSIZE,padding = 'post')
    if stance:
        stances = to_categorical(stances, num_classes=4)
    return heads,bodies,stances

In [None]:
'''
    Create data structures for lstm network
'''
# stance=True selects labelled mode: rows with a null headline are skipped and
# the stances come back one-hot encoded.
trainHeads,trainBodies,trainStances = CreateNetworkData(trainBodiesDF, trainHeadDF, True)

In [None]:
import keras

from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, Input
from keras.layers.wrappers import Bidirectional
from keras.layers.recurrent import LSTM 
from keras.layers import concatenate 
from keras.preprocessing import sequence


In [None]:
'''
    Bidirectional LSTM used
    inputs are concatenated and fed into a two layer dense network with dropout
    Please refer to report for further information about method
'''

# Two integer-sequence inputs: the padded headline and the padded body.
InputHead = Input(shape=(MAXHEADSIZE,), dtype='int32', name='InputHead')
InputBody = Input(shape=(MAXBODYSIZE,), dtype='int32', name='InputBody')

# Tokenizer word indices run 1..len(wordIndexs) and pad_sequences pads with 0,
# so the lookup table needs len(wordIndexs) + 1 rows with row i holding the
# vector of word i. embeddingMatrix has one row per word starting at index 1,
# so a zero row is prepended for the padding index. Without it every word
# looked up the vector of the *next* word and the largest index was out of range.
assert embeddingMatrix.shape == (len(wordIndexs), EMBEDDINGDIM), \
    'embeddingMatrix must hold exactly one row per tokenizer word'
embeddingWeights = np.vstack([
    np.zeros((1, EMBEDDINGDIM), dtype=embeddingMatrix.dtype),
    embeddingMatrix,
])
# A single frozen embedding layer is shared by both inputs.
Embeddings = Embedding(embeddingWeights.shape[0], EMBEDDINGDIM, weights=[embeddingWeights],trainable=False)
EmbedHead = Embeddings(InputHead)
EmbedBody = Embeddings(InputBody)

# Separate bidirectional LSTM encoders for headline and body.
LSTMHead = Bidirectional(LSTM(64,dropout=0.2, recurrent_dropout=0.2, name='LSTMHead'))(EmbedHead)
LSTMBody = Bidirectional(LSTM(64,dropout=0.2, recurrent_dropout=0.2, name='LSTMBody'))(EmbedBody)

Concat = concatenate([LSTMHead,LSTMBody])

# Dense classifier head: 128 relu units, dropout, then a 4-way softmax.
DenseLayer = Dense(128,activation='relu')(Concat)
DenseLayer = Dropout(0.4)(DenseLayer)
DenseLayer = Dense(4,activation='softmax')(DenseLayer)
LSTMNetwork = Model(inputs=[InputHead,InputBody], outputs=[DenseLayer])
LSTMNetwork.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
LSTMNetwork.summary()

In [None]:
''' Train the model ~ takes roughly 10 hours '''
# 10 rounds of 4 epochs each (40 epochs in total) on the same model object;
# each fit() call continues from the weights left by the previous one.
for i in range(10):
    LSTMNetwork.fit([trainHeads, trainBodies],[trainStances], epochs=4, batch_size=128,verbose = True)

In [None]:
''' Save all your hard work '''
# NOTE(review): the '.5h' extension looks like a typo for '.h5' -- confirm. If it
# is renamed, the load_model() call in the next cell must use the same path.
LSTMNetwork.save('finalModel.5h')

In [None]:
from keras.models import load_model

# Replace the in-memory model with the one read back from disk; the path must
# match the one passed to LSTMNetwork.save().
LSTMNetwork = load_model('finalModel.5h')

In [None]:
''' Create test data appropriate for model '''
# stance=False selects test mode: a null headline is kept as an empty sequence
# and no labels are built, so the third return value (`out`) is an empty list.
testHeads,testBodies,out = CreateNetworkData(testBodiesDF, testHeadDF, False)

In [None]:
''' Predict the test data '''
# Inputs are passed in the same [heads, bodies] order used for fit().
# Presumably one row of 4 class scores per test pair (the network built above
# ends in a 4-way softmax) -- confirm for the model loaded from disk.
predictions = LSTMNetwork.predict([testHeads, testBodies])

In [None]:
'''
    Convert predictions into a csv appropriate for evaluating
    - take the argmax of the predictions to determine the classification
    - map these back to the appropriate stance as a word
'''
testStancesDf = pd.read_csv('./DefaultFiles/test_stances_unlabeled.csv')
# Element-wise lookup from class id to stance name.
reverseMap = np.vectorize({0: 'unrelated', 1: 'agree', 2: 'disagree', 3: 'discuss'}.__getitem__)
# Most probable class per row.
predictedLabels = predictions.argmax(axis=1)
# Original test columns with the predicted stance appended as a last column.
testPredsFinal = np.column_stack((testStancesDf, reverseMap(predictedLabels)))
testPredsFinal

In [None]:
''' Save the predictions with the column names the scorer expects '''
predictionsDF = pd.DataFrame(testPredsFinal, columns=['Headline', 'Body ID', 'Stance'])
predictionsDF.to_csv('testPredictions.csv', index=False)

In [None]:
'''
    Print out the confusion matrix of the predictions and evaluate score
'''

# -i runs scorer.py inside this notebook's namespace. The two CSV paths are
# passed as its arguments; the second is the file written by the previous cell.
%run -i scorer.py DefaultFiles/competition_test_stances.csv testPredictions.csv