***Import libraries***

In [1]:
import json
import random
from nltk import pos_tag, word_tokenize
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

In [2]:
random.seed(42)

***Helper functions for us***

In [3]:
##
# Split the data into the different areas which we want to test on
#
# input: entire corpus
#
# returns: subcorpora
##
def split_data(data):
    vaccine = []
    bigfoot = []
    flat = []
    pizza = []
    climate = []
    for entry in data:
        if entry['seeds'].__contains__('big.foot'):
            bigfoot.append(entry)
        if entry['seeds'].__contains__('vaccine'):
            vaccine.append(entry)
        if entry['seeds'].__contains__('flat.earth'):
            flat.append(entry)
        if entry['seeds'].__contains__('pizzagate'):
            pizza.append(entry)
        if entry['seeds'].__contains__('climate'):
            climate.append(entry)

    return bigfoot, vaccine, flat, pizza, climate

In [4]:
##
# A helper function for the conspiracy select, this returns the text entry from the corpus
#
# input: a corpus
#
# output: a breakdown of the input corpus into text (X) and label (Y) - note that 1 marks conspiracy and 0 marks non-conspiracy
##
def x_y_split(data):
    X = []
    Y = []
    
    for entry in data:
        if entry['subcorpus'] == 'conspiracy' \
            and entry['conspiracy_representative']:
            X.append(entry['txt'])
            Y.append(1)
        else:
            X.append(entry['txt'])
            Y.append(0)
    
    return X, Y

In [5]:
##
# How we preprocess our data/create our feature sets will probably have most impact, this is a place for that to happen
#
# input: corpus text
#
# returns: modified corpus text
##
def preprocess(data):
    update = []
    for entry in data:
        update.append(entry.lower())
    
    return update

In [6]:
## 
# Returns a combination of the input conspiracy corpora, and shuffles the input as well.
#
# input: sequence of tuples (corpus, percent), (corpus, percent),...
#
# returns: X and Y, such that X and Y are the percentages of the corpora given in the input 
#
# note: this does *not* ensure balance between the conspiracy/non-conspiracy elements
# within the partition
##
def conspiracy_select(*args):
    newX = []
    newY = []
    
    for arg in args:
        # get text from a corpus
        tmpX, tmpY = x_y_split(arg[0])

        # randomize the order of the texts and labels
        tmpX, tmpY = randomize(tmpX, tmpY)

        # partition the texts and labels according to the percentage
        tmplen = int(arg[1] * len(tmpX))
        tmpX = tmpX[:tmplen]
        tmpY = tmpY[:tmplen]

        # update our combined texts and labels
        newX += tmpX
        newY += tmpY

    return newX, newY

In [7]:
##
# Randomizes X and Y pairwise data
#
# input: X and Y lists
#
# returns: X and Y lists pairwise shuffled
#
##
def randomize(X, Y):
    tmp = list(zip(X, Y))
    random.shuffle(tmp)
    tmpX, tmpY = zip(*tmp)
    return list(tmpX), list(tmpY)

In [8]:
##
# Returns xtrain, xtest, ytrain, and ytest by letting one CT be the xtrain and ytrain, and the other CT be the xtest
# and ytest
#
# input: two conspiracy theories where CT1 -> training, CT2 -> testing
#
# returns xtrain, xtest, ytrain, and ytest
##
def custom_test_train_split(CT1, CT2):
    xtrain, ytrain = x_y_split(CT1)
    xtest, ytest = x_y_split(CT2)
    
    # uncomment this if we want to randomize the pairwise order of texts and labels
    # xtrain, ytrain = randomize(xtrain, ytrain)
    # xtest, ytest = randomize(xtest, ytest)
    
    return xtrain, xtest, ytrain, ytest

In [9]:
##
# Transforms a set of documents into POS tags
#
# input: an list of text documents
#
# returns: a list in the same order of documents transformed into POS tags
##
def pos_tags(X):
    newX = []
    for entry in X:
        tmp = []
        for _, tag in pos_tag(word_tokenize(entry)):
            tmp.append(tag)
        newX.append(tmp)

    return newX

***Create subcorpora from corpus***

In [10]:
with open('../data/LOCO_partition.json') as f:
        data = json.load(f)

In [11]:
bigfoot, vaccine, flat, pizza, climate = split_data(data)

***Set up input to the SVM***

In [12]:
# bigX, bigY = x_y_split(bigfoot)
# cliX, cliY = x_y_split(climate)
# X, Y = conspiracy_select((bigfoot, 1), (vaccine, 1), (flat, 1), (pizza, 1), (climate, 1))
xtrain, xtest, ytrain, ytest = custom_test_train_split(bigfoot, vaccine)

In [None]:
# bigX = preprocess(bigX)
# cliX = preprocess(cliX)
xtrain = pos_tags(xtrain)
xtest = pos_tags(xtest)
# X = preprocess(X)

In [64]:
# xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y, test_size = 0.3)

In [65]:
ctvec = CountVectorizer(analyzer = 'word', tokenizer = word_tokenize, max_features = 5000)
vecxtrain = ctvec.fit_transform(xtrain)

In [66]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

In [67]:
SVM.fit(vecxtrain, ytrain)

In [68]:
vecxtest = ctvec.transform(xtest)

In [69]:
SVMpred = SVM.predict(vecxtest)

In [72]:
print("SVM Accuracy -> ", accuracy_score(SVMpred, ytest) * 100)

SVM Accuracy ->  97.13305067635777
