# Downloads and Imports

## Download libs

In [None]:
!pip install numpy
!pip install matplotlib
!pip install scikit-learn
!pip install gensim
!pip install nltk

## Import

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import nltk

import os.path

from nltk.tokenize import word_tokenize
import string
import re
import pandas as pd
import scipy


## Download data

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
glove_file = datapath('/content/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

In [4]:
word2vec = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [None]:
word2vec.distance("object", "oriented")

# Loading data

In [None]:
texts = []

for i in range(5):
    first = ['A', 'B', 'C', 'D', 'E']
    for j in first:
        second = ['a', 'b', 'c', 'd', 'e']
        for k in second:
            filename = "g" + str(i) + "p" + j + "_task" + k + ".txt"
            filepath = "/content/data/" + filename
            if os.path.isfile(filepath) == True:
                reader = open(filepath, "r")
                data = reader.read()
                texts.append({"file": filename, "data": data})
print(texts)


# Convert to tokens


In [44]:
tokenized_text = []
for text in texts:
    ans_remove_punc = re.sub(r'[^\w\s]',' ', text["data"])
    ans = word_tokenize(ans_remove_punc)
    valid_token = []
    for i in range(len(ans)):
        ans[i] = ans[i].lower()
        if ans[i] in word2vec.vocab and ans[i] not in STOPWORDS:
            valid_token.append(ans[i])
        else:
            print("Throwing " + ans[i] + " from " + text["file"])
    tokenized_text.append({"file": text["file"], "tokens": valid_token})

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Throwing as from g2pA_taskd.txt
Throwing to from g2pA_taskd.txt
Throwing as from g2pA_taskd.txt
Throwing this from g2pA_taskd.txt
Throwing is from g2pA_taskd.txt
Throwing is from g2pA_taske.txt
Throwing a from g2pA_taske.txt
Throwing very from g2pA_taske.txt
Throwing in from g2pA_taske.txt
Throwing for from g2pA_taske.txt
Throwing or from g2pA_taske.txt
Throwing on from g2pA_taske.txt
Throwing the from g2pA_taske.txt
Throwing at from g2pA_taske.txt
Throwing each from g2pA_taske.txt
Throwing a from g2pA_taske.txt
Throwing of from g2pA_taske.txt
Throwing such from g2pA_taske.txt
Throwing an from g2pA_taske.txt
Throwing is from g2pA_taske.txt
Throwing to from g2pA_taske.txt
Throwing with from g2pA_taske.txt
Throwing we from g2pA_taske.txt
Throwing can from g2pA_taske.txt
Throwing our from g2pA_taske.txt
Throwing own from g2pA_taske.txt
Throwing which from g2pA_taske.txt
Throwing for from g2pA_taske.txt
Throwing all from g2pA

In [None]:
print(tokenized_text)

# Find mean of word embeddings

In [None]:
vector_mean = []

for i in tokenized_text:
    freq = {}
    for token in i["tokens"]:
        if token in freq.keys():
            freq[token] += 1
        else:
            freq[token] = 1
    a = 0.001
    vec = np.mean([word2vec[word] * (a / (a + freq[word])) for word in i["tokens"]], axis=0)
    vector_mean.append({"file": i["file"], "vector": vec})

print(vector_mean)

In [73]:
print(vector_mean[0]["file"][-5])

a


#Load original text

In [None]:
original_text = []
tasks = ['a', 'b', 'c', 'd', 'e']
for i in tasks:
    filename = "orig_task" + i + ".txt"
    filepath = "/content/data/" + filename
    reader = open(filepath, "r")
    data = reader.read()
    original_text.append({"task": i, "data": data})
print(original_text)

In [None]:
original_tokenized = []
for task in original_text:
    ans_remove_punc = re.sub(r'[^\w\s]',' ', task["data"])
    ans = word_tokenize(ans_remove_punc)
    valid_token = []
    for i in range(len(ans)):
        ans[i] = ans[i].lower()
        if ans[i] in word2vec.vocab and ans[i] not in STOPWORDS:
            valid_token.append(ans[i])
        else:
            print("Throwing " + ans[i] + " from " + task["task"])
    original_tokenized.append({"task": task["task"], "tokens": valid_token})

In [None]:
print(original_tokenized)

In [77]:
original_vectors = []
for task in original_tokenized:
    freq = {}
    for token in task["tokens"]:
        if token in freq.keys():
            freq[token] += 1
        else:
            freq[token] = 1
    a = 0.001
    vec = np.mean([word2vec[word] * (a / (a + freq[word])) for word in task["tokens"]], axis=0)
    original_vectors.append({"task": task["task"], "vector": vec})

In [None]:
print(original_vectors)

#Predict results

In [78]:
results = []

for text in vector_mean:
    for original in original_vectors:
        if text["file"][-5] == original["task"]:
            cosine = scipy.spatial.distance.cosine(text["vector"], original["vector"])
            results.append({"file": text["file"], "distance": cosine * 100})

In [None]:
print(results)

In [80]:
results_df = pd.DataFrame(results)

In [None]:
results_df.head()

In [91]:
results_df.to_csv('result.csv', index=False)
from google.colab import files
files.download('result.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Check results

In [83]:
labels = pd.read_excel("./corpus-final09.xls", sheet_name="File list")

In [None]:
labels.head()

In [85]:
results_list = []
for index1, row1 in labels.iterrows():
    for index2, row2 in results_df.iterrows():
        if row1["File"] == row2["file"]:
            results_list.append({"file": row1["File"], "type": row1["Category"], "distance": row2["distance"]})

In [86]:
def get_results(results_list, threshhold):
    false_positive = 0
    false_negative = 0
    true_positive = 0
    true_negative = 0
    total_positive = 0
    total_negative = 0
    for i in range(len(results_list)):
        if results_list[i]["distance"] <= threshhold:
            if results_list[i]["type"] == "non":
                false_positive += 1
                total_negative += 1
            else:
                true_positive += 1
                total_positive += 1
        else:
            if results_list[i]["type"] == "non":
                true_negative += 1
                total_negative += 1
            else:
                false_negative += 1
                total_positive += 1
    return true_positive, true_negative, false_positive, false_negative


In [1]:
def get_score(true_positive, true_negative, false_positive, false_negative):
    accuracy = (true_positive + true_negative) / (total_positive + total_negative)
    f_score = true_positive /  (true_positive + (true_negative + false_negative) / 2)
    return accuracy, f_score


In [90]:
true_positive, true_negative, false_positive, false_negative = get_results(results_list, threshhold=2.4572000000008547)
accuracy, f_score = get_score(true_positive, true_negative, false_positive, false_negative)
print("Total positives: " + str(total_positive))
print("Total negatives: " + str(total_negative))
print("False positive: " + str(false_positive))
print("False negative: " + str(false_negative))
print("True positive: " + str(true_positive))
print("True negative: " + str(true_negative))
print("Accuracy: " + str(accuracy))
print("f_score: " + str(f_score))

Total positives: 57
Total negatives: 38
False positive: 6
False negative: 13
True positive: 44
True negative: 32
Accuracy: 0.8
f_score: 0.6616541353383459


In [89]:
bestthreshhold = 2.0
bestscore = 0.7368421052631579
trythreshhold=1.0
while trythreshhold <= 6:
    true_positive, true_negative, false_positive, false_negative = get_results(results_list, threshhold=trythreshhold)
    accuracy, f_score = get_score(true_positive, true_negative, false_positive, false_negative)
    if accuracy > bestscore:
        bestscore = accuracy
        bestthreshhold = trythreshhold
        print(str(trythreshhold) + " beats with score " + str(accuracy))
    trythreshhold += 0.0001

2.0399999999999743 beats with score 0.7473684210526316
2.1310000000001663 beats with score 0.7578947368421053
2.134700000000174 beats with score 0.7684210526315789
2.139000000000183 beats with score 0.7789473684210526
2.4509000000008414 beats with score 0.7894736842105263
2.4572000000008547 beats with score 0.8


In [None]:
# best current accuracy: 0.8
# best accuracy threshhold: 2.4572000000008547
# best current f_score: 5.920599999999634
# best f)scire threshhold: 0.9655172413793103