# Downloads and Imports

## Download libs

In [None]:
!pip install numpy
!pip install matplotlib
!pip install scikit-learn
!pip install gensim
!pip install nltk

## Import

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import nltk

import os.path

from nltk.tokenize import word_tokenize
import string
import re
import pandas as pd
import scipy


## Download data

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
glove_file = datapath('/content/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

In [4]:
word2vec = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [None]:
word2vec.distance("object", "oriented")

# Loading data

In [None]:
texts = []

for i in range(5):
    first = ['A', 'B', 'C', 'D', 'E']
    for j in first:
        second = ['a', 'b', 'c', 'd', 'e']
        for k in second:
            filename = "g" + str(i) + "p" + j + "_task" + k + ".txt"
            filepath = "/content/data/" + filename
            if os.path.isfile(filepath) == True:
                reader = open(filepath, "r")
                data = reader.read()
                texts.append({"file": filename, "data": data})
print(texts)


# Convert to tokens


In [None]:
tokenized_text = []
for text in texts:
    ans_remove_punc = re.sub(r'[^\w\s]',' ', text["data"])
    ans = word_tokenize(ans_remove_punc)
    valid_token = []
    for i in range(len(ans)):
        ans[i] = ans[i].lower()
        if ans[i] in word2vec.vocab:
            valid_token.append(ans[i])
        else:
            print("Throwing " + ans[i] + " from " + text["file"])
    tokenized_text.append({"file": text["file"], "tokens": valid_token})

In [None]:
print(tokenized_text)

# Find mean of word embeddings

In [None]:
vector_mean = []

for i in tokenized_text:
    vec = np.mean([word2vec[word] for word in i["tokens"]], axis=0)
    vector_mean.append({"file": i["file"], "vector": vec})

print(vector_mean)

In [None]:
print(vector_mean[0]["file"][-5])

#Load original text

In [None]:
original_text = []
tasks = ['a', 'b', 'c', 'd', 'e']
for i in tasks:
    filename = "orig_task" + i + ".txt"
    filepath = "/content/data/" + filename
    reader = open(filepath, "r")
    data = reader.read()
    original_text.append({"task": i, "data": data})
print(original_text)

In [None]:
original_tokenized = []
for task in original_text:
    ans_remove_punc = re.sub(r'[^\w\s]',' ', task["data"])
    ans = word_tokenize(ans_remove_punc)
    valid_token = []
    for i in range(len(ans)):
        ans[i] = ans[i].lower()
        if ans[i] in word2vec.vocab:
            valid_token.append(ans[i])
        else:
            print("Throwing " + ans[i] + " from " + task["task"])
    original_tokenized.append({"task": task["task"], "tokens": valid_token})

In [None]:
print(original_tokenized)

In [14]:
original_vectors = []
for task in original_tokenized:
    vec = np.mean([word2vec[word] for word in task["tokens"]], axis=0)
    original_vectors.append({"task": task["task"], "vector": vec})

In [None]:
print(original_vectors)

#Predict results

In [16]:
results = []

for text in vector_mean:
    for original in original_vectors:
        if text["file"][-5] == original["task"]:
            cosine = scipy.spatial.distance.cosine(text["vector"], original["vector"])
            results.append({"file": text["file"], "distance": cosine * 100})

In [None]:
print(results)

In [18]:
results_df = pd.DataFrame(results)

In [19]:
results_df.head()

Unnamed: 0,file,distance
0,g0pA_taska.txt,1.76267
1,g0pA_taskb.txt,0.498301
2,g0pA_taskc.txt,0.185782
3,g0pA_taskd.txt,1.569027
4,g0pA_taske.txt,0.998652


In [20]:
results_df.to_csv('result.csv', index=False)
from google.colab import files
files.download('result.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Check results

In [21]:
labels = pd.read_excel("./corpus-final09.xls", sheet_name="File list")

In [22]:
labels.head()

Unnamed: 0,File,Group,Person,Task,Category,Native English,Knowledge,Difficulty
0,g0pA_taska.txt,0,A,a,non,native,1,1
1,g0pA_taskb.txt,0,A,b,cut,native,4,3
2,g0pA_taskc.txt,0,A,c,light,native,5,3
3,g0pA_taskd.txt,0,A,d,heavy,native,3,4
4,g0pA_taske.txt,0,A,e,non,native,4,3


In [24]:
results_list = []
for index1, row1 in labels.iterrows():
    for index2, row2 in results_df.iterrows():
        if row1["File"] == row2["file"]:
            results_list.append({"file": row1["File"], "type": row1["Category"], "distance": row2["distance"]})

In [29]:
def get_results(results_list, threshhold):
    false_positive = 0
    false_negative = 0
    true_positive = 0
    true_negative = 0
    total_positive = 0
    total_negative = 0
    for i in range(len(results_list)):
        if results_list[i]["distance"] <= threshhold:
            if results_list[i]["type"] == "non":
                false_positive += 1
                total_negative += 1
            else:
                true_positive += 1
                total_positive += 1
        else:
            if results_list[i]["type"] == "non":
                true_negative += 1
                total_negative += 1
            else:
                false_negative += 1
                total_positive += 1
    return true_positive, true_negative, false_positive, false_negative


In [30]:
def get_score(true_positive, true_negative, false_positive, false_negative):
    accuracy = (true_positive + true_negative) / (total_positive + total_negative)
    precision = true_positive / (true_positive + true_negative)
    recall = true_positive / (true_positive + false_negative)
    f_score = 2 * precision * recall / (precision + recall)
    return accuracy, f_score


In [31]:
true_positive, true_negative, false_positive, false_negative = get_results(results_list, threshhold=1.0)
accuracy, f_score = get_score(true_positive, true_negative, false_positive, false_negative)
print("Total positives: " + str(total_positive))
print("Total negatives: " + str(total_negative))
print("False positive: " + str(false_positive))
print("False negative: " + str(false_negative))
print("True positive: " + str(true_positive))
print("True negative: " + str(true_negative))
print("Accuracy: " + str(accuracy))
print("f_score: " + str(f_score))

Total positives: 57
Total negatives: 38
False positive: 15
False negative: 14
True positive: 43
True negative: 23
Accuracy: 0.6947368421052632
f_score: 0.6991869918699187


In [38]:
bestthreshhold = 1.0
bestscore = 0.6947368421052632
trythreshhold=0.9
while trythreshhold <= 2:
    true_positive, true_negative, false_positive, false_negative = get_results(results_list, threshhold=trythreshhold)
    accuracy, f_score = get_score(true_positive, true_negative, false_positive, false_negative)
    if accuracy > bestscore:
        bestscore = accuracy
        bestthreshhold = trythreshhold
        print(str(trythreshhold) + " beats with score " + str(bestscore))
    trythreshhold += 0.001

0.9920000000000001 beats with score 0.7052631578947368
