# Downloads and Imports

## Download libs

In [None]:
!pip install numpy
!pip install matplotlib
!pip install scikit-learn
!pip install gensim
!pip install nltk

## Import

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import nltk

import os.path

from nltk.tokenize import word_tokenize
import string
import re
import pandas as pd
import scipy


## Download data

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
glove_file = datapath('/content/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

In [4]:
word2vec = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [None]:
word2vec.distance("object", "oriented")

# Loading data

In [None]:
texts = []

for i in range(5):
    first = ['A', 'B', 'C', 'D', 'E']
    for j in first:
        second = ['a', 'b', 'c', 'd', 'e']
        for k in second:
            filename = "g" + str(i) + "p" + j + "_task" + k + ".txt"
            filepath = "/content/data/" + filename
            if os.path.isfile(filepath) == True:
                reader = open(filepath, "r")
                data = reader.read()
                texts.append({"file": filename, "data": data})
print(texts)


# Convert to tokens


In [44]:
tokenized_text = []
for text in texts:
    ans_remove_punc = re.sub(r'[^\w\s]',' ', text["data"])
    ans = word_tokenize(ans_remove_punc)
    valid_token = []
    for i in range(len(ans)):
        ans[i] = ans[i].lower()
        if ans[i] in word2vec.vocab and ans[i] not in STOPWORDS:
            valid_token.append(ans[i])
        else:
            print("Throwing " + ans[i] + " from " + text["file"])
    tokenized_text.append({"file": text["file"], "tokens": valid_token})

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Throwing as from g2pA_taskd.txt
Throwing to from g2pA_taskd.txt
Throwing as from g2pA_taskd.txt
Throwing this from g2pA_taskd.txt
Throwing is from g2pA_taskd.txt
Throwing is from g2pA_taske.txt
Throwing a from g2pA_taske.txt
Throwing very from g2pA_taske.txt
Throwing in from g2pA_taske.txt
Throwing for from g2pA_taske.txt
Throwing or from g2pA_taske.txt
Throwing on from g2pA_taske.txt
Throwing the from g2pA_taske.txt
Throwing at from g2pA_taske.txt
Throwing each from g2pA_taske.txt
Throwing a from g2pA_taske.txt
Throwing of from g2pA_taske.txt
Throwing such from g2pA_taske.txt
Throwing an from g2pA_taske.txt
Throwing is from g2pA_taske.txt
Throwing to from g2pA_taske.txt
Throwing with from g2pA_taske.txt
Throwing we from g2pA_taske.txt
Throwing can from g2pA_taske.txt
Throwing our from g2pA_taske.txt
Throwing own from g2pA_taske.txt
Throwing which from g2pA_taske.txt
Throwing for from g2pA_taske.txt
Throwing all from g2pA

In [45]:
print(tokenized_text)

[{'file': 'g0pA_taska.txt', 'tokens': ['inheritance', 'basic', 'concept', 'object', 'oriented', 'programming', 'basic', 'idea', 'create', 'new', 'classes', 'add', 'extra', 'detail', 'existing', 'classes', 'done', 'allowing', 'new', 'classes', 'reuse', 'methods', 'variables', 'existing', 'classes', 'new', 'methods', 'classes', 'added', 'specialise', 'new', 'class', 'inheritance', 'models', 'kind', 'relationship', 'entities', 'objects', 'example', 'postgraduates', 'undergraduates', 'kinds', 'student', 'kind', 'relationship', 'visualised', 'tree', 'structure', 'student', 'would', 'general', 'root', 'node', 'postgraduate', 'undergraduate', 'would', 'specialised', 'extensions', 'student', 'node', 'child', 'nodes', 'relationship', 'student', 'would', 'known', 'superclass', 'parent', 'class', 'whereas', 'postgraduate', 'would', 'known', 'subclass', 'child', 'class', 'postgraduate', 'class', 'extends', 'student', 'class', 'inheritance', 'occur', 'several', 'layers', 'visualised', 'would', 'dis

# Find mean of word embeddings

In [46]:
vector_mean = []

for i in tokenized_text:
    vec = np.mean([word2vec[word] for word in i["tokens"]], axis=0)
    vector_mean.append({"file": i["file"], "vector": vec})

print(vector_mean)

[{'file': 'g0pA_taska.txt', 'vector': array([ 0.03682747,  0.4235451 , -0.18983568,  0.14155266, -0.01349626,
        0.09335126, -0.08435374,  0.19299449, -0.10363588,  0.42766824,
       -0.0384862 , -0.08065659,  0.2803837 ,  0.16030513,  0.09227832,
       -0.21437429,  0.06746031,  0.25355902, -0.12599465,  0.21021682,
       -0.2708714 , -0.21977133,  0.16757837, -0.06905884,  0.04969931,
       -0.34809905,  0.05586145, -0.46257317, -0.15549439, -0.00644431,
       -0.24425776,  0.6018561 , -0.22803889, -0.16131517,  0.04302749,
        0.2069174 , -0.02576416,  0.06138017, -0.17375913, -0.03900371,
       -0.28853452, -0.20816101, -0.07688361, -0.13293554, -0.04011067,
        0.05398416,  0.12196999,  0.07961066, -0.04655442, -0.16043346,
       -0.00780001, -0.12379491, -0.02366173,  0.6069876 ,  0.01406657,
       -1.359393  ,  0.17385918, -0.32367325,  1.3125274 ,  0.30284458,
       -0.10573031,  0.4427292 , -0.05222062,  0.00500962,  0.6737335 ,
        0.05240851,  0.156

In [47]:
print(vector_mean[0]["file"][-5])

a


#Load original text

In [None]:
original_text = []
tasks = ['a', 'b', 'c', 'd', 'e']
for i in tasks:
    filename = "orig_task" + i + ".txt"
    filepath = "/content/data/" + filename
    reader = open(filepath, "r")
    data = reader.read()
    original_text.append({"task": i, "data": data})
print(original_text)

In [None]:
original_tokenized = []
for task in original_text:
    ans_remove_punc = re.sub(r'[^\w\s]',' ', task["data"])
    ans = word_tokenize(ans_remove_punc)
    valid_token = []
    for i in range(len(ans)):
        ans[i] = ans[i].lower()
        if ans[i] in word2vec.vocab and ans[i] not in STOPWORDS:
            valid_token.append(ans[i])
        else:
            print("Throwing " + ans[i] + " from " + task["task"])
    original_tokenized.append({"task": task["task"], "tokens": valid_token})

In [41]:
print(original_tokenized)

[{'task': 'a', 'tokens': ['object', 'oriented', 'programming', 'inheritance', 'way', 'form', 'new', 'classes', 'instances', 'called', 'objects', 'using', 'classes', 'already', 'defined', 'inheritance', 'concept', 'invented', '1967', 'simula', 'new', 'classes', 'known', 'derived', 'classes', 'take', 'inherit', 'attributes', 'behavior', 'pre', 'existing', 'classes', 'referred', 'base', 'classes', 'ancestor', 'classes', 'intended', 'help', 'reuse', 'existing', 'code', 'little', 'modification', 'inheritance', 'provides', 'support', 'representation', 'categorization', 'computer', 'languages', 'categorization', 'powerful', 'mechanism', 'number', 'information', 'processing', 'crucial', 'human', 'learning', 'means', 'generalization', 'known', 'specific', 'entities', 'applied', 'wider', 'group', 'given', 'belongs', 'relation', 'established', 'cognitive', 'economy', 'less', 'information', 'needs', 'stored', 'specific', 'entity', 'particularities', 'inheritance', 'also', 'sometimes', 'called', 'g

In [42]:
original_vectors = []
for task in original_tokenized:
    vec = np.mean([word2vec[word] for word in task["tokens"]], axis=0)
    original_vectors.append({"task": task["task"], "vector": vec})

In [None]:
print(original_vectors)

#Predict results

In [48]:
results = []

for text in vector_mean:
    for original in original_vectors:
        if text["file"][-5] == original["task"]:
            cosine = scipy.spatial.distance.cosine(text["vector"], original["vector"])
            results.append({"file": text["file"], "distance": cosine * 100})

In [49]:
print(results)

[{'file': 'g0pA_taska.txt', 'distance': 6.829279661178589}, {'file': 'g0pA_taskb.txt', 'distance': 1.958930492401123}, {'file': 'g0pA_taskc.txt', 'distance': 0.4478871822357178}, {'file': 'g0pA_taskd.txt', 'distance': 6.83475136756897}, {'file': 'g0pA_taske.txt', 'distance': 2.0473241806030273}, {'file': 'g0pB_taska.txt', 'distance': 4.994392395019531}, {'file': 'g0pB_taskb.txt', 'distance': 3.2808244228363037}, {'file': 'g0pB_taskc.txt', 'distance': 0.3952443599700928}, {'file': 'g0pB_taskd.txt', 'distance': 1.9653558731079102}, {'file': 'g0pB_taske.txt', 'distance': 1.2279689311981201}, {'file': 'g0pC_taska.txt', 'distance': 2.8163671493530273}, {'file': 'g0pC_taskb.txt', 'distance': 3.499370813369751}, {'file': 'g0pC_taskc.txt', 'distance': 3.90017032623291}, {'file': 'g0pC_taskd.txt', 'distance': 3.792470693588257}, {'file': 'g0pC_taske.txt', 'distance': 3.246563673019409}, {'file': 'g0pD_taska.txt', 'distance': 1.1174380779266357}, {'file': 'g0pD_taskb.txt', 'distance': 4.68094944

In [50]:
results_df = pd.DataFrame(results)

In [51]:
results_df.head()

Unnamed: 0,file,distance
0,g0pA_taska.txt,6.82928
1,g0pA_taskb.txt,1.95893
2,g0pA_taskc.txt,0.447887
3,g0pA_taskd.txt,6.834751
4,g0pA_taske.txt,2.047324


In [66]:
results_df.to_csv('result.csv', index=False)
from google.colab import files
files.download('result.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Check results

In [53]:
labels = pd.read_excel("./corpus-final09.xls", sheet_name="File list")

In [54]:
labels.head()

Unnamed: 0,File,Group,Person,Task,Category,Native English,Knowledge,Difficulty
0,g0pA_taska.txt,0,A,a,non,native,1,1
1,g0pA_taskb.txt,0,A,b,cut,native,4,3
2,g0pA_taskc.txt,0,A,c,light,native,5,3
3,g0pA_taskd.txt,0,A,d,heavy,native,3,4
4,g0pA_taske.txt,0,A,e,non,native,4,3


In [55]:
results_list = []
for index1, row1 in labels.iterrows():
    for index2, row2 in results_df.iterrows():
        if row1["File"] == row2["file"]:
            results_list.append({"file": row1["File"], "type": row1["Category"], "distance": row2["distance"]})

In [56]:
def get_results(results_list, threshhold):
    false_positive = 0
    false_negative = 0
    true_positive = 0
    true_negative = 0
    total_positive = 0
    total_negative = 0
    for i in range(len(results_list)):
        if results_list[i]["distance"] <= threshhold:
            if results_list[i]["type"] == "non":
                false_positive += 1
                total_negative += 1
            else:
                true_positive += 1
                total_positive += 1
        else:
            if results_list[i]["type"] == "non":
                true_negative += 1
                total_negative += 1
            else:
                false_negative += 1
                total_positive += 1
    return true_positive, true_negative, false_positive, false_negative


In [57]:
def get_score(true_positive, true_negative, false_positive, false_negative):
    accuracy = (true_positive + true_negative) / (total_positive + total_negative)
    precision = true_positive / (true_positive + true_negative)
    recall = true_positive / (true_positive + false_negative)
    f_score = 2 * precision * recall / (precision + recall)
    return accuracy, f_score


In [67]:
true_positive, true_negative, false_positive, false_negative = get_results(results_list, threshhold=1.9653999999998937)
accuracy, f_score = get_score(true_positive, true_negative, false_positive, false_negative)
print("Total positives: " + str(total_positive))
print("Total negatives: " + str(total_negative))
print("False positive: " + str(false_positive))
print("False negative: " + str(false_negative))
print("True positive: " + str(true_positive))
print("True negative: " + str(true_negative))
print("Accuracy: " + str(accuracy))
print("f_score: " + str(f_score))

Total positives: 57
Total negatives: 38
False positive: 0
False negative: 25
True positive: 32
True negative: 38
Accuracy: 0.7368421052631579
f_score: 0.5039370078740157


In [65]:
bestthreshhold = 3.0
bestscore = 0.7157894736842105
trythreshhold=1.0
while trythreshhold <= 6:
    true_positive, true_negative, false_positive, false_negative = get_results(results_list, threshhold=trythreshhold)
    accuracy, f_score = get_score(true_positive, true_negative, false_positive, false_negative)
    if accuracy > bestscore:
        bestscore = accuracy
        bestthreshhold = trythreshhold
        print(str(trythreshhold) + " beats with score " + str(bestscore))
    trythreshhold += 0.0001

1.9589999999998944 beats with score 0.7263157894736842
1.9653999999998937 beats with score 0.7368421052631579
