In [1]:
from stanfordcorenlp import StanfordCoreNLP
from load_glove_matrix import Glove

In [2]:
# load pre-trained Glove vectors, 300d
glove_model = Glove()
glove_model.load_model()

Loading pretrained Glove vectors from file C:/Users/sdzar/Documents/GitHub/CNIT581-NLT/data/glove.840B.300d.txt
  --9.11%  loaded.
  --18.21%  loaded.
  --27.32%  loaded.
  --36.43%  loaded.
  --45.54%  loaded.
  --54.64%  loaded.
  --63.75%  loaded.
  --72.86%  loaded.
  --81.97%  loaded.
  --91.07%  loaded.
Finished loading Glove model. 2196017 vectors loaded


In [13]:
# find all unique words and part of speach
def find_words_pos(words, tags): 
    uniqueWords = []
    posWords = []
    for i in words:
        if not i in uniqueWords:
            uniqueWords.append(i)
    for tag in tags:
        for word in uniqueWords:
            if word[1] == tag:
                posWords.append(word)
            else:
                continue
    return posWords

def find_words(words, trigger):
    l = []
    for word in words:
        if trigger not in ('word', 'pos'):
            print("Invalid trigger should be 'word' or 'pos'")
            break
        elif trigger == 'word':
            l.append(word[0])
        elif trigger == 'pos':
            l.append(word[1])
    return l

In [4]:
# make the twitter .csv as a .txt file and read all words from the file
pos = []
print("Start preparing words...")
# download CoreNLP 3.9.1 from https://stanfordnlp.github.io/CoreNLP/history.html
nlp = StanfordCoreNLP(r'C:\Users\sdzar\Documents\GitHub\CNIT581-NLT\data\stanford-corenlp-full-2018-02-27',memory='8g')
# this is the wiki corpus, you can change it to any other curpos you want
with open("./data/sentence.txt", "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        pos += nlp.pos_tag(line)
        if (i+1) % 1 == 0:
            print("  --{}%  loaded.".format(round(i/10*100, 2)))
nlp.close() # Do not forget to close! The backend server will consume a lot memery.
print("Done!")

Start preparing words...
  --0.0%  loaded.
  --10.0%  loaded.
  --20.0%  loaded.
  --30.0%  loaded.
  --40.0%  loaded.
  --50.0%  loaded.
  --60.0%  loaded.
  --70.0%  loaded.
Done!


In [5]:
# examples of result
for i in range(0,10):
    print("Pos {}: {}\n".format(i+1,pos[i]))
print(uniqueWords)

Pos 1: ('I', 'PRP')

Pos 2: ('love', 'VBP')

Pos 3: ('dogs', 'NNS')

Pos 4: ('.', '.')

Pos 5: ('I', 'PRP')

Pos 6: ('hate', 'VBP')

Pos 7: ('dogs', 'NNS')

Pos 8: ('.', '.')

Pos 9: ('I', 'PRP')

Pos 10: ('like', 'VBP')

[]


In [6]:
tags_unique = ['NN',"NNP",'NNS','NNPS','VB',"VBD",'VBG','VBN', 'VBP','VBZ','JJ',"JJR", 'JJS','RB','RBR','RBS']
data_unique = find_words_pos(pos, tags_unique)
print("Unique: \n {}".format(data_unique))
print("Count of unique words: {}".format(len(data_unique)))

Unique: 
 [('police', 'NN'), ('puma', 'NN'), ('night', 'NN'), ('cat', 'NN'), ('yesterday', 'NN'), ('park', 'NN'), ('tree', 'NN'), ('rainbow', 'NN'), ('dogs', 'NNS'), ('puppies', 'NNS'), ('spotted', 'VBD'), ('saw', 'VBD'), ('dying', 'VBG'), ('love', 'VBP'), ('hate', 'VBP'), ('like', 'VBP'), ('is', 'VBZ'), ('last', 'JJ'), ('large', 'JJ'), ('new', 'JJ')]
Count of unique words: 20


In [7]:
# calculate the similarity between two words
def similarity(w1,w2):
    value = glove_model.similarity(w1,w2)
    return [w1,w2,value]

In [8]:
def sim(data_unique):
    sim_list = []
    for i, word1 in  enumerate(data_unique):
        counter = 1
        for j, word2 in enumerate(data_unique):
            if i == j: continue
            value = glove_model.similarity(word1[0],word2[0])
            if value > 0.45: # Change the threshold
                sim_list.append((word1[0],word1[1],word2[0],value))
                counter += 1
    return sim_list 
# sim_list[i][0] - word1
# sim_list[i][1] - POS of word1
# sim_list[i][2] - word2
# sim_list[i][3] - similarity value

In [9]:
# this method is used to count unique word that has similarity value > 0.5 within the corpus
def count_corpus(sim_list):
    unique = []
    for word in sim_list:
        if word[0] not in unique:
            unique.append(word[0])
    return len(unique)

In [10]:
# Testing for finding similar words in unique word list
print(sim(data_unique))
print("Unique words: {}".format(len(data_unique)))
print("Words that have similarity value > 0.5 within the corpus: {}".format(count_corpus(sim(data_unique))))

[('night', 'NN', 'yesterday', 0.6068112818261747), ('night', 'NN', 'saw', 0.5092502041657018), ('night', 'NN', 'last', 0.6298856667608155), ('cat', 'NN', 'dogs', 0.692164799930228), ('cat', 'NN', 'puppies', 0.5781784202694006), ('yesterday', 'NN', 'night', 0.6068112818261747), ('yesterday', 'NN', 'saw', 0.5903825419692303), ('yesterday', 'NN', 'last', 0.7509270479250979), ('dogs', 'NNS', 'cat', 0.692164799930228), ('dogs', 'NNS', 'puppies', 0.7978993201086827), ('puppies', 'NNS', 'cat', 0.5781784202694006), ('puppies', 'NNS', 'dogs', 0.7978993201086827), ('spotted', 'VBD', 'saw', 0.4791552074540715), ('saw', 'VBD', 'night', 0.5092502041657018), ('saw', 'VBD', 'yesterday', 0.5903825419692303), ('saw', 'VBD', 'spotted', 0.4791552074540715), ('saw', 'VBD', 'love', 0.46734499221430786), ('saw', 'VBD', 'like', 0.5795228911657166), ('saw', 'VBD', 'last', 0.6219964413928809), ('dying', 'VBG', 'like', 0.46911952084954556), ('love', 'VBP', 'saw', 0.46734499221430786), ('love', 'VBP', 'hate', 0.

In [16]:
# this part is for the CGT project
final_set = sim(data_unique)
tags_noun = ['NN',"NNP",'NNS','NNPS']
data_noun = find_words_pos(final_set, tags_noun)
tags_verb = ['VB',"VBD",'VBG','VBN', 'VBP','VBZ']
data_verb = find_words_pos(final_set, tags_verb)
tags_JJRB = ['JJ',"JJR", 'JJS','RB','RBR','RBS']
data_JJRB = find_words_pos(final_set, tags_JJRB)
# word = find_words(data, 'word')
# tag = find_words(data, 'pos')
print("Count of final set: {}".format(len(final_set)))
print("Noun: \n {}".format(data_noun))
print("Verb: \n {}".format(data_verb))
print("Adjective & adverb: \n {}".format(data_JJRB))
print("Count of Noun: {}".format(len(data_noun)))
print("Count of Verb: {}".format(len(data_verb)))
print("Count of Adjective & adverb: {}".format(len(data_JJRB)))

Count of final set: 34
Noun: 
 [('night', 'NN', 'yesterday', 0.6068112818261747), ('night', 'NN', 'saw', 0.5092502041657018), ('night', 'NN', 'last', 0.6298856667608155), ('cat', 'NN', 'dogs', 0.692164799930228), ('cat', 'NN', 'puppies', 0.5781784202694006), ('yesterday', 'NN', 'night', 0.6068112818261747), ('yesterday', 'NN', 'saw', 0.5903825419692303), ('yesterday', 'NN', 'last', 0.7509270479250979), ('dogs', 'NNS', 'cat', 0.692164799930228), ('dogs', 'NNS', 'puppies', 0.7978993201086827), ('puppies', 'NNS', 'cat', 0.5781784202694006), ('puppies', 'NNS', 'dogs', 0.7978993201086827)]
Verb: 
 [('spotted', 'VBD', 'saw', 0.4791552074540715), ('saw', 'VBD', 'night', 0.5092502041657018), ('saw', 'VBD', 'yesterday', 0.5903825419692303), ('saw', 'VBD', 'spotted', 0.4791552074540715), ('saw', 'VBD', 'love', 0.46734499221430786), ('saw', 'VBD', 'like', 0.5795228911657166), ('saw', 'VBD', 'last', 0.6219964413928809), ('dying', 'VBG', 'like', 0.46911952084954556), ('love', 'VBP', 'saw', 0.467344

In [12]:
#to Json file
class Node(object):
    def _init_(self, name, size = None, extra = None):
        self.name = name
        self.children = []
        self.size = size
        self.extra = extra

    def as_dict(self):
        res = {'name':self.name}

        return res

with open('trying_so_hard.json','w') as out:
    root = {'name': 'Words','children': []}

    if data_noun[0] in ('N','V','J','R'):
        root['children'].append(
            {
            'name': "NOUN", 
            'children': [data_noun[1]]
            }
        )
    if data_verb[0] in ('N','V','J','R'):
        root['children'].append(
            {
            'name': 'VERB', 
            'children': [data_verb[1]]
            }
        )
    if data_JJRB[0] in ('N','V','J','R'):
        root['children'].append(
            {
            'name': 'ADV_ADJ', 
            'children': [data_JJRB[1]]
            }
        )
    json.dump(root, out)

NameError: name 'json' is not defined