In [1]:
from stanfordcorenlp import StanfordCoreNLP
from load_glove_matrix import Glove

In [2]:
# Load the pre-trained 300-dimensional GloVe vectors through the project-local
# Glove helper. Per the recorded output, load_model() reads
# ./data/glove.840B.300d.txt and prints progress while loading.
# `glove_model` is used by the similarity helpers defined further down.
glove_model = Glove()
glove_model.load_model()

Loading pretrained Glove vectors from file ./data/glove.840B.300d.txt
  --9.11%  loaded.
  --18.21%  loaded.
  --27.32%  loaded.
  --36.43%  loaded.
  --45.54%  loaded.
  --54.64%  loaded.
  --63.75%  loaded.
  --72.86%  loaded.
  --81.97%  loaded.
  --91.07%  loaded.
Finished loading Glove model. 2196017 vectors loaded


In [3]:
# Find all unique entries whose part of speech is one of the requested tags.
def find_words_pos(words, tags):
    """Return the distinct entries of ``words`` whose tag appears in ``tags``.

    Every entry is a sequence whose element at index 1 is its tag.
    Duplicate entries are dropped, keeping the first occurrence. The result
    is grouped in the order of ``tags`` and, within one tag, in order of
    first appearance in ``words``.
    """
    distinct = []
    for entry in words:
        if entry in distinct:
            continue
        distinct.append(entry)
    return [entry for tag in tags for entry in distinct if entry[1] == tag]

def find_words(words, trigger):
    """Project one column out of a list of (word, POS) pairs.

    Args:
        words: iterable of pairs; index 0 is the word, index 1 its POS tag.
        trigger: 'word' to collect the words, 'pos' to collect the tags.

    Returns:
        A list with the selected element of every pair, in input order.
        For an unrecognised trigger a message is printed and an empty list
        is returned.
    """
    # Validate once, up front: the trigger never changes per item, and
    # checking it inside the loop let an invalid trigger pass silently
    # whenever `words` was empty.
    if trigger not in ('word', 'pos'):
        print("Invalid trigger should be 'word' or 'pos'")
        return []
    index = 0 if trigger == 'word' else 1
    return [pair[index] for pair in words]

In [4]:
# Read the corpus text file line by line and POS-tag every line with
# Stanford CoreNLP. `pos` accumulates the (token, tag) tuples of all lines.
pos = []
print("Start preparing words...")
# Download CoreNLP 3.9.1 from https://stanfordnlp.github.io/CoreNLP/history.html
# NOTE(review): this is a machine-specific absolute path; point it at your own
# CoreNLP folder before running.
nlp = StanfordCoreNLP(r'C:\Users\sdzar\OneDrive\Documents\GitHub\CGT470-Final\data\stanford-corenlp-full-2018-02-27',memory='8g')
try:
    # Corpus file; you can swap in any other corpus you want.
    with open("./data/sentence.txt", "r", encoding="utf-8") as file:
        for i, line in enumerate(file):
            pos += nlp.pos_tag(line)
            # NOTE(review): the percentage assumes a 10-line corpus (hard-coded 10).
            print("  --{}%  loaded.".format(round(i/10*100, 2)))
finally:
    # Always shut the backend server down, even when tagging fails part-way:
    # it consumes a lot of memory while it is running.
    nlp.close()
print("Done!")

Start preparing words...
  --0.0%  loaded.
  --10.0%  loaded.
  --20.0%  loaded.
  --30.0%  loaded.
  --40.0%  loaded.
  --50.0%  loaded.
  --60.0%  loaded.
  --70.0%  loaded.
Done!


In [5]:
# Show the first ten tagged tokens as a sanity check.
# Slicing instead of indexing 0..9 avoids an IndexError when the corpus
# produced fewer than ten tokens.
for i, tagged in enumerate(pos[:10], start=1):
    print("Pos {}: {}\n".format(i, tagged))

Pos 1: ('I', 'PRP')

Pos 2: ('love', 'VBP')

Pos 3: ('dogs', 'NNS')

Pos 4: ('.', '.')

Pos 5: ('I', 'PRP')

Pos 6: ('hate', 'VBP')

Pos 7: ('dogs', 'NNS')

Pos 8: ('.', '.')

Pos 9: ('I', 'PRP')

Pos 10: ('like', 'VBP')



In [6]:
# POS tags to keep, grouped by coarse class.
tags_unique = [
    'NN', 'NNP', 'NNS', 'NNPS',               # nouns
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # verbs
    'JJ', 'JJR', 'JJS',                       # adjectives
    'RB', 'RBR', 'RBS',                       # adverbs
]
# Distinct (word, tag) pairs of the corpus restricted to those tags.
data_unique = find_words_pos(pos, tags_unique)
print("Unique: \n {}".format(data_unique))
print("Count of unique words: {}".format(len(data_unique)))

Unique: 
 [('police', 'NN'), ('puma', 'NN'), ('night', 'NN'), ('cat', 'NN'), ('yesterday', 'NN'), ('park', 'NN'), ('tree', 'NN'), ('rainbow', 'NN'), ('dogs', 'NNS'), ('puppies', 'NNS'), ('spotted', 'VBD'), ('saw', 'VBD'), ('dying', 'VBG'), ('love', 'VBP'), ('hate', 'VBP'), ('like', 'VBP'), ('is', 'VBZ'), ('last', 'JJ'), ('large', 'JJ'), ('new', 'JJ')]
Count of unique words: 20


In [7]:
# Collapse the fine-grained POS tags into three coarse classes:
# "Noun", "Verb" and "JJRB" (adjectives and adverbs).
tags_noun = ['NN',"NNP",'NNS','NNPS']
tags_verb = ['VB',"VBD",'VBG','VBN', 'VBP','VBZ']
tags_JJRB = ['JJ',"JJR", 'JJS','RB','RBR','RBS']
# The loop variable is deliberately NOT called `pos`: that name holds the
# tagged-token list built earlier in the notebook, and rebinding it here
# broke re-running the cells that read it.
for i, tagged_word in enumerate(data_unique):
    text, tag = tagged_word[0], tagged_word[1]
    if tag in tags_noun:
        data_unique[i] = (text, "Noun")
    elif tag in tags_verb:
        data_unique[i] = (text, "Verb")
    elif tag in tags_JJRB:
        data_unique[i] = (text, "JJRB")
print("Unique: \n {}".format(data_unique))
print("Count of unique words: {}".format(len(data_unique)))

Unique: 
 [('police', 'Noun'), ('puma', 'Noun'), ('night', 'Noun'), ('cat', 'Noun'), ('yesterday', 'Noun'), ('park', 'Noun'), ('tree', 'Noun'), ('rainbow', 'Noun'), ('dogs', 'Noun'), ('puppies', 'Noun'), ('spotted', 'Verb'), ('saw', 'Verb'), ('dying', 'Verb'), ('love', 'Verb'), ('hate', 'Verb'), ('like', 'Verb'), ('is', 'Verb'), ('last', 'JJRB'), ('large', 'JJRB'), ('new', 'JJRB')]
Count of unique words: 20


In [8]:
# Calculate the similarity between two words.
def similarity(w1, w2):
    """Return ``[w1, w2, score]``, where score is ``glove_model.similarity(w1, w2)``."""
    return [w1, w2, glove_model.similarity(w1, w2)]

In [9]:
def sim(data_unique, threshold=0.5):
    """Find every ordered pair of entries whose similarity exceeds ``threshold``.

    Args:
        data_unique: sequence of (word, POS) entries; index 0 is the word,
            index 1 its POS label.
        threshold: minimum score (exclusive) a pair needs to be kept.
            Defaults to 0.5, the value that used to be hard-coded.

    Returns:
        A list of 4-tuples ``(word1, pos_of_word1, word2, value)``:
            [0] word1
            [1] POS of word1
            [2] word2
            [3] similarity value from ``glove_model.similarity(word1, word2)``
        Each entry is compared with every other entry (same index skipped),
        so a matching pair is reported once in each direction.
    """
    sim_list = []
    for i, word1 in enumerate(data_unique):
        for j, word2 in enumerate(data_unique):
            if i == j:
                continue  # never compare an entry with itself
            value = glove_model.similarity(word1[0], word2[0])
            if value > threshold:
                sim_list.append((word1[0], word1[1], word2[0], value))
    return sim_list

In [10]:
# Count the distinct source words (index 0) that appear in a list of similar pairs.
def count_corpus(sim_list):
    """Return how many different values occur at index 0 of the entries in ``sim_list``."""
    seen = []
    for entry in sim_list:
        head = entry[0]
        if head in seen:
            continue
        seen.append(head)
    return len(seen)

In [11]:
# Test: find the similar words within the unique word list.
# sim() compares every entry with every other entry, so run it once and
# reuse the result instead of calling it a second time for the count.
similar_pairs = sim(data_unique)
print(similar_pairs)
print("Unique words: {}".format(len(data_unique)))
print("Words that have similarity value > 0.5 within the corpus: {}".format(count_corpus(similar_pairs)))

[('night', 'Noun', 'yesterday', 0.6068112818261747), ('night', 'Noun', 'saw', 0.5092502041657017), ('night', 'Noun', 'last', 0.6298856667608155), ('cat', 'Noun', 'dogs', 0.6921647999302281), ('cat', 'Noun', 'puppies', 0.5781784202694006), ('yesterday', 'Noun', 'night', 0.6068112818261747), ('yesterday', 'Noun', 'saw', 0.5903825419692302), ('yesterday', 'Noun', 'last', 0.7509270479250978), ('dogs', 'Noun', 'cat', 0.6921647999302281), ('dogs', 'Noun', 'puppies', 0.7978993201086826), ('puppies', 'Noun', 'cat', 0.5781784202694006), ('puppies', 'Noun', 'dogs', 0.7978993201086826), ('saw', 'Verb', 'night', 0.5092502041657017), ('saw', 'Verb', 'yesterday', 0.5903825419692302), ('saw', 'Verb', 'like', 0.5795228911657166), ('saw', 'Verb', 'last', 0.6219964413928809), ('love', 'Verb', 'hate', 0.6393098823113965), ('love', 'Verb', 'like', 0.65790401180881), ('hate', 'Verb', 'love', 0.6393098823113965), ('hate', 'Verb', 'like', 0.6574651482527226), ('like', 'Verb', 'saw', 0.5795228911657166), ('li

In [17]:
def find_unique(final_set):
    """Return ``(entry[0], entry[1])`` for the first entry seen with each distinct ``entry[0]``.

    Uniqueness is decided on index 0 only; later entries that repeat an
    already-seen index-0 value are ignored. Order of first appearance is kept.
    """
    seen_words = []
    pairs = []
    for row in final_set:
        key = row[0]
        if key in seen_words:
            continue
        seen_words.append(key)
        pairs.append((key, row[1]))
    return pairs

In [18]:
# Data for the CGT project: every similar pair, the distinct source words,
# and the pairs split by the coarse POS class of the source word.
final_set = sim(data_unique)
final_unique = find_unique(final_set)
data_noun, data_verb, data_JJRB = (
    find_words_pos(final_set, [label]) for label in ('Noun', 'Verb', 'JJRB')
)

# Summary: the sets themselves first, then their sizes.
print("Count of final set: {}".format(len(final_set)))
print("Unique set: \n {}".format(final_unique))
print("Noun: \n {}".format(data_noun))
print("Verb: \n {}".format(data_verb))
print("Adjective & adverb: \n {}".format(data_JJRB))
print("Count of Noun: {}".format(len(data_noun)))
print("Count of Verb: {}".format(len(data_verb)))
print("Count of Adjective & adverb: {}".format(len(data_JJRB)))

Count of final set: 26
Unique set: 
 [('night', 'Noun'), ('cat', 'Noun'), ('yesterday', 'Noun'), ('dogs', 'Noun'), ('puppies', 'Noun'), ('saw', 'Verb'), ('love', 'Verb'), ('hate', 'Verb'), ('like', 'Verb'), ('last', 'JJRB')]
Noun: 
 [('night', 'Noun', 'yesterday', 0.6068112818261747), ('night', 'Noun', 'saw', 0.5092502041657017), ('night', 'Noun', 'last', 0.6298856667608155), ('cat', 'Noun', 'dogs', 0.6921647999302281), ('cat', 'Noun', 'puppies', 0.5781784202694006), ('yesterday', 'Noun', 'night', 0.6068112818261747), ('yesterday', 'Noun', 'saw', 0.5903825419692302), ('yesterday', 'Noun', 'last', 0.7509270479250978), ('dogs', 'Noun', 'cat', 0.6921647999302281), ('dogs', 'Noun', 'puppies', 0.7978993201086826), ('puppies', 'Noun', 'cat', 0.5781784202694006), ('puppies', 'Noun', 'dogs', 0.7978993201086826)]
Verb: 
 [('saw', 'Verb', 'night', 0.5092502041657017), ('saw', 'Verb', 'yesterday', 0.5903825419692302), ('saw', 'Verb', 'like', 0.5795228911657166), ('saw', 'Verb', 'last', 0.62199644

In [40]:
import json


# Export the similarity results to data.json as a three-level tree:
#   Words -> POS class (NOUN / VERB / ADV_ADJ) -> source word -> similar words.
def _build_pos_node(node_name, pos_label, unique_words, similar_pairs):
    """Build the tree node for one coarse POS class.

    Args:
        node_name: value of the node's 'name' key (e.g. "NOUN").
        pos_label: POS label an entry of ``unique_words`` must carry at
            index 1 to be placed under this node (e.g. "Noun").
        unique_words: (word, POS label) entries, one per distinct source word.
        similar_pairs: (word1, POS, word2, similarity) tuples for this class.

    Returns:
        ``{'name': node_name, 'children': [...]}`` with one child per
        matching source word; each child lists the similar words of that
        source word as ``{'name': word2, 'similarity': value}``.
    """
    children = []
    for entry in unique_words:
        text, label = entry[0], entry[1]
        if label != pos_label:
            continue
        children.append({
            # The name is set unconditionally, so a node can never end up
            # with a blank name even if it has no similar words.
            'name': text,
            'children': [
                {'name': pair[2], 'similarity': pair[3]}
                for pair in similar_pairs
                if pair[0] == text
            ],
        })
    return {'name': node_name, 'children': children}


noun = _build_pos_node("NOUN", "Noun", final_unique, data_noun)
verb = _build_pos_node("VERB", "Verb", final_unique, data_verb)
JJRB = _build_pos_node("ADV_ADJ", "JJRB", final_unique, data_JJRB)
word = {
        'name': "Words",
        'children': [noun, verb, JJRB]
        }

# Build the complete tree before opening the file, so that a failure above
# cannot leave an empty or truncated data.json behind.
with open('data.json','w') as out:
    json.dump(word, out)
