In [79]:
#required libraries
import nltk
import pandas as pd
from pdfminer.high_level import extract_text
import pickle
import re
import os
from cleantext import clean
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag, map_tag
from nltk import RegexpParser
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

from allennlp.predictors.predictor import Predictor
import allennlp_models.structured_prediction
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mrladidadi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mrladidadi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/mrladidadi/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [80]:
#uncomment to extract keywords from dictionary glossary pdf

# keyword = extract_text('kw_only.pdf')
# keyword = keyword.replace('\t',' ').replace('\x0c','')
# keyword = keyword[keyword.index('10 Base 2'):].lower().split('\n\n')
# pd.DataFrame(data = keyword).to_csv('kw_array.csv')

In [81]:
#read in definition keywords previously extracted from dictionary glossary
keyword = pd.read_csv('kw_array.csv')
keyword = keyword.dropna(how='any')
keyword = list(keyword['0'])

In [82]:
#read in definition sentences previously extracted from dictionary body
definition = open("raw_defs.txt", "r").read()
#clean various formatting characters
definition = definition.replace('\t',' ').replace('\x0c','').lower()
def_array = definition.split('\n\n')
for i in range(len(def_array)):
    def_array[i] = def_array[i].replace('\n', ' ')

In [83]:
#remove definitions referring to acronyms of other definitions
z = 0
while z < len(def_array):
    if 'abbrev.' in def_array[z]:
        del def_array[z]
    z+=1

In [84]:
#create array of definition objects by combining keywords and definition lists
master = []
for h in range(len(keyword)):
    temp = []
    for i in range(len(def_array)):
        if keyword[h] in def_array[i][0:len(keyword[h])]: #ensure def matches the kw
            excl_kw = def_array[i][def_array[i].index(keyword[h])+len(keyword[h])+1:] #cut off kw from def
            if excl_kw.find('.') != -1:
                excl_kw = excl_kw[0:excl_kw.index('.')]
            if len(excl_kw.split(" ")) < 5 and "see" in excl_kw: #exclude def if too short or refers to another definition
                break
            else:
                temp.append(excl_kw)
    if len(temp)==1:
        master.append( { 'kw':keyword[h], 'def':temp[0], 'def_clean':''} )

In [85]:
#corpus-specific cleaning conditions
for i in range(len(master)):
    #for keywords with multiple definitions; extract the first only
    one_find = master[i]['def'].find('1.')
    two_find = master[i]['def'].find('2.')
    if one_find != -1 and two_find != -1:
        master[i]['def'] = master[i]['def'][one_find+2:two_find]
        #clean grammar
    master[i]['def_clean'] = re.sub(r'\([^)]*\)', '', master[i]['def'])
    master[i]['def_clean'] = clean(master[i]['def_clean'],no_punct=True, lang="en")

In [86]:
#lematises / removes stopwords from given string
def lemat_sent(sent):
    corpus = []
    for i in sent:
        text_data = nltk.word_tokenize(i['def_clean'])
        wl = WordNetLemmatizer()
        text_data = [wl.lemmatize(word) for word in text_data if not word in stopwords.words('english')]
        text_data = ' '.join(text_data)
        corpus.append(text_data)
    return corpus

In [87]:
lemat_sent = lemat_sent(master)

In [88]:
#put all definition sentences in one string array
orig_sent = []
for i in range(len(master)):
    orig_sent.append(master[i]['def_clean'])

In [89]:
#Part-Of-Speech tagger
def pos_tag(sent):
    final = []
    for i in range(len(sent)):
        split = nltk.word_tokenize(sent[i])
        pos = nltk.pos_tag(split)
        sim_pos = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in pos] #uses simpler universal tags
        pos_arr = [i[1] for i in sim_pos]
        final.append(pos_arr) 
    return final

In [90]:
orig_sent_pos = pos_tag(orig_sent)
lemat_sent_pos = pos_tag(lemat_sent)

In [93]:
#finds highest frequency POS sequences of x length
def find_pattern_adv(pos, num):
    final = []
    for i in range(len(pos)):
        
        cat_str = ''
        
        if len(pos[i]) > num:
            for j in range(num):
                cat_str += pos[i][j] + " "
                
            final.append(cat_str)

    return pd.Series(final).value_counts()   

In [98]:
find_pattern_adv(orig_sent_pos, 8)

DET NOUN ADP NOUN ADP DET DET NOUN       16
DET NOUN ADP DET NOUN ADP DET NOUN       16
DET NOUN ADP VERB DET NOUN ADP DET       15
DET NOUN ADP DET NOUN NOUN DET VERB      15
DET NOUN VERB PRT VERB DET NOUN ADP      14
                                         ..
DET NOUN NOUN ADP DET NOUN CONJ ADJ       1
DET ADJ NOUN ADV DET ADJ NOUN VERB        1
DET ADJ NOUN NOUN ADV VERB ADP DET        1
DET NOUN VERB PRT VERB DET NOUN VERB      1
DET ADV ADJ NOUN ADP NOUN NOUN ADP        1
Length: 2159, dtype: int64

In [19]:
#these functions are desgined to parse the heavily nested AllenNLP constituency parse array
#reduces to a single string per level

def tree_lvl1(main):
    main = main['hierplane_tree']['root']['children']
    final = []
    for i in range(len(main)):
        final.append(main[i]['nodeType'])
    return final

def tree_lvl2(main):
    main = main['hierplane_tree']['root']['children']
    final = []
    for i in range(len(main)):
        if 'children' in main[i]:
            for j in range(len(main[i]['children'])):
                final.append(main[i]['children'][j]['nodeType'])
        else:
            final.append('?')
    return final

def tree_lvl3(main):
    main = main['hierplane_tree']['root']['children']
    final = []
    for i in range(len(main)):
        if 'children' in main[i]:
            for j in range(len(main[i]['children'])):
                if 'children' in main[i]['children'][j]:
                    for k in range(len(main[i]['children'][j]['children'])):
                        final.append(main[i]['children'][j]['children'][k]['nodeType'])
                else:
                    final.append('?')
        else:
             final.append('?')
    return final

def tree_lvl4(main):
    main = main['hierplane_tree']['root']['children']
    final = []
    for i in range(len(main)):
        if 'children' in main[i]:
            for j in range(len(main[i]['children'])):
                if 'children' in main[i]['children'][j]:
                    for k in range(len(main[i]['children'][j]['children'])):
                        if 'children' in main[i]['children'][j]['children'][k]:
                            for m in range(len(main[i]['children'][j]['children'][k]['children'])):
                                final.append(main[i]['children'][j]['children'][k]['children'][m]['nodeType'])
                        else:
                            final.append('?')
                else:
                    final.append('?')
        else:
            final.append('?')
    return final

def build_levels(main):
    return {
        'l1':tree_lvl1(main),
        'l2':tree_lvl2(main),
        'l3':tree_lvl3(main),
        'l4':tree_lvl4(main)
    }

In [24]:
#prepends keyword + "is" to definitions starting with DET
FINAL = []
for i in range(len(master)):
    FINAL.append( {'kw':master[i]['kw'],'sent':master[i]['def_clean'], 'pos':orig_sent_pos[i]} )
COMBINE = []
for i in range(len(FINAL)):
    if len(FINAL[i]['pos'])>=1 and FINAL[i]['pos'][0] == 'DET':
        COMBINE.append( {'sent':FINAL[i]['kw'] + ' is ' + FINAL[i]['sent'], 'pos':['NOUN','VERB']+FINAL[i]['pos']} )

In [99]:
len(COMBINE)

3200

In [28]:
#builds constituency parse trees master array
levels_final = []
for i in range(len(COMBINE)):
    temp_const = predictor.predict(
    sentence= COMBINE[i]['sent']
    )
    levels_final.append(build_levels(temp_const))
    if i%30 == 0:
        print(int(i/len(COMBINE)*100))

Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
90
91
92
93
94
95
96
97
98
99


In [29]:
with open('constituency_parse_dict.txt', 'wb') as f:
    pickle.dump(levels_final, f)

In [30]:
levels_final

[{'l1': ['NP', 'VP'],
  'l2': ['CD', 'NN', 'CD', 'VBZ', 'NP', 'ADVP', 'JJ', 'VP', 'CC', 'VP'],
  'l3': ['?',
   '?',
   '?',
   '?',
   'DT',
   'NN',
   'NN',
   'NN',
   'RB',
   '?',
   'VBG',
   'NP',
   '?',
   'VBG',
   'PP'],
  'l4': ['?',
   '?',
   '?',
   '?',
   '?',
   '?',
   '?',
   '?',
   '?',
   '?',
   '?',
   'NN',
   'JJ',
   'NN',
   '?',
   '?',
   'IN',
   'NP']},
 {'l1': ['NP', 'VP'],
  'l2': ['CD', 'NN', 'CD', 'VBZ', 'NP'],
  'l3': ['?', '?', '?', '?', 'NP', 'PP'],
  'l4': ['?', '?', '?', '?', 'DT', 'JJS', 'NN', 'PP', 'NP']},
 {'l1': ['NP', 'VP'],
  'l2': ['CD', 'NN', 'NN', 'VBZ', 'NP'],
  'l3': ['?', '?', '?', '?', 'NP', 'PP', 'VP'],
  'l4': ['?', '?', '?', '?', 'DT', 'NN', 'IN', 'NP', 'VBG', 'NP']},
 {'l1': ['NP', 'VP'],
  'l2': ['CD', 'NN', 'VBZ', 'NP'],
  'l3': ['?', '?', '?', 'NP', 'SBAR'],
  'l4': ['?', '?', '?', 'DT', 'JJ', 'NN', 'WHPP', 'S']},
 {'l1': ['NP', 'VP'],
  'l2': ['CD', 'VBZ', 'NP'],
  'l3': ['?', '?', 'NP', 'SBAR'],
  'l4': ['?', '?', 'DT', '

In [31]:
def pos_to_str(arr):
    final = ''
    for i in range(len(arr)):
        final += arr[i] + ' '
    return final.strip()

In [32]:
levels_final_sorted = [[],[],[],[]]
for obj in levels_final:
    levels_final_sorted[0].append(pos_to_str(obj['l1']))
    levels_final_sorted[1].append(pos_to_str(obj['l2']))
    levels_final_sorted[2].append(pos_to_str(obj['l3']))
    levels_final_sorted[3].append(pos_to_str(obj['l4']))

In [33]:
levels_final_sorted

[['NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VBZ NP VBP NP ADVP VBP NP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VBZ NP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VBZ NP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VBZ NP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VBZ NP',
  'VP HYPH RP NP VP NP',
  'NP VP',
  'NP VP',
  'NP VBZ NP',
  'S VP NP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'VP',
  'NP VP',
  'NP VP',
  'NP VBZ NP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP HYPH NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VP',
  'NP VBZ NP',
  'NP VP',
  'N

In [34]:
FINAL_LEVELS = []
for i in range(4):
    temp_levels = pd.DataFrame(levels_final_sorted[i])
    FINAL_LEVELS.append(temp_levels.value_counts()[temp_levels.value_counts()>=5].index.tolist())
    for j in range(len(FINAL_LEVELS[i])):
        FINAL_LEVELS[i][j] = FINAL_LEVELS[i][j][0]

In [35]:
FINAL_LEVELS

[['NP VP',
  'NP VBZ NP',
  'S VP',
  'NP HYPH NP VP',
  'LST HYPH NP VP',
  'VP VP',
  'S HYPH NP VP',
  'ADJP VP',
  'S CC S',
  'VP VBZ NP'],
 ['NNP VBZ NP',
  'JJ NN VBZ NP',
  'NN NN VBZ NP',
  'NNP NNP VBZ NP',
  'NN VBZ NP',
  'NNP NN VBZ NP',
  'NP HYPH NP VBZ NP',
  'JJ NNP VBZ NP',
  'NN NNP VBZ NP',
  'JJ NN NN VBZ NP',
  'NN NN NN VBZ NP',
  'NNP NNP NNP VBZ NP',
  'VP VBZ NP',
  'VBN NN VBZ NP',
  'NNP NNP NN VBZ NP',
  'NNS NN VBZ NP',
  'ADJP HYPH NP VBZ NP',
  'JJ NNS VBZ NP',
  'JJ JJ NN VBZ NP',
  'NNS NNP VBZ NP',
  'NN NNS VBZ NP',
  'NP PP VBZ NP',
  'NNP NN NN VBZ NP',
  'NNS VBZ NP',
  'JJ HYPH NN NN VBZ NP',
  'VBG NN VBZ NP',
  'JJ NNS NN VBZ NP',
  'VBN NNP VBZ NP',
  'NN ? NN NN VBZ NP',
  'NNP NNS VBZ NP',
  'NN NN ? NP PP',
  'ADJP NN VBZ NP',
  'VB NN VBZ NP',
  'JJ NNP NN VBZ NP',
  'NN NNP NN VBZ NP',
  'ADJP HYPH NN NN VBZ NP',
  'NNP VBZ NP PP',
  'VP ? NN NN VBZ NP',
  'VBN NN NN VBZ NP',
  'NNP ? NP SBAR',
  'NNP ? NNP VBZ NP',
  'JJ NN VBZ NP PP',
 

In [36]:
with open('constituency_parse_dict_top.txt', 'wb') as f:
    pickle.dump(FINAL_LEVELS, f)

In [37]:
# final4 = []
# for h in range(4,10):
#     final3 = []
#     # num=5
#     for i in range(len(test)):

#         cat_str = ''

#         if len(test[i]['pos']) > h:
#             for j in range(h):
#                 cat_str += test[i]['pos'][j] + " "

#             final3.append(cat_str.strip())
            
#     final3 = pd.Series(final3)
#     final3 = final3.value_counts()[final3.value_counts()>2].index.tolist()
#     final4.append(final3)

NameError: name 'test' is not defined

In [None]:
# final4[2]

In [None]:
# with open('pos_dict.txt', 'wb') as f:
#     pickle.dump(final4, f)