In [1]:
import nltk
nltk.download('punkt')
import pyfiglet
import os
import numpy as np
import pickle
import wikipedia

import import_ipynb
import preprocess_utils
import parse_utils
import ai_kw_detect
import wikt_def_parse_Wex
import wikt_def_predict_Wex
import keyword_utils


[nltk_data] Downloading package punkt to
[nltk_data]     /home/jackragless/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/jackragless/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/jackragless/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


importing Jupyter notebook from preprocess_utils.ipynb
importing Jupyter notebook from parse_utils.ipynb


[nltk_data] Downloading package benepar_en3 to
[nltk_data]     /home/jackragless/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


importing Jupyter notebook from ai_kw_detect.ipynb
importing Jupyter notebook from wikt_def_parse_Wex.ipynb
importing Jupyter notebook from wikt_def_predict_Wex.ipynb
importing Jupyter notebook from keyword_utils.ipynb


In [2]:
# pages = ['Computational Intelligence', 'Dialogflow', 'Bayesian programming', 'BabyX', 'Fuzzy agent']
# pages = ['Augmented reality-assisted surgery','Universal Scene Description','Junaio','USens','ARCore']
# for page in pages:
#     text_file = open("text_files/{}.txt".format(page), "w")
#     text_file.write(wikipedia.page(page).content)
#     text_file.close()

In [3]:
# Lookup table: tag reported for a word or phrase -> candidate part-of-speech
# labels, listed in the order they should be tried. An empty list means no
# label is offered for that tag. The upper-case keys look like Penn Treebank
# tags (TODO confirm against the parser used in parse_utils).
pos_association = {
    # --- function words, numerals ---
    "CC": ["conjunction"],
    "CD": ["numeral"],
    "DT": ["determiner"],
    "EX": [],
    "FW": [],
    "IN": ["preposition", "conjunction"],

    # --- adjectives ---
    "JJ": ["adjective"],
    "JJR": ["adjective"],
    "JJS": ["adjective"],

    "LS": [],
    "MD": ["verb"],

    # --- nouns ---
    "NN": ["noun", "proper noun"],
    "NNS": ["noun"],  # plural
    "NNP": ["proper noun", "noun"],
    "NNPS": ["proper noun", "noun"],  # plural

    "PDT": ["determiner"],
    "POS": [],

    # --- pronouns ---
    "PRP": ["pronoun"],
    "PRP$": ["pronoun"],

    # --- adverbs ---
    "RB": ["adverb"],
    "RBR": ["adverb"],
    "RBS": ["adverb"],

    "RP": ["preposition"],  # unsure
    "TO": [],  # unsure
    "UH": ["interjection"],

    # --- verbs ---
    "VB": ["verb"],
    "VBG": ["verb"],
    "VBD": ["verb"],
    "VBN": ["verb"],
    "VBP": ["verb"],
    "VBZ": ["verb"],

    # --- wh-words ---
    "WDT": ["determiner"],
    "WP": ["pronoun"],
    "WRB": ["adverb"],

    # --- lower-case keys: used for keyphrases ---
    "noun": ["noun", "proper noun"],
    "verb": ["verb"],
}

In [4]:
def generate_keywords(processed_text):
    """Predict keywords in ``processed_text`` and attach candidate POS labels.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence is parsed with ``parse_utils.parseSent``; sentences whose parse
    is falsy are dropped. ``ai_kw_detect.predict`` is then called once with
    the kept sentences. Every predicted keyword is reported at most once
    (exact, case-sensitive comparison):

    * a single-word keyword is looked up among the word nodes of the
      sentence it was predicted for;
    * a multi-word keyword is looked up among the phrase nodes collected
      from all sentences.

    Keywords that match no node are left out of the result.

    Args:
        processed_text: Cleaned text to analyse.

    Returns:
        A list of ``[keyword, labels]`` pairs in prediction order.
        ``labels`` is a fresh copy of the ``pos_association`` entry for the
        matched node's tag, or ``[]`` when the tag is not in that table.

    Note:
        Assumes ``predict`` returns one list of keyword strings (or a falsy
        value) per input sentence, and that word/phrase nodes are
        ``(text, tag)`` pairs -- this is what the indexing below relies on;
        confirm against ``ai_kw_detect`` and ``parse_utils``.
    """
    candidate_phrases = []
    word_nodes_per_sent = []
    tok_sents = []

    for sent in nltk.sent_tokenize(processed_text):
        tree = parse_utils.parseSent(sent)
        if tree:
            # The last character is dropped before prediction (presumably the
            # sentence-final punctuation -- TODO confirm).
            tok_sents.append(sent[:-1])
            # A fresh list is passed explicitly as the helpers' second argument.
            candidate_phrases += parse_utils.getPhraseNodes(tree, [])
            word_nodes_per_sent.append(parse_utils.getWordNodes(tree, []))

    pred_kw = ai_kw_detect.predict(tok_sents)

    final_kw = []
    seen = set()  # keywords already emitted; O(1) membership checks

    for sent_index, sent_keywords in enumerate(pred_kw):
        if not sent_keywords:
            continue

        # Guard against a predictor returning more entries than parsed sentences.
        if sent_index < len(word_nodes_per_sent):
            word_nodes = word_nodes_per_sent[sent_index]
        else:
            word_nodes = []

        for keyword in sent_keywords:
            if keyword in seen:
                continue

            # Single words are resolved within their own sentence, phrases
            # against every phrase node seen in the document.
            pool = word_nodes if len(keyword.split()) == 1 else candidate_phrases

            for node in pool:
                if node[0] == keyword:
                    # .get(): a tag missing from the table yields no labels
                    # instead of a KeyError. list(): copy so callers that
                    # mutate the labels cannot corrupt pos_association.
                    labels = list(pos_association.get(node[1], []))
                    final_kw.append([node[0], labels])
                    seen.add(keyword)
                    break

    return final_kw

In [5]:
def generate_definitions(title, clean_text, keyword_arr):
    """Look up one definition per keyword via ``wikt_def_predict_Wex.driver``.

    For each keyword the candidate parts of speech are tried in order until
    the driver returns something other than ``'invalid-term'`` or
    ``'invalid-pos'``. Exactly one glossary row is produced per keyword: the
    first successful definition, or the last failure marker when every
    candidate failed. (Previously each failed attempt added its own row, so
    a keyword could appear several times with placeholder text next to its
    real definition.)

    A running ``count / total`` progress line is printed for every keyword.

    Args:
        title: Document title, forwarded to the driver.
        clean_text: Cleaned document text, forwarded to the driver.
        keyword_arr: List of ``[keyword, pos_labels]`` pairs.

    Returns:
        A list of ``[keyword, definition_or_marker]`` rows. Keywords with no
        candidate parts of speech, and keywords already present in the
        result (compared case-insensitively), produce no row.
    """
    failure_markers = ('invalid-term', 'invalid-pos')
    not_attempted = object()  # distinguishes "no POS to try" from any driver result

    final = []
    seen_terms = set()  # lower-cased terms already in `final`
    total = len(keyword_arr)

    for count, kw in enumerate(keyword_arr, start=1):
        print(count, '/', total, end='\r')

        term = kw[0]
        if term.lower() in seen_terms:
            continue

        outcome = not_attempted
        for pos in kw[1]:
            outcome = wikt_def_predict_Wex.driver(title, clean_text, term, pos)
            if outcome not in failure_markers:
                # First usable definition wins; skip remaining candidates.
                break

        if outcome is not not_attempted:
            final.append([term, outcome])
            seen_terms.add(term.lower())

    return final

In [6]:
def add_def_refs(orig_text, keywords_only):
    """Insert a ``|n|`` reference marker after each keyword's occurrence.

    Keywords are matched case-insensitively. Each keyword is first searched
    for after the end of the previously matched keyword; if it does not
    occur there, the whole text is searched instead. A keyword that does not
    occur at all gets no marker, but still consumes its number, so marker
    ``|n|`` always refers to the n-th entry of ``keywords_only``.

    Args:
        orig_text: The text to annotate.
        keywords_only: Keyword strings, in glossary order.

    Returns:
        A copy of ``orig_text`` with the markers inserted.
    """
    # Lower-case once instead of once per keyword. Offsets found here are
    # applied to orig_text, which assumes lower-casing keeps the length
    # unchanged (true for ASCII; a few Unicode characters differ).
    lowered = orig_text.lower()

    insertions = []  # (offset in orig_text, marker number)
    cursor = 0       # end of the most recent in-order match

    for number, kw in enumerate(keywords_only, start=1):
        needle = kw.lower()
        start = lowered.find(needle, cursor)
        if start != -1:
            cursor = start + len(needle)
            insertions.append((cursor, number))
            continue

        # Not found after the cursor: fall back to the first occurrence
        # anywhere, without moving the cursor.
        start = lowered.find(needle)
        if start != -1:
            insertions.append((start + len(needle), number))

    # Assemble the result from slices of the original text, so offsets never
    # need adjusting for the markers already inserted.
    pieces = []
    previous = 0
    for offset, number in sorted(insertions):
        pieces.append(orig_text[previous:offset])
        pieces.append('|{}|'.format(number))
        previous = offset
    pieces.append(orig_text[previous:])
    return ''.join(pieces)

In [7]:
def gen_final_doc(corpus_obj):
    """Render the glossary followed by the annotated document body.

    Args:
        corpus_obj: Mapping with a ``'glossary'`` entry (sequence of
            ``[term, definition]`` string pairs) and a ``'text_w_ref'``
            entry (the body text as a string).

    Returns:
        One string: the glossary header, one ``|n| term : definition`` line
        per glossary row numbered from 1, then the body header and the body.
    """
    sections = ['===GLOGEN GLOSSARY===\n\n']

    for number, entry in enumerate(corpus_obj['glossary'], start=1):
        # Plain concatenation keeps the requirement that both fields are str.
        sections.append('|{}| '.format(number) + entry[0] + ' : ' + entry[1] + '\n')

    sections.append('\n===DOCUMENT BODY===\n\n' + corpus_obj['text_w_ref'])
    return ''.join(sections)

In [8]:
# Banner and usage notes.
print(pyfiglet.figlet_format("DAIC GLOGEN"))
print(
    'DESCRIPTION: GLOGEN automatically generates glossaries and prepends them to given .txt files. \n'
    'ENSURE: <filename>.txt == original text title.'
)

# Keep asking until the reply starts with "y" (continue) or "n" (quit);
# matching is case-insensitive and only the leading letter matters.
answer = ''
txt_address = ''
while True:
    answer = input('\nType "yes" / "no" to starting GLOGEN:\n>>>')
    if answer.lower().startswith("y"):
        txt_address = input(
            'Type address where .txt files are stored. If same address as main.py press ENTER.\n>>>'
        )
        break
    elif answer.lower().startswith("n"):
        exit()
    else:
        print('INVALID INPUT --- TRY AGAIN.')

# Pressing ENTER (empty reply) selects the current working directory.
txt_address = txt_address or os.getcwd()

 ____    _    ___ ____    ____ _     ___   ____ _____ _   _ 
|  _ \  / \  |_ _/ ___|  / ___| |   / _ \ / ___| ____| \ | |
| | | |/ _ \  | | |     | |  _| |  | | | | |  _|  _| |  \| |
| |_| / ___ \ | | |___  | |_| | |__| |_| | |_| | |___| |\  |
|____/_/   \_\___\____|  \____|_____\___/ \____|_____|_| \_|
                                                            

DESCRIPTION: GLOGEN automatically generates glossaries and prepends them to given .txt files. 
ENSURE: <filename>.txt == original text title.

Type "yes" / "no" to starting GLOGEN:
>>>/home/jackragless/projects/github/DAIC_GLOGEN/text_files/
INVALID INPUT --- TRY AGAIN.

Type "yes" / "no" to starting GLOGEN:
>>>yes
Type address where .txt files are stored. If same address as main.py press ENTER.
>>>/home/jackragless/projects/github/DAIC_GLOGEN/text_files/


In [11]:
# txt_address = '/home/jackragless/projects/github/DAIC_GLOGEN/text_files'
# Drop a single trailing slash so later string-built paths stay well-formed.
# endswith() is safe on an empty string (indexing [-1] was not), and a bare
# "/" is left untouched rather than collapsing to "".
if len(txt_address) > 1 and txt_address.endswith('/'):
    txt_address = txt_address[:-1]

user_input_data = []

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# run order reproducible.
for filename in sorted(os.listdir(txt_address)):
    if not filename.endswith('.txt'):
        continue

    title = filename[:-4]  # file name without the ".txt" suffix
    print(title)

    # Context manager so the file handle is closed even if reading fails.
    with open(os.path.join(txt_address, filename)) as text_file:
        orig_text = text_file.read()

    processed = preprocess_utils.clean_text(orig_text, False, False, True, False, False)

    print('PREDICTING KEYWORDS...')
    keywords_and_pos = generate_keywords(processed)
    # Return value is ignored -- presumably chink() edits the list in place
    # (TODO confirm against keyword_utils).
    keyword_utils.chink(keywords_and_pos)
    keywords_only = [kw[0] for kw in keywords_and_pos]

    print('GENERATING DEFINITIONS...')
    glossary = generate_definitions(title, processed, keywords_and_pos)

    # Reference markers are numbered by list position and the glossary is
    # printed numbered by its own order, so the markers must be generated from
    # the glossary's terms; numbering from keywords_only drifts whenever a
    # keyword yields zero or several glossary rows.
    glossary_terms = [entry[0] for entry in glossary]

    user_input_data.append({
        'title': title,
        'orig_text': orig_text,
        'text_w_ref': add_def_refs(orig_text, glossary_terms),
        'keywords': keywords_only,
        'glossary': glossary,
    })

USens
PREDICTING KEYWORDS...


  0%|          | 0/14 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

GENERATING DEFINITIONS...
9 / 12

KeyboardInterrupt: 

In [10]:
# Write one "<title>.txt" per processed document into a GLOGEN sub-folder of
# the input directory.
output_dir = os.path.join(txt_address, 'GLOGEN')

# exist_ok=True lets the cell be re-run; os.mkdir raised FileExistsError on
# the second run.
os.makedirs(output_dir, exist_ok=True)

for obj in user_input_data:
    output_path = os.path.join(output_dir, '{}.txt'.format(obj['title']))
    # Context manager closes the file even if rendering or writing fails.
    with open(output_path, 'w') as text_file:
        text_file.write(gen_final_doc(obj))