In [1]:
import re
import numpy as np
import pandas as pd
import json
import nltk
from string import punctuation
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')


In [8]:
def read_json(filepath):
    """Read a JSON Lines file and return its records.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        filepath: Path of the JSON Lines file to read.

    Returns:
        list: The parsed objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dataset = []
    with open(filepath, 'r', encoding='utf-8') as json_file:
        for json_str in json_file:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would make json.loads raise, so skip them.
            if not json_str.strip():
                continue
            dataset.append(json.loads(json_str))
    return dataset

def read_txt(path, encoding=None):
    """Read a text file and return its whole content as one string.

    Args:
        path: Path of the text file to read.
        encoding: Optional text encoding passed to ``open``. ``None`` (the
            default) keeps the platform default encoding.

    Returns:
        str: The full content of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if read() raises.
    with open(path, "r", encoding=encoding) as file:
        return file.read()

def print_menu():
    """Print the main menu: enter text, upload a txt file, define an acronym, or exit."""
    rule = 30 * "-"
    menu_lines = [
        rule + " " + "MENU" + " " + rule,
        'What would you like to do?',
        '',
        '1- Enter a text',
        '2- Upload a txt document',
        '3- Input Acronym to Define',
        '4- Exit',
        67 * "-",
    ]
    print("\n".join(menu_lines))

In [9]:
# Load the acronym records (acronym + definition + frequency) from the JSON Lines file
all_acronyms_list = read_json('data/interim/formatted_acronyms.jsonl')

# Translation table that deletes every punctuation character
punctuation_remover = str.maketrans("", "", punctuation)

# Build [acronym, normalized definition, frequency] rows, skipping records
# whose definition is empty.
acronym_list = []
for entry in all_acronyms_list:
    acronym, definition, frequency = entry['acronym'], entry['definition'], entry['freq']
    if definition == "":
        continue
    # Lower-casing and stripping punctuation makes definitions that differ
    # only by capitalization or punctuation compare equal.
    clean_definition = definition.lower().translate(punctuation_remover)
    acronym_list.append([acronym, clean_definition, frequency])

In [10]:
# Create a dataframe from the acronym_list we just parsed
column_names = ['Acronym', 'Definition', 'Frequency']
acronyms_dataframe = pd.DataFrame(data=acronym_list, columns=column_names)

# Last expression of the cell: shows the dataframe
acronyms_dataframe

Unnamed: 0,Acronym,Definition,Frequency
0,NASA,national aeronautics and space administration,1643
1,NASA,national aeronautics space administration,22
2,NASA,national aeronautics and space agency,14
3,NASA,national aeronautic and space administration,10
4,NASA,national aeronautical and space administration,9
...,...,...,...
38023,ZEAL,zeus error analysis for lightning,1
38024,ZL,zodiacal light,1
38025,ZOMM,zurich optical and microphysical box model,1
38026,ZPS,zeroprebreathe suit,1


In [11]:
def tokenize_text(text):
    """Return the candidate acronyms found in ``text``.

    The text is split with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. A token is kept when it is entirely upper case, longer
    than one character, and tagged ``"NNP"``.

    Args:
        text: The text to scan.

    Returns:
        list: Matching tokens in order of appearance (duplicates are kept).
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return [
        token
        for token, pos in tagged_tokens
        if token.isupper() and len(token) > 1 and pos == "NNP"
    ]

In [16]:
def get_definitions(acronyms_dataframe, found_acronyms):
    """Save the most frequent definition of each found acronym to a CSV file.

    For every distinct acronym in ``found_acronyms``, the rows of
    ``acronyms_dataframe`` with that acronym and the highest ``Frequency`` are
    selected (ties keep every tied row). If at least one row is selected, the
    user is asked for an output file name and the ``Acronym`` and
    ``Definition`` columns are written there as CSV. Otherwise
    "No Acronyms Found" is printed and nothing is written.

    Args:
        acronyms_dataframe: DataFrame with ``Acronym``, ``Definition`` and
            ``Frequency`` columns.
        found_acronyms: Iterable of acronym strings; duplicates are ignored.

    Returns:
        None
    """
    # De-duplicate while keeping first-appearance order, so the rows of the
    # saved CSV come out in a deterministic order.
    unique_acronyms = list(dict.fromkeys(found_acronyms))

    results_list = []
    for acronym in unique_acronyms:
        candidates = acronyms_dataframe.loc[acronyms_dataframe['Acronym'] == acronym]
        if candidates.empty:
            continue
        top_frequency = candidates['Frequency'].max()
        results_list.append(candidates[candidates['Frequency'] == top_frequency])

    # pd.concat raises ValueError on an empty list, so report "nothing found"
    # before concatenating (and before asking for a file name).
    if not results_list:
        print("No Acronyms Found")
        return

    results = pd.concat(results_list)[['Acronym', 'Definition']]

    print()
    print("Please note that results will be saved in same path where notebook is running in")
    path = input('Type file name to save definition results:')
    # Check the suffix rather than a substring: a name like "my.csv.backup"
    # still needs the extension appended.
    if not path.lower().endswith(".csv"):
        path = path + ".csv"
    results.to_csv(path, index=False)
        
def get_all_definitions(acronyms_dataframe, acronym):
    """Save every known definition of one acronym to a CSV file.

    The acronym is stripped of surrounding whitespace and upper-cased, then
    compared for equality with the ``Acronym`` column. If any row matches,
    the user is asked for an output file name and the ``Acronym`` and
    ``Definition`` columns of the matching rows are written there as CSV.
    Otherwise "No Acronyms Found" is printed and nothing is written.

    Args:
        acronyms_dataframe: DataFrame with ``Acronym`` and ``Definition``
            columns.
        acronym: The acronym typed by the user, in any letter case.

    Returns:
        None
    """
    # Stray whitespace around typed input would otherwise prevent a match.
    # NOTE: the comparison is against the upper-cased input, so rows whose
    # Acronym contains lower-case letters cannot match.
    wanted = acronym.strip().upper()
    found_definitions = acronyms_dataframe.loc[acronyms_dataframe['Acronym'] == wanted]

    # Report "nothing found" before asking for a file name that would not be used.
    if found_definitions.empty:
        print("No Acronyms Found")
        return

    print("Please note that results will be saved in same path where notebook is running in")
    path = input('Type file name to save definition results:')
    # Check the suffix rather than a substring: a name like "my.csv.backup"
    # still needs the extension appended.
    if not path.lower().endswith(".csv"):
        path = path + ".csv"
    found_definitions[['Acronym', 'Definition']].to_csv(path, index=False)

In [17]:
# Display the menu and run the action the user picks.
# A handled choice (1-4) runs once and ends the loop; an unknown choice or an
# unreadable input file shows the menu again.
while True:
    print_menu()
    # Strip so that "3 " or " 3" is still recognized as option 3
    choice = input("Enter your choice [1-4]: ").strip()
    if choice == '1':
        print( "1- Enter a text")
        text = input('Type your text:')
        # Tokenize text to find acronyms then call get_definitions to save results
        print( 60 * "-")
        found_acronyms = tokenize_text(text)
        get_definitions(acronyms_dataframe, found_acronyms)
        break

    elif choice == '2':
        print( "2- Upload a document")
        print("Please note that results will be saved in same path where notebook is running in and must be a .txt file")
        path = input('Type text file name to upload:').strip()
        # Check the suffix rather than a substring, so "notes.txt.bak" is not
        # mistaken for a .txt file
        if not path.lower().endswith(".txt"):
            path = path + ".txt"
        file = path.split(r'/')[-1]
        # Read the txt file; a missing or unreadable file sends the user back
        # to the menu instead of ending the cell with a traceback
        try:
            text = read_txt(path)
        except OSError as error:
            print("Could not read", path, "-", error)
            continue
        print(file,'is uploaded.')
        # Tokenize text to find acronyms then call get_definitions to save results
        print( 60 * "-")
        found_acronyms = tokenize_text(text)
        get_definitions(acronyms_dataframe, found_acronyms)
        break

    elif choice == '3':
        print( "3- Input Acronym to Define")
        acronym = input('Type the acronym to get all definition:')
        # Look up every definition of the acronym and save the results
        print( 60 * "-")
        get_all_definitions(acronyms_dataframe, acronym)
        break

    elif choice == '4':
        print( "Exit!")
        break

    else:
        # Any input other than 1-4: report it and show the menu again
        print("Wrong option selection. Enter any key to try again..")

------------------------------ MENU ------------------------------
What would you like to do?

1- Enter a text
2- Upload a txt document
3- Input Acronym to Define
4- Exit
-------------------------------------------------------------------
Enter your choice [1-4]: 3
3- Input Acronym to Define
Type the acronym to get all definition:nasa
------------------------------------------------------------
Please note that results will be saved in same path where notebook is running in
Type file name to save definition results:nasa
