### Import required modules

In [58]:
import io
import json
import os
from difflib import get_close_matches

import pandas as pd
from openpyxl import load_workbook
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from PyDictionary import PyDictionary

### Import the English word list with CEFR levels

In [59]:
def convert_pdf_to_txt(path, password="", maxpages=0, pagenos=None,
                       caching=True, codec='utf-8'):
    """Convert the content of a PDF file to text.

    Every page yielded by ``PDFPage.get_pages`` is run through a pdfminer
    ``TextConverter`` that writes into an in-memory string buffer.

    Parameters
    ----------
    path : str or path-like
        Location of the PDF file; it is opened in binary mode.
    password : str, optional
        Forwarded to ``PDFPage.get_pages``. Defaults to the empty string.
    maxpages : int, optional
        Forwarded to ``PDFPage.get_pages``. Defaults to ``0``
        (NOTE(review): pdfminer appears to treat 0 as "no limit" - confirm
        against the pdfminer documentation).
    pagenos : iterable of int, optional
        Page numbers forwarded to ``PDFPage.get_pages`` as a set. ``None``
        (the default) sends an empty set, which is what this function
        always sent before the parameter existed.
    caching : bool, optional
        Forwarded to ``PDFPage.get_pages``. Defaults to ``True``.
    codec : str, optional
        Forwarded to ``TextConverter``. Defaults to ``'utf-8'``.

    Returns
    -------
    str
        Everything the converter wrote to the buffer.

    Notes
    -----
    Uses ``io.StringIO``; ``io`` has to be imported in the import cell.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    page_numbers = set(pagenos) if pagenos is not None else set()

    with io.StringIO() as retstr:
        with TextConverter(rsrcmgr, retstr, codec=codec,
                           laparams=laparams) as device:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp,
                                              page_numbers,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)

            # Read the buffer while it is still open; leaving the outer
            # ``with`` closes it.
            return retstr.getvalue()

if __name__ == "__main__":
    # Extract the word-list PDF once; the module-level `text` is split into
    # lines by the following cell.
    text = convert_pdf_to_txt('American_Oxford_5000.pdf')
    # Preview only the first 455 characters instead of dumping the whole document.
    print(text[:455])

The Oxford 5000™ (American English)

The Oxford 5000 is an expanded core word list for advanced learners of English. As well as the Oxford 
3000, it includes an additional 2000 words for learners at B2-C1 level, which are listed here.

abolish v. C1
abortion n. C1
absence n. C1
absent adj. C1
absorb v. B2
abstract adj. B2
absurd adj. C1
abuse n., v. C1
academy n. C1
accelerate v. C1
accent n. B2
acceptance n. C1
accessible adj. C1
accidentally adv. B2


In [60]:
# Build a list of {'word', 'Level'} records from the extracted text and load
# it into a DataFrame.

# Recognised CEFR level labels; a line whose last token is not one of these
# is not a word-list entry (page titles, footers, wrapped prose, ...).
CEFR_LEVELS = {'A1', 'A2', 'B1', 'B2', 'C1', 'C2'}

# Drop the five title/introduction lines at the top of the text.
# Slicing, unlike five pop(0) calls, does not raise when the text is shorter.
words = text.split('\n')[5:]

formattedWords = []
for line in words:
    # split() with no argument splits on any run of whitespace, so doubled
    # spaces or stray control whitespace do not produce empty tokens.
    tokens = line.split()
    # In the entries shown in the preview the level is the LAST token
    # ("<word> <part(s) of speech> <level>"). Reading token index 2 labelled
    # multi-tag entries such as "abuse n., v. C1" with "v." instead of "C1".
    if len(tokens) < 3 or tokens[-1] not in CEFR_LEVELS:
        continue
    formattedWords.append({'word': tokens[0], 'Level': tokens[-1]})

# Explicit columns keep the frame mergeable on 'word' even if nothing parsed.
df_word_level = pd.DataFrame(formattedWords, columns=['word', 'Level'])
df_word_level.head(2)

Unnamed: 0,word,Level
0,abolish,C1
1,abortion,C1


### Import the English dictionary with meanings

In [61]:
# Load the dictionary JSON file into a Python dict (looked up by `translate`).
# The context manager closes the file handle, which a bare open() call leaves
# open; the encoding is explicit so the result does not depend on the
# platform's default locale.
with open("dictionary.json", encoding="utf-8") as dictionary_file:
    data = json.load(dictionary_file)
  
def translate(w):
    """Look up a word in the module-level ``data`` dictionary.

    The word is lower-cased first. On an exact hit the stored value is
    returned unchanged. Otherwise the closest key (per
    ``difflib.get_close_matches``) is offered to the user through ``input``
    and the user's Y/N answer decides the result.

    Parameters
    ----------
    w : str
        Word to look up.

    Returns
    -------
    object
        ``data[...]`` for an exact hit or an accepted suggestion (the calling
        cell treats a list as several definitions), otherwise one of the
        plain-string messages below.
    """
    # converts to lower case
    w = w.lower()

    if w in data:
        return data[w]

    # Search for close matches ONCE: every call compares `w` against all
    # keys, and the result was previously recomputed up to three times.
    matches = get_close_matches(w, data.keys())
    if not matches:
        return "The word doesn't exist. Please double check it."

    suggestion = matches[0]
    yn = input("Did you mean % s instead? Enter Y if yes, or N if no: " % suggestion)
    # Surrounding whitespace in the answer is ignored ("y " still means yes).
    yn = yn.strip().lower()
    if yn == "y":
        return data[suggestion]
    if yn == "n":
        return "The word doesn't exist. Please double check it."
    return "We didn't understand your entry."
  

# Ask for a word, look it up, and print the result.
word = input("Enter word: ")
output = translate(word)

# A list holds several entries and is printed one per line; anything else
# (a status message) is printed as-is.
if type(output) != list:
    print(output)
else:
    for item in output:
        print(item)
input('Press ENTER to exit')

Enter word: 
The word doesn't exist. Please double check it.
Press ENTER to exit


''

### Import Kindle notes

In [62]:
# Open the notes workbook with openpyxl in read-only mode; the 'kindle' sheet
# is read from `wb` in the next cell.
wb = load_workbook(filename='Lean_In_ Women_Work.xlsx', 
                   read_only=True)

In [63]:
ws = wb['kindle']

# Values of cells D9..D150, kept as one single-item list per worksheet row
data_rows = [[cell.value for cell in row] for row in ws['D9':'D150']]

# Build the frame in one chain: add a constant chapter number, then give
# the positional column 0 its real name
df_words_book = (
    pd.DataFrame(data_rows)
    .assign(chapter=1)
    .rename(columns={0: 'word'})
)

In [64]:
# Preview the first two rows of the book word list.
df_words_book.head(2)

Unnamed: 0,word,chapter
0,lumps,1
1,propped up,1


### Consolidate words with level

In [65]:
# Left join on 'word': every book word is kept, and words that have no row
# in the level table end up with a missing 'Level'.
df_final = df_words_book.merge(df_word_level, how='left', on='word')
df_final.head(5)

Unnamed: 0,word,chapter,Level
0,lumps,1,
1,propped up,1,
2,sprinted,1,
3,lumbering,1,
4,strewn,1,


In [54]:
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False, 
                       **to_excel_kwargs):
    """Write ``df`` into an Excel file, appending to ``sheet_name`` if it exists.

    Parameters
    ----------
    filename : str or path-like
        Workbook to write. If the file does not exist it is created with a
        plain ``df.to_excel`` call.
    df : pandas.DataFrame
        Data to write.
    sheet_name : str, optional
        Target worksheet (default ``'Sheet1'``).
    startrow : int or None, optional
        Zero-based row at which writing starts. ``None`` means: use the
        sheet's current ``max_row`` when the sheet exists and is kept,
        otherwise row 0.
    truncate_sheet : bool, optional
        If true, an existing ``sheet_name`` is replaced by an empty sheet
        before writing. With ``startrow=None`` the data then starts at row 0;
        it used to be written at the old sheet's length, leaving a block of
        empty rows above it.
    **to_excel_kwargs
        Passed through to ``DataFrame.to_excel``. An ``engine`` entry is
        ignored when appending, because the openpyxl engine is required.

    Returns
    -------
    None

    Notes
    -----
    Requires pandas >= 1.4 (``if_sheet_exists='overlay'``). The writer is
    used as a context manager and its ``book`` is only read: assigning
    ``writer.book`` / ``writer.sheets`` and calling ``writer.save()`` are
    not supported by current pandas releases.
    """
    # Excel file doesn't exist - saving and exiting
    if not os.path.isfile(filename):
        df.to_excel(
            filename,
            sheet_name=sheet_name, 
            startrow=startrow if startrow is not None else 0, 
            **to_excel_kwargs)
        return

    # ignore [engine] parameter if it was passed
    to_excel_kwargs.pop('engine', None)

    # 'replace' lets pandas drop and recreate the sheet (truncation);
    # 'overlay' writes into the existing sheet without clearing it (append).
    sheet_mode = 'replace' if truncate_sheet else 'overlay'

    # In append mode the writer opens the existing workbook itself; leaving
    # the block saves and closes the file, also when an exception is raised.
    with pd.ExcelWriter(filename, engine='openpyxl', mode='a',
                        if_sheet_exists=sheet_mode) as writer:
        if startrow is None:
            keeps_rows = (not truncate_sheet
                          and sheet_name in writer.book.sheetnames)
            # continue below the last used row only when old content stays
            startrow = writer.book[sheet_name].max_row if keeps_rows else 0

        # write out the new data
        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow,
                    **to_excel_kwargs)

In [57]:
# Append df_final to the 'kindle' sheet of lean_in_book.xlsx without the index
# column or a header row; startrow=None lets the helper choose the start row.
append_df_to_excel('lean_in_book.xlsx', df_final, sheet_name='kindle', startrow=None,
                       index=False,header=False)

-----------------------------------

In [66]:
### Check meaning and level (work in progress)

In [9]:
# data = json.load(open("dictionary.json"))
# words_book_list = df_words_book.word.tolist()

# def closeMatches(patterns, word): 
# #     print(get_close_matches(word, patterns))
  
#     if __name__ == "__main__":
#         matches = []    
#         patterns = data
#         for word in words_book_list:
#             closeMatches(patterns, word)
#             matches.append(cell.value)   

# dict = PyDictionary() 
# dictionary=PyDictionary("strewn")
# 'There can be any number of words in the Instance'

# meaning = dictionary.printMeanings()