In [222]:
import fitz
import pandas as pd
import bisect

In [223]:
def scrape(filePath, start, stop, font, fontSize):
    """Extract word entries and CEFR level positions from a PDF.

    Every text line of the selected pages is walked span by span. Spans whose
    font size lies within ``fontSize`` and whose font is 'MyriadPro-Regular'
    or 'MyriadPro-LightIt' contribute their text (if longer than one
    character) to a per-line list. Each in-range 'MyriadPro-LightIt' span
    pushes the current per-line list onto ``results``.

    Parameters
    ----------
    filePath : str
        Path to the PDF file.
    start, stop : int
        Page range forwarded to ``pdf.pages(start=..., stop=...)``.
    font : Any
        Currently unused; kept so existing callers keep working.
    fontSize : sequence
        ``fontSize[0]`` and ``fontSize[1]`` are the inclusive lower and upper
        bounds for the accepted span size.

    Returns
    -------
    (results, labels)
        ``results`` is a list of lists of span texts, one list per push.
        ``labels`` maps each level marker found ('A1', 'A2', 'B1', 'B2', 'C1')
        to ``len(results)`` at the moment the marker was last seen, i.e. the
        index of the next entry that will be appended.

    Notes
    -----
    A line containing several in-range 'MyriadPro-LightIt' spans pushes the
    *same* list object once per such span, so ``results`` can contain repeated
    references to one list.
    """
    level_markers = ('A1', 'A2', 'B1', 'B2', 'C1')
    word_font = 'MyriadPro-Regular'
    tag_font = 'MyriadPro-LightIt'

    labels = {}
    results = []  # list of per-line lists of span texts
    pdf = fitz.open(filePath)
    try:
        for page in pdf.pages(start=start, stop=stop):
            page_dict = page.get_text("dict")
            for block in page_dict["blocks"]:
                # Blocks without a "lines" entry carry no text spans to inspect.
                if "lines" not in block:
                    continue
                for line in block['lines']:
                    stack = []
                    for span in line['spans']:
                        text = span['text']
                        # Level markers are recorded regardless of font or size.
                        if text in level_markers:
                            labels[text] = len(results)
                        if not (fontSize[0] <= span['size'] <= fontSize[1]):
                            continue
                        if span['font'] in (word_font, tag_font) and len(text) > 1:
                            stack.append(text)
                        if span['font'] == tag_font:
                            results.append(stack)
    finally:
        # Release the document even when parsing fails part-way through.
        pdf.close()
    return results, labels

In [224]:
# Accumulator for the scraped DataFrames, one per PDF.
dfs = list()

In [225]:
# Base file names (without extension) of the word-list PDFs:
# the American lists first, then the "The Oxford" lists, each as 3000 then 5000.
names = [
    f'{edition}_Oxford_{size}_by_CEFR_level'
    for edition in ('American', 'The')
    for size in (3000, 5000)
]

In [226]:
# Scrape every PDF and tag each scraped row with the CEFR level of its section.
# `dfs` is rebuilt from scratch so re-running this cell cannot append duplicate frames.
dfs = []
for name in names:
    res, labels = scrape(f'English_level/Oxford_CEFR_level/{name}.pdf', start=0, stop=13, font=None, fontSize=[9, 9])
    df = pd.DataFrame(res)

    # `labels` maps a level marker to the row index where its section begins;
    # a section runs until the next marker's start (or the end of the frame).
    level_names = list(labels)
    section_starts = [labels[level] for level in level_names]
    section_ends = section_starts[1:] + [len(df)]
    for level, start, end in zip(level_names, section_starts, section_ends):
        # .loc slicing includes `end`; that boundary row is overwritten by the
        # following level's assignment, so every row ends with its own level.
        df.loc[start:end, 'label'] = level

    dfs.append(df)

In [227]:
# Give the first two frames the same three column labels.
for position in (0, 1):
    dfs[position].columns = [0, 'type', 'label']

In [228]:
# Merge columns 1..3 of the third frame into a single comma-separated 'type'
# string, skipping cells whose string form is 'None'
# (presumably the padding of shorter scraped rows - verify against the data).
oxford_3000_df = dfs[2]
oxford_3000_parts = oxford_3000_df.columns[1:4]
oxford_3000_df['type'] = oxford_3000_df[oxford_3000_parts].apply(
    lambda row: ','.join(cell for cell in row.values.astype(str) if cell != 'None'),
    axis=1,
)


In [229]:
# Keep only the word column (0), the level and the merged type for the third frame.
dfs[2] = dfs[2].loc[:, [0, 'label', 'type']]

In [230]:
# Merge columns 1..2 of the fourth frame into a single comma-separated 'type'
# string, skipping cells whose string form is 'None'
# (presumably the padding of shorter scraped rows - verify against the data).
oxford_5000_df = dfs[3]
oxford_5000_parts = oxford_5000_df.columns[1:3]
oxford_5000_df['type'] = oxford_5000_df[oxford_5000_parts].apply(
    lambda row: ','.join(cell for cell in row.values.astype(str) if cell != 'None'),
    axis=1,
)

In [231]:
# Keep only the word column (0), the level and the merged type for the fourth frame.
dfs[3] = dfs[3].loc[:, [0, 'label', 'type']]

In [232]:
# Stack all frames into one table with a fresh 0..n-1 index.
# concat aligns the frames by column label, so the first frame's column order wins.
words = pd.concat(dfs, ignore_index=True)
words.columns = ['name', 'type', 'label']

In [233]:
# Distinct part-of-speech tags found anywhere in the 'type' column.
# Junk tokens (empty strings and the string forms of missing values) are
# filtered out explicitly: a set has no defined order, so slicing off "the first
# element" would drop an arbitrary tag that can differ between kernel restarts.
# Sorting makes the resulting column order reproducible.
all_type_text = ','.join(words['type'].astype(str))
type_tokens = {token.strip() for token in all_type_text.split(',')}
columns = sorted(type_tokens - {'', 'None', 'nan', '<NA>'})

In [234]:
# Add one indicator column per tag, initialised to integer 0.
# Building the zero frame directly avoids an all-NaN object frame followed by
# fillna(0), which leaves object-dtype columns / relies on deprecated downcasting.
words = words.join(pd.DataFrame(0, index=words.index, columns=columns))

In [254]:
def split_type(row):
    """Set the indicator columns of ``words`` for the tags listed in one row.

    ``row['type']`` is split on commas and each token is stripped of
    surrounding whitespace before being matched against the global
    ``columns`` list, so ``'prep., adv.'`` matches both ``'prep.'`` and
    ``'adv.'``. For every match the corresponding cell of the global
    ``words`` frame, in the row labelled ``row.name``, is set to 1.

    Parameters
    ----------
    row : pandas.Series
        A row of ``words``; must provide a ``'type'`` entry.

    Returns
    -------
    int
        1 if at least one tag matched and was written, otherwise 0
        (including when ``row['type']`` is missing, i.e. not a string).
    """
    raw_type = row['type']
    # None and NaN both mean "no tags"; neither supports .split().
    if not isinstance(raw_type, str):
        return 0

    tags = {token.strip() for token in raw_type.split(',')}
    matched = list(tags.intersection(columns))
    if not matched:
        return 0

    words.loc[row.name, matched] = 1
    return 1


In [255]:
# Run split_type once per row; it fills the indicator columns of `words` as a
# side effect, and the returned 0/1 flags are displayed as the cell output.
words.apply(split_type, axis='columns')

0        0
1        1
2        1
3        1
4        1
        ..
11222    1
11223    1
11224    1
11225    1
11226    1
Length: 11227, dtype: int64

In [271]:
# Count how many rows carry each tag, then keep the ten most frequent tags
# (in ascending order of frequency) next to the three base columns.
tag_counts = (words[columns] > 0).sum()
top_tags = tag_counts.sort_values().tail(10).index.tolist()
words = words[['name', 'type', 'label'] + top_tags]

In [272]:
# Persist the final table; with pandas' default settings the row index is
# written as the first column.
words.to_csv(path_or_buf='Parsed/words.csv')