This notebook serves to extract information from parsed PDF text. The steps are as follows.

1. Filter for sentences that contain numbers or new lines (when the aim is to extract a number together with its key metric)
2. Remove stop words, punctuation, years, etc.
3. Apply part-of-speech tagging and derive rules that allow the numbers and metrics to be extracted. Remember that a negative word means a minus sign needs to be added in front of the number
4. Output for each source -> the metric and its value

# Import Packages

In [1]:
import pandas as pd
from utils import *

# Rule mining

test.txt from compiled sentences from word embeddings

In [2]:
# Load the labelled sentences. The first column is used as the sentence and
# every remaining column as a label for it (presumably the hand-labelled
# extractions -- confirm against the CSV).
df = pd.read_csv('../data/mining/text_ext_labelled_final.csv')
# Blank out missing cells *before* casting to str: casting first turns NaN into
# the literal string "nan", which fillna can no longer detect.
df = df.fillna("").astype(str)
test_set_preproc_no_lemma, test_set_preproc_lemma = [], []
# Plain tuples give positional access without relying on integer keys into a
# label-indexed Series, which recent pandas versions deprecate.
for line, *labelled in df.itertuples(index=False, name=None):
    line = line.replace("\n", "").lower()
    # Each entry is [lower-cased sentence, preprocessed sentence, labels].
    # append no lemma version
    test_set_preproc_no_lemma.append([line, preprocess(line, False), labelled])
    # append lemma version
    test_set_preproc_lemma.append([line, preprocess(line, True), labelled])

In [ ]:
def process_list_nb(test_set_preproc):
    """Run POS extraction over every preprocessed test sentence.

    Args:
        test_set_preproc: iterable of ``[sentence, preprocessed, labels]``
            triples.

    Returns:
        A list with one ``[sentence, preprocessed, tokens, pos, tags, labels]``
        entry per input triple. ``tokens`` comes from ``pos_extraction`` on the
        original sentence, while ``pos`` and ``tags`` come from
        ``pos_extraction`` on the preprocessed sentence.

    NOTE(review): tokens and POS/tags are taken from two different strings;
    downstream code indexes them with the same position -- confirm that
    ``preprocess`` keeps the two aligned.
    """
    tagged_rows = []
    for sentence, cleaned, labels in test_set_preproc:
        sentence_tokens, _, _ = pos_extraction(sentence)
        _, cleaned_pos, cleaned_tags = pos_extraction(cleaned)
        tagged_rows.append(
            [sentence, cleaned, sentence_tokens, cleaned_pos, cleaned_tags, labels]
        )
    return tagged_rows

In [None]:
# POS-tag both preprocessed variants of the test set; each result is the list
# built by process_list_nb (one entry per sentence).
tags_lemma = process_list_nb(test_set_preproc_lemma)
tags_no_lemma = process_list_nb(test_set_preproc_no_lemma)

# Method

1. Store a list, keeping the number and extracted portion
> An alternative is to store a set and disregard the number, so that when a sentence contains multiple numbers the extracted sentences are de-duplicated
2. Check left and right until we hit a verb whose fine-grained POS tag is not 'VBP' or 'VB'. If no NOUN has been seen between the number and that verb, continue checking past this VERB (exclude noun directly before and after).
3. Get the indices of the tags that bound the slice to retrieve from the tokens_list (indices must be used since pos now has punctuation & stop words)

In [None]:
def extract_text(tags, verb_exclude):
    """Extract the phrase surrounding every numeric or quantifier token.

    Args:
        tags: one entry as built by ``process_list_nb``; index 2 holds the
            tokens, index 3 the coarse POS labels and index 4 the fine tags.
        verb_exclude: fine-grained verb tags that must not end the scan
            around a number (forwarded to ``extract_text_numbers``).

    Returns:
        A list of ``[trigger_text, extracted_phrase]`` pairs, in token order.
        A token that satisfies both rules contributes two pairs.
    """
    tokens, pos_list, tag_list = tags[2], tags[3], tags[4]
    extracted = []
    for idx, pos in enumerate(pos_list):
        tag = tag_list[idx]
        word = tokens[idx].text

        # Words such as "million" are also tagged NUM, so require real digits.
        if pos == 'NUM' and line_has_digits(word):
            start, stop = extract_text_numbers(pos_list, tag_list, verb_exclude, idx)
            extracted.append([word, generate_extracted_text(tokens, pos_list, start, stop)])

        # Predeterminers and the configured special tokens use the noun-bounded scan.
        is_predeterminer = pos == 'DET' and tag == 'PDT'
        if is_predeterminer or word in special_ext_tokens:
            start, stop = extract_text_quant_words(pos_list, tag_list, idx)
            extracted.append([word, generate_extracted_text(tokens, pos_list, start, stop)])
    return extracted

def extract_text_numbers(pos_list, tag_list, verb_exclude, i):
    """Find the slice bounds around the number at position ``i``.

    Starting from the neighbours of ``i``, each side is scanned outwards until
    it reaches a ``'VERB'`` whose tag is not in ``verb_exclude`` *and* at least
    one ``'NOUN'`` has already been passed on that side, or until the first /
    last index of ``pos_list`` is reached.

    Args:
        pos_list: coarse POS label per token.
        tag_list: fine-grained tag per token.
        verb_exclude: fine-grained tags of verbs that do not stop the scan.
        i: index of the number token.

    Returns:
        ``(left, right)`` indices where each scan stopped.
    """
    last = len(pos_list) - 1
    left = max(i - 1, 0)
    right = min(i + 1, last)

    def stops_scan(position, noun_seen):
        # A verb only ends the scan once a noun was seen and it is not excluded.
        return (
            pos_list[position] == 'VERB'
            and noun_seen
            and tag_list[position] not in verb_exclude
        )

    noun_seen = False
    while left != 0:
        if stops_scan(left, noun_seen):
            break
        if pos_list[left] == 'NOUN':
            noun_seen = True
        left -= 1

    noun_seen = False
    while right != last:
        if stops_scan(right, noun_seen):
            break
        if pos_list[right] == 'NOUN':
            noun_seen = True
        right += 1

    return left, right

def extract_text_quant_words(pos_list, tag_list, i):
    """Find the slice bounds around the quantifier word at position ``i``.

    Starting from the neighbours of ``i``, each side is scanned outwards until
    it reaches a ``'NOUN'`` after at least one ``'ADJ'`` has been passed on
    that side, or until the first / last index of ``pos_list`` is reached.

    Args:
        pos_list: coarse POS label per token.
        tag_list: fine-grained tag per token (accepted but not consulted).
        i: index of the quantifier token.

    Returns:
        ``(left, right)`` indices where each scan stopped.
    """
    last = len(pos_list) - 1
    left = max(i - 1, 0)
    right = min(i + 1, last)

    adjective_seen = False
    while left != 0:
        if pos_list[left] == 'NOUN' and adjective_seen:
            break
        if pos_list[left] == 'ADJ':
            adjective_seen = True
        left -= 1

    adjective_seen = False
    while right != last:
        if pos_list[right] == 'NOUN' and adjective_seen:
            break
        if pos_list[right] == 'ADJ':
            adjective_seen = True
        right += 1

    return left, right
    
def generate_extracted_text(tokens, pos_list, j, k):
    """Join the texts of ``tokens[j:k]`` into a single phrase.

    A space is placed before every token except those labelled ``'PUNCT'`` or
    ``'PART'``, which are attached directly to the preceding text; a plain
    join would otherwise leave stray spaces. Index ``k`` itself is excluded.

    Args:
        tokens: objects exposing a ``.text`` attribute.
        pos_list: coarse POS label per token.
        j: first index to include.
        k: index to stop before.

    Returns:
        The joined phrase with leading/trailing whitespace stripped.
    """
    attached = ('PUNCT', 'PART')
    pieces = []
    for idx in range(j, k):
        separator = '' if pos_list[idx] in attached else ' '
        pieces.append(separator + tokens[idx].text)
    return ''.join(pieces).strip()

In [None]:
def scoring_extraction(extracted_text, labelled_text, v_list):
    """Accumulate word-count agreement between extractions and labels.

    The i-th extraction is compared with the i-th label by number of
    space-separated words only (word identity is not checked).

    Args:
        extracted_text: list of ``[trigger, phrase]`` pairs.
        labelled_text: labels, indexed in parallel with ``extracted_text``;
            it must hold at least as many entries as ``extracted_text``.
        v_list: two-element accumulator, updated in place. Slot 0 gains the
            label word counts, slot 1 the absolute word-count differences.

    Returns:
        The same ``v_list`` object.
    """
    for position, extraction in enumerate(extracted_text):
        label_words = len(labelled_text[position].split(" "))
        v_list[0] += label_words
        extracted_words = len(extraction[1].split(" "))
        v_list[1] += abs(extracted_words - label_words)
    return v_list

def print_accuracy(v):
    """Print the share of words counted as accurately extracted.

    Args:
        v: two-element accumulator where ``v[0]`` is the total number of
            labelled words scored and ``v[1]`` the accumulated word-count
            mismatch.

    The percentage is ``100 * (1 - v[1] / v[0])`` rounded to two decimals.
    When nothing was scored (``v[0] == 0``) the ratio is undefined, so a
    message saying so is printed instead of raising ``ZeroDivisionError``.
    """
    total_words, mismatched_words = v[0], v[1]
    if total_words == 0:
        # Happens for an empty test set or when no rule fired at all.
        print("The percentage of words accurately extracted is undefined (no labelled words were scored)")
        return
    matched = round(100 * (1 - (mismatched_words / total_words)), 2)
    print("The percentage of words accurately extracted is " + str(matched) + "%")

def extract_and_score_text(tags):
    """Run the extraction under four verb-exclusion settings and score each.

    The settings are, in order: no exclusion, ``['VBP']`` (marked "best" by
    the original author), ``['VB']`` and ``['VBP', 'VB']``. One accuracy line
    per setting is printed in that order once all entries are processed.

    Args:
        tags: entries as built by ``process_list_nb``; index 0 is the
            sentence and index 5 the labels used for scoring.

    Returns:
        A list with one ``[sentence, result_0, result_1, result_2, result_3]``
        row per entry, where ``result_n`` is the ``extract_text`` output for
        the n-th setting.
    """
    exclusion_settings = ((), ('VBP',), ('VB',), ('VBP', 'VB'))
    totals = [[0, 0] for _ in exclusion_settings]
    rows = []
    for info in tags:
        # Extract under every setting first, then score, as separate passes.
        per_setting = [
            extract_text(info, list(excluded)) for excluded in exclusion_settings
        ]
        for slot, extracted in enumerate(per_setting):
            totals[slot] = scoring_extraction(extracted, info[5], totals[slot])
        rows.append([info[0]] + per_setting)
    for total in totals:
        print_accuracy(total)
    return rows

In [None]:
# Score the lemmatized test set, then the non-lemmatized one. Each call prints
# four accuracy lines (one per verb-exclusion setting) and returns the
# per-sentence extraction rows.
print("print running lemma version")
df_lemma = extract_and_score_text(tags_lemma)
print("print running non-lemma version")
df_no_lemma = extract_and_score_text(tags_no_lemma)

In [None]:
# Write the lemma-version extraction rows to CSV in the working directory
# (df_no_lemma is not exported here).
pd.DataFrame(df_lemma).to_csv('test_final.csv')