### 1. Read in word document, identify "hot button issue" by checking against pre-defined keyword list 
### 2. Return dictionary {"hot button issue": set of keywords found}

### Note: Keyword needs to include mutations such as " intervene -> intervened, intervines"

In [1]:
import sys
import os
import pickle
import re

import pandas as pd
import numpy as np

from docx import Document

import time
start_time = time.time()

In [2]:
hot_button_file = os.path.join('../../data/raw/hot_button_issues.xlsx')
text_file_path = "../../documentation/sample_docs/5138964-v5-Brazil_2013_Article_IV_Consultation_-_Policy_Note.DOCX"
save_file = os.path.join('../../data/processed/','hot_button_dict.pickle')
print(os.getcwd())

/mnt/notebook/poc


#### Import hot_button_table (manually created)

In [3]:
hot_button_df = pd.read_excel(hot_button_file)

hot_button_df.fillna('', inplace= True)

hot_button_df['keyword list'] = hot_button_df['related words selected'].str.cat(hot_button_df['augmented words from topic modelling'], sep = ', ').str.cat(hot_button_df['augmented words from word2vec'], sep = ', ').str.cat(hot_button_df['search term for word2vec'], sep = ', ')

hot_button_df['keyword list'] = hot_button_df['keyword list'].str.lower().str.replace(r'/|-|_',' ')

hot_button_df[['Hot button issues','keyword list']]

Unnamed: 0,Hot button issues,keyword list
0,Capital flow management measures,"cfm, cfm mpm, capital inflows surge, disruptiv..."
1,Exchange restrictions,"article viii, current international transactio..."
2,Multiple currency practice,"mcp, article viii, effective rate, preferentia..."
3,Corruption,"political risk, bribes, transparency, accounta..."
4,Governance,"public financial management, pfm, anti money l..."
5,Fintech/digital,"artificial intelligence, big data, blockchain,..."
6,Macroprudential measures,"mpm, cfm mpm, systemic risk, ltv, dsti, risk w..."
7,Housing,"affordability, mortgage, house prices, househo..."
8,Demographic,"population, ageing, pension, productivity, mig..."
9,Shadow banking,"non bank credit, credit guarantee, contingent ..."


#### Create dictionary mapping name and regular expression

In [8]:
hot_button_dict = pd.Series(hot_button_df['keyword list'].values,index=hot_button_df['Hot button issues']).to_dict()

hot_button_dict

for k, v in hot_button_dict.items():
    # regular expression, deal with multiplles
    ## 1. adding 's','d'at end
    ## 2. separated from other characters
    ## 3. take care of duplicated space
    hot_button_dict[k] = v.replace(', ', 's?[\s|.,]+|[\s|.,]+').replace('s?[\s|.,]+|[\s|.,]+(s)?[\s|.,]+|[\s|.,]+','|')

hot_button_dict
hot_button_dict

{'Arrears': 'financing assurancess?[\\s|.,]+|[\\s|.,]+misreporteds?[\\s|.,]+|[\\s|.,]+good faiths?[\\s|.,]+|[\\s|.,]+lending into arrears policys?[\\s|.,]+|[\\s|.,]+creditors?[\\s|.,]+|[\\s|.,]+arrearss?[\\s|.,]+|[\\s|.,]+upos?[\\s|.,]+|[\\s|.,]+sonaras?[\\s|.,]+|[\\s|.,]+emaes?[\\s|.,]+|[\\s|.,]+extrabudgetary spendings?[\\s|.,]+|[\\s|.,]+unpaids?[\\s|.,]+|[\\s|.,]+vat refunds?[\\s|.,]+|[\\s|.,]+naftogazs?[\\s|.,]+|[\\s|.,]+unprogrammeds?[\\s|.,]+|[\\s|.,]+reschedules?[\\s|.,]+|[\\s|.,]+outstanings?[\\s|.,]+|[\\s|.,]+repayments?[\\s|.,]+|[\\s|.,]+arrear',
 'Belt and road': 'silk roads?[\\s|.,]+|[\\s|.,]+bris?[\\s|.,]+|[\\s|.,]+belt and roads?[\\s|.,]+|[\\s|.,]+s?[\\s|.,]+|[\\s|.,]+belt and road',
 'Capital flow management measures': 'cfms?[\\s|.,]+|[\\s|.,]+cfm mpms?[\\s|.,]+|[\\s|.,]+capital inflows surges?[\\s|.,]+|[\\s|.,]+disruptive capital outflowss?[\\s|.,]+|[\\s|.,]+depreciation pressuress?[\\s|.,]+|[\\s|.,]+cfmss?[\\s|.,]+|[\\s|.,]+capital flow measures?[\\s|.,]+|[\\s|.,]+cfms

In [5]:
# Save and load for reuse in production

import pickle

pickle.dump(hot_button_dict, open(save_file,'wb'))

hot_button_dict = pickle.load(open(save_file, 'rb'))

#### Load Text File and get search results

In [6]:
def search_wordlist_in_paragarph(wordlist, paragarph):
    '''Check if a paragarph contains any word in list'''
    result = re.findall(wordlist, paragarph)

    return result

def read_doc(f_path,word_length_filter=20):
    '''load an clean document'''
    if os.path.isfile(f_path):
        doc = Document(f_path)
        text_list = [p.text for p in doc.paragraphs if len(p.text)>10]#[3:]
        text_list = [p.replace('\xa0',' ') for p in text_list] # some clean up 
        text_list = [p for p in text_list if len(p.split()) > word_length_filter]
    else:
        raise Exception('File does not exist: {}'.format(f_path))

    return text_list

document = read_doc(text_file_path)

## some replacement of "-/_" to space
document = list(map(lambda x: re.sub(r'—|-|_',' ',x), document))
len(document)

73

In [9]:
result = dict()

for k, v in hot_button_dict.items():
    # iterate over each topic
    found_list = list(map(lambda x: search_wordlist_in_paragarph(v, x), document))
    found_word = [y for x in found_list for y in x if x !=[]]
    if len(found_word) == 0:
        result[k] = None
    else:
        result[k] = set(list(map(lambda x: x.strip(' '),found_word)))

print(result)

print("--- %s seconds ---" % (time.time() - start_time))

{'Capital flow management measures': {'capital flow measures'}, 'Exchange restrictions': {'exchange'}, 'Multiple currency practice': None, 'Corruption': {'transparency,'}, 'Governance': None, 'Fintech/digital': None, 'Macroprudential measures': {'', '.,', '.', ',', 'macroprudential', 'macroprudential tools'}, 'Housing': {'real estate', 'mortgage', 'households', 'property', 'household'}, 'Demographic': {'insurance', 'population', 'pensions,', 'working age', 'productivity.', 'demographic', 'productivity', 'pension'}, 'Shadow banking': {', contingent liabilities'}, 'Competition policy': None, 'Foreign Exchange intervention': {'intervention'}, 'Belt and road': {'', '.,', '.', ','}, 'Arrears': {'repayment'}, 'Debt restructuring': None, 'Financing assurances': {'', '.,', '.', ','}}
--- 22.74454665184021 seconds ---


In [None]:
# test example
list(map(lambda x: x.strip(' '), ['it is good ',' bad apple']))

In [None]:
re.findall('[\b]*gr[\s.,]', r'It''s great gr .')