# Pipeline for pay ratio

In [1]:
import sys
import os
# adds folder ../scripts to look for module imports
sys.path.append('../scripts')

from cleaning import *
from preprocessing import *
from tqdm import tqdm

In [None]:

# Build the table of PhraseMatcher hits for one year of annual reports and
# write it to ../data/processed/ as a CSV.

# Select year (also used in the output file name below)
year = "2020"
directory = r'../data/raw/annual_reports/{}/'.format(year)

# Upper bound on the number of directory entries inspected in one run
max_files = 80

# os.listdir returns entries in arbitrary order, so sort before slicing;
# otherwise the subset of reports processed can differ between machines/runs.
candidate_files = sorted(os.listdir(directory))[:max_files]

# Collect one DataFrame per document and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every iteration.)
match_frames = []
for filename in tqdm(candidate_files):
    if not filename.endswith(".pdf"):
        continue
    path = os.path.join(directory, filename)

    # Run pattern matcher for each document
    match_frames.append(create_match_dataframe(path, 'ceo_pay_ratio'))

    # NOTE(review): the result is never used in this notebook; the call is kept
    # only in case define_company_dictionary has side effects - confirm and drop.
    company_info = define_company_dictionary(path)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory contained no PDFs.
dfs = pd.concat(match_frames) if match_frames else pd.DataFrame()

dfs.to_csv("../data/processed/training_pay_ratio_{}_check.csv".format(year), index=False)

# Identify all matches with a true positive on the same page

In [3]:

# Manually labelled data for 125 documents
labelled = pd.read_csv('../data/processed/training_data_pay_ratio_labelled.csv')

# Phrases identified by the spaCy PhraseMatcher for 2019 and 2020
df_2020 = pd.read_csv("../data/processed/training_pay_ratio_2020.csv")
df_2019 = pd.read_csv("../data/processed/training_pay_ratio_2019.csv")

# Combine all PhraseMatcher output (2019 rows first, then 2020) under a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
training_data = pd.concat([df_2019, df_2020], ignore_index=True)

In [4]:
# Label the PhraseMatcher output.
# The objective is to identify the page of occurrence.
#
# The hand-labelled data contains the page on which the information can be found.
# We therefore label all PhraseMatcher output on the correct page as 1 and
# matches on any other page as 0.

# Number of leading rows of `labelled` that were labelled by hand
n_labelled_docs = 125

training_data['class'] = 0
training_data['training'] = ''

# .head() tolerates a labelled file with fewer rows than n_labelled_docs,
# where a fixed range(0, 125) would raise a KeyError.
labelled_subset = labelled.head(n_labelled_docs)

# Read the values straight from each labelled row instead of re-filtering
# `labelled` by report name: that returned a Series, and int(Series) is
# deprecated in pandas and fails when a report name appears more than once.
for target, present, page in zip(labelled_subset['report'],
                                 labelled_subset['pay_ratio_present'],
                                 labelled_subset['pay_ratio_page_number_pdf']):
    print(target)

    # Only documents whose manual label includes a pay ratio are used
    if int(present) != 1:
        continue
    print('found')

    same_report = training_data['File_name'] == target

    # Matches on the hand-labelled page (in the same document) become class=1
    on_labelled_page = training_data['page'] == int(page)
    training_data.loc[same_report & on_labelled_page, 'class'] = 1

    # Every match of this document is included in the training set
    training_data.loc[same_report, 'training'] = 1

# Optional: persist the labelled matches, e.g.
# training_data.to_csv("../data/processed/training_pay_ratio_2020_new.csv", index=False)

Unilever_Annual_Report_2019.pdf
found
Taylor_Wimpey_Annual_Report_2019.pdf
found
Rightmove_Annual_Report_2019.pdf
found
United_Utilities_Annual_Report_2019.pdf
Berkeley_Group_Annual_Report_2019.pdf
found
Ocado_Annual_Report_2019.pdf
found
M&G_Annual_Report_2019.pdf
found
JD_Sports_Annual_Report_2019.pdf
Severn_Trent_Annual_Report_2019.pdf
British_American_Tobacco_Annual_Report_2019.pdf
found
AstraZeneca_Annual_Report_2019.pdf
found
ITV_Annual_Report_2019.pdf
found
Imperial_Brands_Annual_Report_2019.pdf
found
BAE_Systems_Annual_Report_2019.pdf
found
Glencore_Annual_Report_2019.pdf
found
Smurfit_Kappa_Annual_Report_2019.pdf
found
Lloyds_Banking_Group_Annual_Report_2019.pdf
found
Prudential_Annual_Report_2019.pdf
found
Whitbread_Annual_Report_2019.pdf
Next_Annual_Report_2019.pdf
found
St_James's_Place_Annual_Report_2019.pdf
found
WPP_Annual_Report_2019.pdf
found
GSK_Annual_Report_2019.pdf
found
Informa_Annual_Report_2019.pdf
found
Standard_Chartered_Annual_Report_2019.pdf
found
Intertek_G

# Classification

In [5]:
# `sys` is already imported in the first cell; repeated here so this section
# can be run on its own.
import sys

# Put the project's scripts folder at the front of the module search path.
# Edit this path if the notebook is moved. (The first cell appends the same
# folder as '../scripts'.)
sys.path.insert(0, '../scripts/')
import matplotlib.pyplot as plt

# Project-local module; presumably the source of classify_NLP_df used in the
# training cell - TODO confirm.
from sklearn_classifier import *

In [6]:
# Train the page classifier

# Only documents flagged training == 1 by the labelling step carry a usable
# label, so supervised learning is restricted to those rows.
is_labelled = training_data['training'] == 1
training = training_data[is_labelled].reset_index()

# Fit a MultinomialNB classifier (stratified CV, per the helper's intent) and
# obtain likelihoods for each phrase.
model, output = classify_NLP_df(
    training,
    ngram_range=(1, 3),
    alpha=0.25,
    classifier='MultinomialNB',
)

# Keep column 1 of the likelihood array - presumably the likelihood of class 1.
class_likelihoods = output['likelihood']
training['likelihood'] = class_likelihoods[:, 1]

# Note: the model is tuned to identify the page, and the output metrics reflect that.


accuracy: 0.7464788732394366
precision: 0.7444253859348199
recall: 0.970917225950783
F1 score: 0.8427184466019418
Top ten features that predict 1 report vs 0: 
Class 1 best: 
(-9.71446307734805, ' 1,102,874')
(-9.71446307734805, ' 1,102,874 cfo')
(-9.71446307734805, ' feedback')
(-9.71446307734805, ' feedback agm')
(-9.71446307734805, ' gure ceo')
(-9.71446307734805, ' methodology')
(-9.71446307734805, ' methodology used')
(-9.71446307734805, ' renumeration')
(-9.71446307734805, ' renumeration policy')
(-9.71446307734805, ' reporting')
Class 2 best: 
(-3.587414802027043, 'pay')
(-3.847522437722416, 'ratio')
(-3.904629191222468, 'pay ratio')
(-4.839509840835343, 'ceo')
(-5.061272754593888, 'ceo pay')
(-5.255482052313955, 'ceo pay ratio')
(-5.514222662571994, 'percentile')
(-5.534223751989595, 'ratios')
(-5.59748498652079, 'pay ratios')
(-5.689874647792486, 'executive')





In [None]:
import os
import pickle

# Persist the fitted classifier so it can be reused without retraining.
model_path = '../models/ceo_pay_ratio_model.pkl'

# A fresh checkout may not contain ../models; create it instead of failing
# with FileNotFoundError when opening the file for writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as fp:
    pickle.dump(model, fp)