<a href="https://colab.research.google.com/github/julianikulski/director-experience/blob/main/preprocessing/csr_experience_ETL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preparation for the machine learning model

This file reads in the biographies from S&P Capital IQ and the manually researched DEF 14A statement biographies and chooses 150 random samples from the S&P Capital IQ dataset and 50 random samples from the DEF 14A dataset as the training sample for the fine-tuning of the Longformer model.

These 200 training samples will be manually reviewed and social and/or environmental experience will be flagged so that this dataset can then be used to train the Longformer model which will classify the remaining biographies.

In [None]:
# mount Google Drive (Colab only) so that the files under /content/drive can be read and written
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
from glob import glob
import re
import math
from numpy.random import RandomState


In [None]:
# display up to 500 columns without truncation
pd.options.display.max_columns = 500
# display rows without truncation (currently disabled)
# pd.set_option('display.max_rows', 1000)

## Reading in data

In [None]:
# reading in the excel files with the directors' biographies
file_path = r''
# glob returns the paths in arbitrary order; sorting them makes the row order of
# df_directors (and of everything selected by row position later on) reproducible
all_files = sorted(glob(file_path + '/content/drive/My Drive/director-csr/directors/*.xls'))

# fail early with a clear message, e.g. when Google Drive has not been mounted
if not all_files:
    raise FileNotFoundError('No *.xls files found in /content/drive/My Drive/director-csr/directors')

# one dataframe per file, skipping the first 7 rows above the header
list_df = [pd.read_excel(file, skiprows=7) for file in all_files]

# stack all files into one dataframe with a fresh 0..n-1 index
df_directors = pd.concat(list_df, axis=0, ignore_index=True)


In [None]:
# reading in the constituents file of the S&P 500 Index
sheet_names = ['2015', '2014', '2013', '2012', '2011']

# the context manager closes the workbook handle once all sheets have been parsed
with pd.ExcelFile('/content/drive/My Drive/director-csr/Reuters/SP500.xlsx') as comp_excel:
    # one dataframe per sheet, keyed by the sheet name
    df_dict = {sheet: pd.read_excel(comp_excel, sheet_name=sheet) for sheet in sheet_names}

df_dict.keys()


dict_keys(['2015', '2014', '2013', '2012', '2011'])

In [None]:
# preview the first rows of the 2015 constituents sheet
df_dict['2015'].head(n=5)

Unnamed: 0,Type,NAME,TICKER SYMBOL,NAME.1,ISIN CODE
0,891399,AMAZON.COM,AMZN,AMAZON.COM,US0231351067
1,916328,ABBOTT LABORATORIES,ABT,ABBOTT LABORATORIES,US0028241000
2,545101,AES,AES,AES,US00130H1059
3,906187,INTERNATIONAL BUS.MCHS.,IBM,INTERNATIONAL BUS.MCHS.,US4592001014
4,749382,ADOBE (NAS),ADBE,ADOBE (NAS),US00724F1012


## Merging biographies and companies

In [None]:
# build the 2015 constituents table in one chain instead of mutating it in place:
# rename the relevant columns, drop the irrelevant/duplicate ones and lower-case the company names
df_sp500 = (
    df_dict['2015']
    .rename(columns={'TICKER SYMBOL': 'ticker', 'NAME.1': 'comp_name'})
    .drop(columns=['Type', 'NAME', 'ISIN CODE'])
    # .str.lower() leaves missing names as NaN, whereas calling x.lower() on NaN would raise
    .assign(comp_name=lambda df: df['comp_name'].str.lower())
)


In [None]:
# preview the first rows of the cleaned 2015 constituents table
df_sp500.head(n=5)

Unnamed: 0,ticker,comp_name
0,AMZN,amazon.com
1,ABT,abbott laboratories
2,AES,aes
3,IBM,international bus.mchs.
4,ADBE,adobe (nas)


In [None]:
# preview the first 5 rows of the director dataframe
df_directors.head(n=5)


Unnamed: 0,Person Name,Company Name [Any Professional Record] [Current Matching Results],Exchange:Ticker,Email Address,Professional Titles [Any Professional Record] [Current Matching Results],Colleges/Universities,Degrees,Graduation Year,Majors,Geographic Locations [Any Professional Record] [Current Matching Results],Primary Professional Record,Biographies,Person Locations [Any Professional Record] [Current Matching Results],Person Age,Person Name First,Person Name Last,Person Name Middle,Person Name Nickname,Person Name Prefix,Person Name Suffix,Person Notes,Specialties [Any Professional Record] [Current Matching Results],Year Born,CIK [Any Professional Record] [Current Matching Results],Company CUSIP [Any Professional Record] [Current Matching Results],Primary ISIN [Any Professional Record] [Current Matching Results],Security Tickers [Any Professional Record] [Current Matching Results],SIC Codes (Primary) [Any Professional Record] [Current Matching Results],Company Type [Any Professional Record] [Current Matching Results],Professional Job Functions [Any Professional Record] [Current Matching Results]
0,"Schwarzman, Stephen Allen (Prior Board)",PJT Partners Inc. (NYSE:PJT),NYSE:PJT,Schwarzman@blackstone.com,Former Chairman and Chief Executive Officer,Harvard Business School; Yale University; Quin...,Harvard Business School - MBA; Yale University...,Quinnipiac University (2012),-,United States and Canada (Primary),The Blackstone Group L.P. (NYSE:BX) (Board),"Mr. Stephen Allen Schwarzman, also known as St...",United States of America; Northeast; New York;...,68,Stephen,Schwarzman,Allen,Steve,Mr.,-,,-,1947,0001626115,69343T,US69343T1079,NYSE:PJT; BST:1PJ; DB:1PJ,6282 Investment advice,Public Company,Chief Executive Officer (Prior)
1,"Bovender, Jack O. (Prior Board)","HCA Holdings, Inc. (NYSE:HCA)",NYSE:HCA,-,Former Executive Chairman and Chairman of Exec...,Duke University,Duke University - Bachelor's Degree; Duke Univ...,Duke University (1967),Duke University - Psychology,United States and Canada (Primary),Duke University (Board),"Mr. Jack O. Bovender, Jr., served as the Chair...",United States of America; Southeast; Tennessee...,70,Jack,Bovender,O.,-,Mr.,Jr.,,-,1945,0000311314; 0000732872; 0000860730; 0001392778,40412C,US40412C1018,NYSE:HCA; BAYB:2BH,8062 General medical and surgical hospitals,Public Company,Chief Executive Officer (Prior)
2,"Mandaric, Milan (Prior Board)",Elexsys International,-,,Former Chairman of the Board and Chief Executi...,-,-,-,-,United States and Canada (Primary),"MM Holdings International, Inc. (Board)",Mr. Milan Mandaric serves as Chief Executive O...,United States of America; California; West Coa...,77,Milan,Mandaric,-,-,Mr.,-,,-,1938,0000727010,28626C,-,-,3672 Printed circuit boards,Public Company,Chief Executive Officer (Prior)
3,"Childs, John W. (Prior Board)",JWC Acquisition Corp.,-,jchilds@jwchilds.com,Chairman and Chief Executive Officer,Yale University; Columbia University,Yale University - BA; Columbia University - MBA,-,-,United States and Canada (Primary),"J.W. Childs Associates, L.P. (Board)",Mr. John W. Childs serves as the Chairman and ...,United States of America; Northeast; Massachus...,73,John,Childs,W.,-,Mr.,-,,-,1942,0001498157,46634Y,US46634Y1029,-,9995 Non-operating establishments,Public Company,Chief Executive Officer (Prior)
4,"Vota, John P. (Prior)","Insight Management Corporation, Prior to Rever...",-,-,Former Interim Chief Executive Officer and Int...,Columbia University; Fordham University; Schoo...,Columbia University - Bachelor's Degree; Fordh...,-,-,United States and Canada (Primary),Blackbird Capital Partners,Mr. John P. Vota serves as a Managing Partner ...,United States of America; Northeast; New York;...,76,John,Vota,P.,-,Mr.,-,,-,1939,-,45776Q,US45776Q3074,-,-,Public Company,Chief Executive Officer (Prior)


In [None]:
# renaming some df columns
df_directors = df_directors.rename(columns={'Person Name': 'name',
                                            'Company Name [Any Professional Record] [Current Matching Results]': 'comp_name',
                                            'Exchange:Ticker': 'ticker',
                                            'Biographies': 'bio'
                                            })

# keep only the columns needed for now; .copy() makes df_dir_upper an independent
# dataframe so that later column assignments do not raise SettingWithCopyWarning
df_dir_upper = df_directors[['name', 'ticker', 'bio']].copy()


In [None]:
# all column names of the directors dataframe as a plain Python list
df_directors.columns.tolist()


['name',
 'comp_name',
 'ticker',
 'Email Address',
 'Professional Titles [Any Professional Record] [Current Matching Results]',
 'Colleges/Universities',
 'Degrees',
 'Graduation Year',
 'Majors',
 'Geographic Locations [Any Professional Record] [Current Matching Results]',
 'Primary Professional Record',
 'bio',
 'Person Locations [Any Professional Record] [Current Matching Results]',
 'Person Age',
 'Person Name First',
 'Person Name Last',
 'Person Name Middle',
 'Person Name Nickname',
 'Person Name Prefix',
 'Person Name Suffix',
 'Person Notes',
 'Specialties [Any Professional Record] [Current Matching Results]',
 'Year Born',
 'CIK [Any Professional Record] [Current Matching Results]',
 'Company CUSIP [Any Professional Record] [Current Matching Results]',
 'Primary ISIN [Any Professional Record] [Current Matching Results]',
 'Security Tickers [Any Professional Record] [Current Matching Results]',
 'SIC Codes (Primary) [Any Professional Record] [Current Matching Results]',
 'Com

In [None]:
# count the entries whose ticker is the '-' placeholder, i.e. that have no company ticker
no_ticker_mask = df_dir_upper['ticker'].eq('-')
df_dir_upper.loc[no_ticker_mask].count()


name      11641
ticker    11641
bio       11641
dtype: int64

In [None]:
def _strip_exchange(ticker):
    """Return the part after the exchange prefix (e.g. 'NYSE:PJT' -> 'PJT').

    Values without a ':' and non-string values (such as NaN) are returned unchanged.
    """
    if isinstance(ticker, str) and ':' in ticker:
        return ticker.split(':')[1]
    return ticker


# clean up the ticker column and remove the stock exchange information;
# assign() returns a new dataframe and therefore does not raise SettingWithCopyWarning
df_dir_upper = df_dir_upper.assign(ticker=df_dir_upper['ticker'].map(_strip_exchange))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [None]:
# number of distinct ticker values in this dataset (the '-' placeholder counts as one value)
comp_numb = df_dir_upper['ticker'].nunique(dropna=False)
print('Number of unique companies in dataframe:', comp_numb)


Number of unique companies in dataframe: 10846


In [None]:
# how many director entries are in this dataframe: shape is (number of rows, number of columns)
df_dir_upper.shape


(38827, 3)

In [None]:
# checking for duplicate entries by counting the distinct biography texts
unique_bio_count = df_dir_upper['bio'].nunique(dropna=False)
print('Number of unique directors in dataframe:', unique_bio_count)


Number of unique directors in dataframe: 32424


In [None]:
# inner join on the ticker: keeps only the directors whose ticker matches an S&P500 constituent
# NOTE(review): the exchange prefix was removed from the director tickers beforehand, so the
# same symbol listed on a different exchange would match as well - verify the merged rows
df_dir_sp500 = pd.merge(df_sp500, df_dir_upper, how='inner', on='ticker')
df_dir_sp500


Unnamed: 0,ticker,comp_name,name,bio
0,AMZN,amazon.com,"Bezos, Jeffrey P. (Board)","Mr. Jeffrey P. Bezos, also known as Jeff found..."
1,ABT,abbott laboratories,"White, Miles D. (Board)",Mr. Miles D. White serves as the Chairman and ...
2,ABT,abbott laboratories,"Livingston, John (Prior Board)",Mr. John Livingston served as the Chief Execut...
3,ABT,abbott laboratories,"Haydon, Geoff (Board)",Mr. Geoff Haydon has been the Chief Executive ...
4,ABT,abbott laboratories,"Olsen, Errol",Mr. Errol Olsen has been the Chief Financial O...
...,...,...,...,...
1409,MNST,monster beverage,"Sacks, Rodney C. (Board)","Mr. Rodney C. Sacks, H.Dip.Law, H.Dip.Tax has ..."
1410,BXLT,baxalta dead - delist.03/06/16,"Hantson, Ludwig N. (Board)","Dr. Ludwig N. Hantson, Ph.D. serves as the Chi..."
1411,TGNA,tegna,"Martore, Gracia C. (Board)",Ms. Gracia C. Martore has been the President o...
1412,TGNA,tegna,"Dubow, Craig A. (Prior Board)",Mr. Craig A. Dubow served as Chief Executive O...


### Writing the merged S&P500 and director biographies dataframe to an Excel file


In [None]:
# export the merged S&P500 director biographies to an Excel file
sp500_bio_path = '/content/drive/My Drive/director-csr/sp500_biographies_2015.xlsx'
df_dir_sp500.to_excel(sp500_bio_path)


## Randomly selecting 100 examples for manual review and train, val, test datasets

In [None]:
# draw 100 integers from [0, 1413) with a fixed seed; randint samples with replacement
number_42 = RandomState(seed=42)
train_100 = list(number_42.randint(low=0, high=1413, size=100))
print(train_100)


[1126, 860, 1294, 1130, 1095, 1044, 121, 466, 1238, 330, 87, 1396, 1123, 871, 130, 1332, 769, 343, 805, 385, 1215, 955, 276, 1184, 459, 1337, 21, 252, 747, 856, 474, 1082, 510, 699, 975, 189, 957, 686, 957, 562, 1267, 831, 1154, 646, 20, 840, 166, 1297, 387, 600, 315, 13, 241, 776, 1369, 564, 897, 1363, 91, 1390, 955, 508, 775, 34, 205, 1104, 1411, 1025, 1021, 565, 1129, 702, 401, 729, 161, 201, 995, 269, 815, 1294, 455, 1275, 1016, 295, 719, 337, 878, 1076, 791, 216, 763, 187, 379, 492, 1064, 1180, 14, 64, 520, 1367]


In [None]:
# remove duplicates: collapse repeated draws so that every number occurs only once
unique_draws = set(train_100)
train_100 = list(unique_draws)



Unfortunately, the above list of random integers contains 3 duplicates which were not filtered out before the manual review started. Therefore, a list of three random integers will be generated to add these to the list of 97 to get a full 100 samples.

In [None]:
# draw 3 additional integers from [0, 1413) with a different fixed seed
number_21 = RandomState(seed=21)
train_3 = list(number_21.randint(low=0, high=1413, size=3))
train_3


[969, 1231, 772]

In [None]:
# append the 3 additional numbers and count the distinct values of the combined list
train_100 += train_3
len(set(train_100))


100

In [None]:
# add further examples because 100 examples were too few; 55 numbers are drawn here
# (presumably as a buffer so that 50 remain after removing duplicates)
number_13 = RandomState(seed=13)
train_50 = list(number_13.randint(low=0, high=1413, size=55))
# number of distinct values among the new draws
len(set(train_50))


53

In [None]:
# values that occur both in the previous 100 examples and in the new draws
duplicates = set(train_100).intersection(set(train_50))
print('Duplicates in train_50:', duplicates)
# keep only the new values that were not drawn before; train_50 is a set from here on
train_50 = set(train_50).difference(duplicates)
print('Unique values in train_50 after deletion:', len(train_50))
print('Duplicates in both train_100 and train_50 after deletion:', set(train_50).intersection(set(train_100)))


Duplicates in train_50: {1184, 1267, 166}
Unique values in train_50 after deletion: 50
Duplicates in both train_100 and train_50 after deletion: set()


Because the above list of numbers contains 53 unique items, the three duplicates can be disregarded and we will still have 150 examples in total.

In [None]:
# combine the 100 with the 50 new examples
train_100.extend(train_50)
# get the randomly chosen rows by position; .copy() makes train_df an independent
# dataframe so that later column assignments do not raise SettingWithCopyWarning
train_df = df_dir_sp500.iloc[train_100, :].copy()
train_df.head()


Unnamed: 0,ticker,comp_name,name,bio
1025,STT,state street,"Maier, Robert K. (Prior Board)",Mr. Robert K. Maier has been the President of ...
520,HAR,harman intl.inds. dead - delist.13/03/17,"Girod, Bernard A. (Prior Board)",Mr. Bernard A. Girod served as Chief Executive...
13,ADBE,adobe (nas),"Chizen, Bruce R. (Prior Board)",Mr. Bruce R. Chizen is a Venture Partner and M...
14,ADBE,adobe (nas),"Narayen, Shantanu (Board)",Mr. Shantanu Narayen has been Chief Executive ...
1044,STI,suntrust banks dead - delist.09/12/19,"Rogers, William Henry (Board)","Mr. William Henry Rogers, also known as Bill, ..."


In [None]:
# final check for duplicates: number of distinct row labels in the sample
# (len(train_df.index) would also count rows that were selected more than once)
train_df.index.nunique()


150

### Write biography review sample from S&P Capital IQ to Excel file

An important thing to note: I will export this dataframe to an excel file and manually review these biographies to get training and testing data sets.

I will tag everything as 1 in the target values if the words match something that could be CSR-related, even if it is or could be greenwashing. The machine won't know that just from the words. Once I have tagged, trained, tested, and run the model and I get results back, I will check whether the flagged people are actually green/social.

In biography 1282, "adult literacy and workforce development" was found and coded as 1 for social. This should be included in the keyword list in the thesis.

In [None]:
# export only the biography texts of the review sample to an Excel file
train_df_bio = train_df.loc[:, ['bio']]
train_150_path = '/content/drive/My Drive/director-csr/review_data/train_150.xlsx'
train_df_bio.to_excel(train_150_path)


### Create manual review sample from director data that was manually researched from DEF 14As

When I first created the above manual review sample of biographies from the S&P Capital IQ biography dataset, I was under the impression that all of the relevant directors in my overall analysis sample were included in it. However, after I gathered the board membership data and cleaned the data (in the `biography_matching.ipynb` notebook), it turned out that only about 1,200 directors were included. Therefore, I had to manually research the remaining roughly 4,100 directors and their biographies from DEF 14As. 

The structure, layout and content of the biographies from the S&P Capital IQ dataset and the DEF 14As are very similar. Because certain formulations differ, such as the mention of committee memberships, I will create another manual review sample of 50 biographies from the DEF 14As so that my machine learning model will have seen both types of biographies in the training phase.


In [None]:
# read in the overall director sample including their biographies and
# discard the 'Unnamed: 0' column that is read from the file
dir_sample_df = (
    pd.read_excel('/content/drive/My Drive/director-csr/complete_sample.xlsx')
    .drop(columns=['Unnamed: 0'])
)
dir_sample_df.head()


Unnamed: 0,name,age,last_position,director_start,director_end,executive_start,executive_end,comp_name,ticker,missing_start_date,2011,2012,2013,2014,2015,current_position,dir_exec,in_position,isin,org_name,unique_dir_id,all_years,biographies,board_committee,committee,comm_type,comm_start,comm_end,list_years_if_non_consecutive,2011_comm,2012_comm,2013_comm,2014_comm,2015_comm
0,christina gold,72.0,independent director,0.0,0.0,1997.0,2020.0,itt inc,itt,0.0,1,1,1,1,1,,,,us45073v1089,ms. christina gold,7917,no,Mrs. Christina A. Gold has been the Chief Exec...,,,,,,,0,0,0,0,0
1,frank macinnis,72.0,independent chairman of the board,2011.0,2020.0,2001.0,2020.0,itt inc,itt,0.0,1,1,1,1,1,,,,us45073v1089,mr. frank macinnis,3325,no,Mr. Frank T. MacInnis serves as the President ...,,,,,,,0,0,0,0,0
2,denise ramos,63.0,"president, chief executive officer, director",2011.0,2019.0,2011.0,2019.0,itt inc,itt,0.0,1,1,1,1,1,,,,us45073v1089,ms. denise ramos,7996,no,Ms. Denise L. Ramos serves as the Chief Execut...,,,,,,,0,0,0,0,0
3,orlando ashford,51.0,,,,,,itt inc,itt,,0,1,1,1,1,independent director,2012.0,2012.0,us45073v1089,mr. orlando ashford,5733,no,"Orlando D. Ashford, 47, has served as the Pres...",,,,,,,0,0,0,0,0
4,donald defosset,72.0,,,,,,itt inc,itt,,0,1,1,1,1,independent director,2012.0,2012.0,us45073v1089,"mr. donald (don) defosset , jr.",2984,no,"Mr. Donald DeFosset, Jr., also known as Don, B...",,,,,,,0,0,0,0,0


In [None]:
# shape of the director dataframe as (number of rows, number of columns)
dir_sample_df.shape


(6595, 34)

In [None]:
# clean the sp500 bio data so that it will compare to the cleaned bios in the dir_sample_df:
# every run of whitespace (including newlines and tabs) becomes a single space and the ends
# are trimmed; the raw-string pattern avoids the invalid '\s' escape, and assign() returns
# a new dataframe so that no SettingWithCopyWarning is raised
train_df = train_df.assign(
    bio=train_df['bio'].str.replace(r'\s+', ' ', regex=True).str.strip()
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# how many unique directors are included in this sample
num_unique_dirs = dir_sample_df['unique_dir_id'].nunique(dropna=False)
print('Unique directors included in dataset:', num_unique_dirs)
# keep the first row of every unique_dir_id
unique_dirs_df = dir_sample_df.drop_duplicates(subset=['unique_dir_id'], keep='first')
# shape of the dataframe while it still includes the bios used in the SP Capital IQ review sample
print(unique_dirs_df.shape)
# remove the rows whose biography is already part of the SP Capital IQ review sample
reviewed_bios = train_df['bio'].values
unique_dirs_df = unique_dirs_df.loc[~unique_dirs_df['biographies'].isin(reviewed_bios)]
# shape of the dataframe after removing those bios
print(unique_dirs_df.shape)
# number of remaining directors, used as the upper bound for the random draw
num_dirs_new = len(unique_dirs_df)
# all remaining index labels
unique_index = unique_dirs_df.index
unique_index

Unique directors included in dataset: 5321
(5321, 34)
(5249, 34)


Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            6576, 6582, 6583, 6584, 6585, 6586, 6587, 6590, 6591, 6592],
           dtype='int64', length=5249)

In [None]:
# randomly draw 50 row positions from [0, num_dirs_new) with a fixed seed;
# randint samples with replacement
number_21 = RandomState(seed=21)
train_50 = list(number_21.randint(low=0, high=num_dirs_new, size=50))


In [None]:
# show the 50 drawn row positions
print(train_50)

[772, 48, 1144, 4706, 4298, 840, 1646, 447, 829, 233, 1789, 2352, 3310, 2916, 5094, 2644, 2935, 1208, 681, 486, 1765, 4146, 952, 1772, 2530, 1797, 2416, 2296, 929, 780, 2288, 3974, 4954, 5148, 3209, 464, 1157, 4331, 3001, 4484, 3870, 1766, 2969, 1440, 2810, 1754, 1136, 4505, 3279, 4277]


In [None]:
# get the review sample: select the drawn rows by position, keeping all columns
train_50_review = unique_dirs_df.iloc[train_50]
train_50_review.head()


Unnamed: 0,name,age,last_position,director_start,director_end,executive_start,executive_end,comp_name,ticker,missing_start_date,2011,2012,2013,2014,2015,current_position,dir_exec,in_position,isin,org_name,unique_dir_id,all_years,biographies,board_committee,committee,comm_type,comm_start,comm_end,list_years_if_non_consecutive,2011_comm,2012_comm,2013_comm,2014_comm,2015_comm
794,bonnie brooks,66.0,independent director,0.0,0.0,2014.0,2018.0,abercrombie & fitch co,anf,0.0,0,0,0,1,1,,,,us0028962076,ms. bonnie brooks,7859,no,"Since February 2014, Ms. Brooks has served as ...",,,,,,,0,0,0,0,0
48,mary dillon,58.0,,,,,,ulta beauty inc,ulta,,0,0,1,1,1,"chief executive officer, director",2013.0,2013.0,us90384s3031,ms. mary dillon,8485,no,Ms. Mary N. Dillon has been the Chief Executiv...,,,,,,,0,0,0,0,0
1199,thomas lynch,58.0,"chief scientific officer, executive vice presi...",2017.0,2019.0,2014.0,2017.0,bristol-myers squibb co,bmy,0.0,0,0,0,1,1,,,,us1101221083,"dr. thomas (tom) lynch , jr. m.d. ph.d.",999,no,"Dr. Lynch, age 55, has served as Chairman and ...",yes,directors and corporate governance committee,"social, environmental",2014.0,2015.0,,0,0,0,1,1
5752,irene miller,68.0,lead independent outside director,0.0,0.0,2001.0,2014.0,tapestry inc,tpr,0.0,1,1,1,1,0,,,,us8760301072,ms. irene miller,8138,no,Irene Miller has served as a member of Coach’s...,,,,,,,0,0,0,0,0
5194,john pinkerton,65.0,director,1990.0,2013.0,1988.0,2016.0,range resources corp,rrc,0.0,1,1,1,1,1,,,,us75281a1097,mr. john pinkerton,4599,no,John H. Pinkerton became a director in 1988 an...,,,,,,,0,0,0,0,0


In [None]:
# export only the biography texts of the second review sample to an Excel file
train_50_review = train_50_review.loc[:, ['biographies']]
second_50_path = '/content/drive/My Drive/director-csr/review_data/train_second_50.xlsx'
train_50_review.to_excel(second_50_path)
