<a href="https://colab.research.google.com/github/julianikulski/director-experience/blob/main/model/baseline_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline models 
This file serves two main purposes. First, it creates various baseline models that can be used as a comparison for the performance of the deep learning NLP model which is used to classify director biographies as having environmental and/or social experience. Second, it determines the maximum length that the NLP model needs to accept based on the length of the biographies in the company sample.

In [None]:
# connecting to Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# installing necessary packages
!pip install transformers==4.2.2 &> /dev/null
!pip install pytorch-lightning==1.2.6 &> /dev/null


In [None]:
import numpy as np
import pandas as pd
from glob import glob
import re
import math
from numpy.random import RandomState
from tqdm.notebook import tqdm
from datetime import datetime

from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, precision_recall_curve, fbeta_score
from sklearn.dummy import DummyClassifier

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup, AdamW, pipeline
import pytorch_lightning as pl

import torch


In [None]:
# display columns without truncation
pd.set_option('display.max_columns', 500)


## Reading in reviewed data

The review will happen outside of Colab. After I am finished with the manual review, I will upload the file with the two target values here. There are two manual review files. One contains 150 samples from the S&P Capital IQ biographies dataset and one contains an additional 50 samples from the manual DEF 14A review.

In [None]:
# read in first review data file: one dataframe per sheet, keyed by the sheet name
sheet_names = ['reviewed', 'explanations']

# the context manager closes the workbook handle once all sheets are loaded
with pd.ExcelFile('/content/drive/My Drive/director-csr/review_data/train_rev.xlsx') as bio_excel:
    df_reviewed = {sheet: pd.read_excel(bio_excel, sheet_name=sheet) for sheet in sheet_names}

df_reviewed.keys()


dict_keys(['reviewed', 'explanations'])

In [None]:
# read in the second review data file, drop the stored index column, align the
# name of the text column with the first review file and keep text + labels only
bio_50_rev_df = (
    pd.read_excel('/content/drive/My Drive/director-csr/review_data/train_second_rev.xlsx')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'biographies': 'bio'})
    [['bio', 'social', 'environmental']]
)
bio_50_rev_df.head()


Unnamed: 0,bio,social,environmental
0,"Since February 2014, Ms. Brooks has served as ...",1,0
1,Ms. Mary N. Dillon has been the Chief Executiv...,1,0
2,"Dr. Lynch, age 55, has served as Chairman and ...",0,0
3,Irene Miller has served as a member of Coach’s...,0,0
4,John H. Pinkerton became a director in 1988 an...,0,0


In [None]:
# show the list of all reviewed examples
df_bio_tags = df_reviewed['reviewed'][['bio', 'social', 'environmental']]
df_bio_tags.head()


Unnamed: 0,bio,social,environmental
0,"Mr. Matthew E. Massengill, also known as Matt,...",1,1
1,"Mr. Michael H. Dilger, also known as Mick, has...",0,0
2,Mr. John R. Hislop has been Chief Executive Of...,0,0
3,Mr. John F. Coyne served as the Chief Executiv...,0,0
4,"Mr. John J. Koraleski, also known as Jack, ser...",1,0


In [None]:
# combine both review datasets
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same
# way and, like append, keeps the original row labels of both frames
all_rev_df = pd.concat([df_bio_tags, bio_50_rev_df])
# number of unique biographies in the combined dataset
len(all_rev_df['bio'].unique())


200

In [None]:
# write the all_rev_df as a csv file so that it can be easily read in the training notebook
all_rev_df.to_csv('/content/drive/My Drive/director-csr/review_data/all_200_rev.csv')


## Preprocessing

The steps are based on https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb#scrollTo=voWiw8C7IrJV

In [None]:
# look at the balance of the tags
all_rev_df.describe()


Unnamed: 0,social,environmental
count,200.0,200.0
mean,0.335,0.1
std,0.473175,0.300753
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,0.0
max,1.0,1.0


In [None]:
# check how many occurrences of environmental, social and both tags appear in the dataset
def show_share(df):
    '''
    Function to show the share of occurrences of each label combination in the dataframe
    Args: df = dataframe with the columns 'environmental', 'social' and 'bio'
    Returns: df_share = dataframe with one row per (environmental, social)
             combination, the per-group counts of the remaining columns and a
             'share' column holding the 'bio' count divided by the number of
             rows in df
    '''
    df_share = df.groupby(['environmental', 'social'], as_index=False).count()
    # vectorised column division instead of a row-wise apply
    df_share['share'] = df_share['bio'] / len(df)

    return df_share


In [None]:
# check the proportions of the original dataset
show_share(all_rev_df)


Unnamed: 0,environmental,social,bio,share
0,0,0,129,0.645
1,0,1,51,0.255
2,1,0,4,0.02
3,1,1,16,0.08


In [None]:
# split data into training and testing set
# because the combination environmental = 1 and social = 0 is very rare, I cannot
# stratify the datasets for both columns, so only the environmental label is used
# NOTE(review): this comment used to say "only 1 case"; the share table printed
# for the full dataset lists 4 such rows - confirm which number is current
# I looked at the distribution and this seems to be as close to the original distribution
# as possible with a small size like this
# first split: 80% train_val / 20% test; second split: train_val into 80% train / 20% val
train_val, test = train_test_split(all_rev_df, test_size=0.2, random_state=42, stratify=all_rev_df['environmental'])
train, val = train_test_split(train_val, test_size=0.2, random_state=42, stratify=train_val['environmental'])
print(show_share(train_val))


   environmental  social  bio    share
0              0       0  100  0.62500
1              0       1   44  0.27500
2              1       0    3  0.01875
3              1       1   13  0.08125


In [None]:
# check the proportions of the new train, val, and test datasets
print(show_share(train))
print(show_share(val))
print(show_share(test))


   environmental  social  bio     share
0              0       0   79  0.617188
1              0       1   36  0.281250
2              1       0    3  0.023438
3              1       1   10  0.078125
   environmental  social  bio    share
0              0       0   21  0.65625
1              0       1    8  0.25000
2              1       1    3  0.09375
   environmental  social  bio  share
0              0       0   29  0.725
1              0       1    7  0.175
2              1       0    1  0.025
3              1       1    3  0.075


## Reading in entire biography data set for later testing and for pre-processing


In [None]:
# read in file and drop the stored index column without mutating in place
all_bios_df = (
    pd.read_csv('/content/drive/My Drive/director-csr/complete_sample_no_missing.csv')
    .drop(columns=['Unnamed: 0'])
)
# remove all duplicate director entries from this dataset
# the explicit copy makes all_bios_df_unique an independent frame, so adding
# columns to it later does not raise a SettingWithCopyWarning
all_bios_df_unique = all_bios_df.drop_duplicates(subset=['unique_dir_id']).copy()
all_bios_df_unique.head()


Unnamed: 0,name,age,last_position,director_start,director_end,executive_start,executive_end,comp_name,ticker,missing_start_date,2011,2012,2013,2014,2015,current_position,dir_exec,in_position,isin,org_name,unique_dir_id,all_years,bio,board_committee,committee,comm_type,comm_start,comm_end,list_years_if_non_consecutive,2011_comm,2012_comm,2013_comm,2014_comm,2015_comm
0,james mccann,68.0,independent director,0.0,0.0,2004.0,2019.0,willis towers watson plc,wltw,0.0,1,1,1,1,1,,,,ie00bdb6q211,"mr. james mccann , iii",4129,no,"Mr. James F. McCann, also known as Jim, Founde...",,,,,,,0,0,0,0,0
1,jeffrey ubben,58.0,independent director,0.0,0.0,2013.0,2017.0,willis towers watson plc,wltw,0.0,0,0,1,1,1,,,,ie00bdb6q211,mr. jeffrey (jeff) ubben,4249,no,"Mr. Ubben, age 54, joined the Willis Towers Wa...",,,,,,,0,0,0,0,0
2,dominic casserley,58.0,"president, deputy chief executive officer, dir...",2013.0,2016.0,2013.0,2016.0,willis towers watson plc,wltw,0.0,0,0,1,1,1,,,,ie00bdb6q211,mr. dominic casserley,2968,no,"Mr. Casserley, age 58, has served as President...",,,,,,,0,0,0,0,0
3,sir roy gardner,,independent director,0.0,0.0,2006.0,2015.0,willis towers watson plc,wltw,0.0,1,1,1,1,1,,,,ie00bdb6q211,sir roy gardner,9219,no,"Sir Roy Gardner, age 69, joined the Board on A...",,,,,,,0,0,0,0,0
4,sir jeremy hanley,72.0,independent director,0.0,0.0,2006.0,2015.0,willis towers watson plc,wltw,0.0,1,1,1,1,1,,,,ie00bdb6q211,sir jeremy hanley,9202,no,"Sir Jeremy Hanley, age 69, joined the Board on...",,,,,,,0,0,0,0,0


In [None]:
# number of unique directors with unique bios in the dataset
all_bios_df_unique.shape


(5276, 34)

# Creating baseline models

Two different baseline models will be used to compare my model results to later on. The first model takes the Dummy estimator from sklearn and uses its stratified option which makes random predictions in accordance with the training set distribution of class labels.

The second model is the [zero-shot-classification model](https://github.com/huggingface/transformers/pull/5760) implemented in a pipeline version by Hugging Face. This model can predict user-specified labels for text which the model was not trained on. Multi-label classification is supported. The maximum token length is set to 1024, which means that some bios will be truncated and information may be lost. 

## DummyClassifier

In [None]:
# the dummyclassifier implementation is based on https://machinelearningmastery.com/naive-classifiers-imbalanced-classification-metrics/
def eval_models(X, y, name, model, n_splits=4, n_repeats=5, random_state=42):
    '''
    Function to evaluate the different baseline models with repeated
    stratified k-fold cross-validation.
    The cross_val_score function cannot be used because stratification only
    works with single-label cases. In this multi-label case the folds are
    stratified manually based on the environmental label only. A combination
    of both labels is not used for stratification because the rarest label
    combination has too few samples to be spread over the folds.
    Args: X = pandas Series of str; biographies
          y = pandas DataFrame of int with the columns 'social' and
              'environmental'; labels
          name = str; name of the model (only used in the timing message)
          model = object; estimator with fit and predict, refitted on every fold
          n_splits = int; number of folds per repeat
          n_repeats = int; number of times the k-fold split is repeated
          random_state = int; seed for the fold assignment
    Returns: social_score, environmental_score = lists of float; one f1 score
             per fold (n_splits * n_repeats entries each)
    '''

    # f1_score(..., average=None) returns one score per label column in the
    # order of y's columns, so the positions are looked up by name instead of
    # assuming that 'social' comes first
    label_order = list(y.columns)
    social_pos = label_order.index('social')
    environmental_pos = label_order.index('environmental')

    social_score = []
    environmental_score = []

    # getting the start time
    start_time = datetime.now()

    # instantiate the Kfold object
    cross_val = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    for train_index, val_index in cross_val.split(X, y['environmental']):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        predictions = model.predict(X.iloc[val_index])
        scores = f1_score(y.iloc[val_index], predictions, average=None)
        social_score.append(scores[social_pos])
        environmental_score.append(scores[environmental_pos])

    # get final time
    end_time = datetime.now()
    print('Total running time of', name, (end_time - start_time).total_seconds())

    return social_score, environmental_score


In [None]:
def get_models():
    '''
    Function to instantiate the different Dummy Classifier baselines
    Args: None
    Returns: models = list of DummyClassifier objects
             names = list of str; display name of each model, in the same order
    '''
    # each baseline is declared once as a (display name, estimator) pair
    baselines = [
        ('Uniform random guess', DummyClassifier(strategy='uniform', random_state=42)),
        ('Stratified', DummyClassifier(strategy='stratified', random_state=42)),
        ('Constant', DummyClassifier(strategy='constant', constant=[0,0], random_state=42)),
    ]

    names = [label for label, _ in baselines]
    models = [estimator for _, estimator in baselines]

    return models, names


In [None]:
# prepare the data
X = all_rev_df['bio']
y = all_rev_df[['social', 'environmental']]



In [None]:
# evaluate the different naive baseline models and report mean and standard
# deviation of the per-fold f1 scores for each label
models, names = get_models()

for name, model in zip(names, models):
    social_score, environmental_score = eval_models(X, y, name, model)
    for label, fold_scores in (('Social:', social_score), ('Environmental:', environmental_score)):
        print('Baseline type:', name, label, 'Average f1 score:', np.mean(fold_scores), 'Standard deviation:', np.std(fold_scores))


Total running time of Uniform random guess 0.05807
Baseline type: Uniform random guess Social: Average f1 score: 0.42531900327758426 Standard deviation: 0.10814917264561477
Baseline type: Uniform random guess Environmental: Average f1 score: 0.17941176470588233 Standard deviation: 0.08208991609972273
Total running time of Stratified 0.06031
Baseline type: Stratified Social: Average f1 score: 0.32163636001970264 Standard deviation: 0.08865527222440235
Baseline type: Stratified Environmental: Average f1 score: 0.11111111111111112 Standard deviation: 0.11111111111111112
Total running time of Constant 0.062293
Baseline type: Constant Social: Average f1 score: 0.0 Standard deviation: 0.0
Baseline type: Constant Environmental: Average f1 score: 0.0 Standard deviation: 0.0


## Zero-shot Classification

In [None]:
# instantiate classifier
# the checkpoint is named explicitly (it is the one the loading log reports for
# this pipeline) so the baseline does not depend on the library default, and the
# GPU is only requested when one is available
zero_shot_classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=0 if torch.cuda.is_available() else -1, # 0 = first GPU, -1 = CPU
)


Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartModel: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification m

In [None]:
# specify labels
candidate_labels = ['social', 'environmental', 'general']
# to increase the performance of the zero shot learning, we can add a hypothesis_template
# which gives a hint to the model to understand what broad category to look for
# hypothesis_template = 'This person has career experience in {} areas.'
hypothesis_template = 'This example is {}.'
# default hypothesis is 'This example is {class_name}.'


In [None]:
# predict every bio once with the zero-shot model and evaluate the predictions
# on the validation part of each cross_val split
social_score_list = []
environmental_score_list = []

# getting the start time
start_time = datetime.now()

# the zero-shot model is never fitted on the training folds, so each bio only
# has to be classified once instead of once per repeat of the cross-validation
social_pred_list = []
environmental_pred_list = []
for text in X:
    predictions = zero_shot_classifier(text, candidate_labels, hypothesis_template=hypothesis_template, multi_class=True)
    # the pipeline returns the labels sorted by score, not in the order of
    # candidate_labels, so the scores have to be looked up by label name
    score_by_label = dict(zip(predictions['labels'], predictions['scores']))
    social_pred_list.append(1 if score_by_label['social'] > 0.5 else 0)
    environmental_pred_list.append(1 if score_by_label['environmental'] > 0.5 else 0)

# arrays allow positional selection with the fold indices
social_pred_all = np.array(social_pred_list)
environmental_pred_all = np.array(environmental_pred_list)

# instantiate the Kfold object
# NOTE: eval_models uses n_splits=4 for the DummyClassifier baselines
cross_val = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
# the folds are stratified on the environmental label only
for train_index, val_index in cross_val.split(X, y['environmental']):
    social_score_list.append(f1_score(y.iloc[val_index]['social'], social_pred_all[val_index]))
    environmental_score_list.append(f1_score(y.iloc[val_index]['environmental'], environmental_pred_all[val_index]))

# get final time
end_time = datetime.now()
print('Total running time of zero-shot classification', (end_time - start_time).total_seconds())

print('Social f1 score:', np.mean(social_score_list))
print('Environmental f1 score:', np.mean(environmental_score_list))


Total running time of zero-shot classification 193.534237
Social f1 score: 0.5339294185263196
Environmental f1 score: 0.21533872340124213


# Defining classes necessary for fine-tuning

I will first use the default max_token_len of Longformer, which is 4096, to determine how long the longest tokenized bio actually is. I will use this maximum value as the max_token_len. This way I ensure that first, I am not going to cut off important information if the max_token_len is too short. And second, I will use less compute and RAM during the fine-tuning process if I don't unnecessarily use the default max_token_len.

In [None]:
# This will be an alternative to the BERT model to account for the length of the bios
# which go beyond BERTs max length of 512 tokens
model_name = 'allenai/longformer-base-4096'
num_labels = 2
# for longformer model
max_token_len = 4096 # this is the default token length

tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# create class for custom dataset based on https://huggingface.co/transformers/custom_datasets.html and this https://www.youtube.com/watch?v=wG2J_MJEjSQ
class DirectorDataset(torch.utils.data.Dataset):
    '''
    Class to create a PyTorch Dataset which will be 
    necessary to pass the tokens to the fine-tuning trainer
    It will read in the data in dataframe format and tokenize each
    biography when the corresponding item is requested
    '''
    def __init__(self, data: pd.DataFrame, tokenizer, max_token_len):
        '''
        Instantiate the object
        Args: data = dataframe with the columns 'bio', 'social' and 'environmental'
              tokenizer = object providing encode_plus (e.g. the tokenizer
                          returned by AutoTokenizer.from_pretrained)
              max_token_len = int; every item is padded/truncated to this length
        '''
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __getitem__(self, idx: int):
        '''
        Structure the data of row idx into a dictionary containing the raw
        text (feature), input_ids, attention_mask, and labels
        '''
        row = self.data.iloc[idx]
        feature = row['bio']

        encoding = self.tokenizer.encode_plus(
            feature,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # the row is an object-dtype Series (text and numbers mixed), so the two
        # labels are converted to floats explicitly instead of handing the
        # Series to the legacy torch.FloatTensor constructor
        labels = torch.tensor(
            [float(row['social']), float(row['environmental'])],
            dtype=torch.float32
        )

        return dict(
            feature=feature, 
            # return_tensors='pt' adds a leading batch dimension which is removed here
            input_ids=encoding['input_ids'].flatten(),
            attention_mask=encoding['attention_mask'].flatten(),
            labels=labels
        )

    def __len__(self):
        '''
        Show the length of the dataset (number of rows in the dataframe)
        '''
        return len(self.data)


In [None]:
# add placeholder columns for the labels to be able to use the DirectorDataset class
# assign returns a new frame, which avoids the SettingWithCopyWarning raised by
# setting columns on the de-duplicated slice
all_bios_df_unique = all_bios_df_unique.assign(social=0, environmental=0)

# tokenize the entire bio dataset with the current max_token_len
total_set = DirectorDataset(all_bios_df_unique, tokenizer, max_token_len=max_token_len)

# the attention_mask holds a 1 for every real token and a 0 for every padding
# token, so its sum is the tokenized length of the biography; unlike searching
# for the first 0 this also works for a bio that fills the whole window and
# therefore has no padding at all
all_len = [int(item['attention_mask'].sum()) for item in total_set]

# length of the longest tokenized biography (0 if the dataset is empty)
largest_end_token = max(all_len, default=0)

# print the end_token of the longest biography
print('Longest token is', largest_end_token)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Longest token is 1765


In [None]:
# check how many samples are longer than a certain threshold
# NOTE: this rebinds max_token_len, which the tokenization cell above reads
# (it is set to 4096 there), so that cell should not be re-run after this one
# without resetting the value
max_token_len = 1700
all_len_df = pd.DataFrame(data=all_len, columns=['len'])
# plain integer count instead of a one-element Series, so the '%.2f' formatting
# does not depend on the implicit Series-to-float conversion
absolute_above = int((all_len_df['len'] > max_token_len).sum())
print('%.2f' % (absolute_above / all_bios_df_unique.shape[0] * 100), '% are longer than', max_token_len, 'tokens')
print(absolute_above, ' biographies are longer than', max_token_len, 'tokens')


0.02 % are longer than 1700 tokens
1  biographies are longer than 1700 tokens
