In [48]:
import numpy as np
import pandas as pd

from time import gmtime, strftime
import time
import os
from tqdm import tqdm
import pickle
from scipy.stats import linregress
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

from utils.crawler import Scrape10K, Scrape10Q, ConvertHTML
from utils.preprocessing import get_texts, stop_words

In [52]:
def get_cik(ticker):
    """ Get the cik for the ticker specified by the input argument 
    Input:
        ticker(str): ticker of the company e.g. "FB"
    """
    return ticker_library[ticker_library.ticker == ticker].secfilings.values[0][-10:]

def get_ciks(tickers):
    ciks = []

    for ticker in tickers:
        ciks.append(get_cik(ticker))

    return ciks

In [5]:
sector = "Energy"
# sector = "Energy"

# score_type = "governanceScore"
score_type = "environmentScore"

In [6]:
df_esg_score = pd.read_excel("data/esg_score.xlsx", sheet_name = "data")

In [7]:
tickers = df_esg_score[df_esg_score["sector"] == sector]["Company"]

In [10]:
esgs = df_esg_score[df_esg_score["sector"] == sector][["Company", "socialScore", "governanceScore", "environmentScore"]]

In [11]:
esgs.head()

Unnamed: 0,Company,socialScore,governanceScore,environmentScore
1,COG,14.01,9.28,23.39
3,MRO,10.27,8.7,23.76
4,CVX,10.67,10.21,20.29
7,EOG,11.06,8.24,19.67
8,APA,8.88,7.96,21.98


In [12]:
print("Number of companies in Energy sector: {}".format(len(tickers)))

Number of companies in Energy sector: 20


In [13]:
ticker_library = pd.read_csv(os.path.join("data", "tickers.csv"))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
ciks = []
for ticker in tickers:    
    try:
        # for a given ticker, find its cik number through th ticker library
        ciks.append(ticker_library[ticker_library.ticker == ticker].secfilings.values[0][-10:])
    except:
        # if could not find cik, give it a empty cik
        ciks.append('')

# Analysis based on Energy sector

## Scrape Energy sector

In [8]:
browse_url_base_10k = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-K'
filing_url_base = 'http://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_base = 'http://www.sec.gov/Archives/edgar/data/%s/%s/%s'

browse_url_base_10q = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-Q&count=1000'

# Initialize log file
# (log file name = the time we initiate scraping session)
t = strftime("%Y_%m_%d_%H_%M_%S", gmtime())
log_file_name = t + ".txt"
log_file_path = os.path.join("log", log_file_name)

with open(log_file_path, 'a') as log_file:
    log_file.close()

# Iterate over CIKs and scrape 10-Ks
for cik in tqdm(ciks):
    time.sleep(5)
    Scrape10K(browse_url_base=browse_url_base_10k, 
          filing_url_base=filing_url_base, 
          doc_url_base=doc_url_base, 
          cik=cik,
          log_file_name=log_file_path)

    time.sleep(5)
    Scrape10Q(browse_url_base = browse_url_base_10q, 
          filing_url_base = filing_url_base, 
          doc_url_base = doc_url_base, 
          cik = cik,
          log_file_name = log_file_path)
    

#return to the main menu

  0%|          | 0/20 [00:00<?, ?it/s]

Already scraped CIK 0000858470


  5%|▌         | 1/20 [00:10<03:10, 10.00s/it]

Already scraped CIK 0000858470
Already scraped CIK 0000101778


  5%|▌         | 1/20 [00:19<06:07, 19.35s/it]


KeyboardInterrupt: 

## Transform from HTML to txt

In [9]:
# For 10-Ks...
# -*- coding: utf-8 -*-
dir_10k = os.path.join("data", "10k")
dir_10q = os.path.join("data", "10q")

# Iterate over CIKs and clean HTML filings
for cik in tqdm(energy_cik):
    ConvertHTML(cik, dir_10k)

# For 10-Qs...

# Iterate over CIKs and clean HTML filings
for cik in tqdm(energy_cik):
    ConvertHTML(cik, dir_10q)

100%|██████████| 20/20 [12:47<00:00, 38.39s/it]
100%|██████████| 20/20 [13:43<00:00, 41.16s/it]


## Get the preprocessed texts

In [16]:
ret = get_texts(ciks, tickers)
docs = ret["docs"]

20it [00:08,  2.36it/s]


# Regression of E, S, G score on indicator function(occurence of word_i)

Company - word table: 1 if word occurs in a certain doc, else 0

In [11]:
energy_dictionary = list()
for ticker in energy_tickers.values:
    energy_dictionary.extend(counters[ticker].keys())

In [12]:
energy_dictionary = set(energy_dictionary)
print("Number of distinct words: {}".format(len(energy_dictionary)))

Number of distinct words: 35938


In [14]:
energy_dictionary.remove('')

In [15]:
word_count = defaultdict(int)
for word in energy_dictionary:
    cnt = 0
    for ticker in energy_tickers:
        cnt += counters[ticker].get(word, 0)
    word_count[word] += cnt
    
top_k_word = [k for k, v in sorted(word_count.items(), key=lambda item: item[1], reverse=True)][:5000]

In [17]:
df_ind = pd.DataFrame(columns=["social_beta", "governance_beta", "environment_beta"], index = top_k_word)


In [19]:
for typ in ["social", "governance", "environment"]:
    score = esg_energy["{}Score".format(typ)]
    slopes = []
    for word in top_k_word:
        inds = []
        for ticker in energy_tickers:
            if word in counters[ticker]:
                inds.append(1)
            else:
                inds.append(0)
        slope, intercept, *_ = linregress(inds, score)
        slopes.append(slope)
    df_ind["{}_beta".format(typ)] = slopes

  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


In [133]:
df_ind = df_ind.sort_values(by=["social_beta", "governance_beta", "environment_beta"], ascending=False)

In [136]:
df_ind.head()

Unnamed: 0,social_beta,governance_beta,environment_beta
cog,4.987368,2.422632,7.706842
dinge,4.987368,2.422632,7.706842
schroeder,4.987368,2.422632,7.706842
extraordinary,3.39451,1.61,3.300196
insignificant,3.39451,1.61,3.300196


In [None]:
df_ind.to_csv(os.path.join("data", "reg_result_ind.csv"))

In [22]:
with open("dict_lemmatize.txt", 'w') as f:
    f.write('\n'.join(list(energy_dictionary)))

# Regression of E, S, G score on tfidf score for each word in each doc

In [28]:
cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=1000)
word_count_vector = cv.fit_transform(docs)



In [29]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [30]:
# top 10000 frequent words
feature_names = cv.get_feature_names()

In [31]:
print("Number of words: {}".format(len(feature_names)))

Number of words: 1000


In [32]:
df_doc_word = pd.DataFrame(columns=feature_names, index=tickers)

for i, ticker in tqdm(enumerate(tickers)):
    tf_idf_vector = tfidf_transformer.transform(cv.transform([docs[i]]))
    
    coo_matrix = tf_idf_vector.tocoo()
    # coo_matrix: A sparse matrix in which coo_matrix.col stores word_idx, coo_matrix.data stores tfidf score
    
    tuples = zip(coo_matrix.col, coo_matrix.data)
    for word_idx, tfidf in tuples:
        df_doc_word.at[ticker, feature_names[word_idx]] = tfidf

20it [00:20,  1.00s/it]


In [33]:
df_doc_word = df_doc_word.fillna(0)

In [37]:
df_tfidf = pd.DataFrame(columns=["social_beta", "governance_beta", "environment_beta"], index=feature_names)

In [44]:
for typ in ["social", "governance", "environment"]:
    score = esgs["{}Score".format(typ)]
    slopes = []
    for word in feature_names:
        tfidfs = df_doc_word[word].values.astype(float)
        slope, intercept, *_ = linregress(tfidfs, score)
        slopes.append(slope)
    df_tfidf["{}_beta".format(typ)] = slopes

In [49]:
cols = df_tfidf.columns
alpha = 0.3

for col in cols: 
    betas = df_tfidf[col]
    score_type = col.split('_')[0]
    
    upper_score = np.quantile(betas, 1 - alpha)
    lower_score = np.quantile(betas, alpha)
    is_good = np.where(betas < lower_score, 1, 0) + np.where(betas > upper_score, -1, 0)
    
    df_tfidf["{}_isGood".format(score_type)] = is_good

In [50]:
df_tfidf

Unnamed: 0,social_beta,governance_beta,environment_beta,social_isGood,governance_isGood,environment_isGood
accrues,172.623327,140.958635,11.321078,-1,-1,0
accurate,192.255982,192.965112,966.213646,-1,-1,-1
acmp,-5.850305,-29.395448,-55.146124,0,1,0
acre,20.183284,40.249319,61.245858,0,0,0
ad,22.084285,54.240683,58.230568,0,-1,0
...,...,...,...,...,...,...
wrb,-5.450129,-6.907001,17.743960,0,0,0
xom,4.959642,4.898837,3.334635,0,0,0
xto,17.963178,16.899726,15.852558,0,0,0
yemen,121.401300,41.949405,164.956601,-1,0,-1


Use just the top 5000 words, same as the 5000 words in indicator mode

In [123]:
# word_to_drop = list(set(feature_names) - set(top_k_word))

In [131]:
df_tfidf = df_tfidf.sort_values(by=["social_beta", "governance_beta", "environment_beta"], ascending=False)

In [132]:
df_tfidf.drop(word_to_drop).to_csv(os.path.join("data", "reg_result_tfidf.csv"))

In [8]:
df_tfidf = pd.read_csv("data/reg_result_tfidf.csv", index_col=0)

In [10]:
# "governance_beta", "environment_beta"
df_tfidf.sort_values(by=["social_beta"], ascending=False)

Unnamed: 0,social_beta,governance_beta,environment_beta
slower,1894.911650,234.641118,-1537.506149
registeredcommon,1526.757749,1574.837128,3806.794715
typical,1399.729713,1070.139605,388.165570
prevented,1314.462624,1402.248340,-597.380044
contributory,1259.663568,450.586049,2730.406128
...,...,...,...
consultation,-936.066218,-1360.415248,-1630.210797
liquidating,-960.701299,-960.136485,-745.137718
distinct,-982.509796,-711.681103,-1652.224372
posting,-1046.251675,-644.540777,-1355.851198


## Model Wrapup

In [55]:
not_listed = ["BBWI"]
sectors = ['Consumer Cyclical', 'Energy', 'Industrials', 'Healthcare',
       'Basic Materials', 'Consumer Defensive', 'Utilities', 'Technology',
       'Financial Services', 'Communication Services', 'Real Estate']

In [61]:
sector = "Financial Services"
# sector = "Energy"

df_esg_score = pd.read_excel("data/esg_score.xlsx", sheet_name = "data")
esgs = df_esg_score[df_esg_score["sector"] == sector][["Company", "socialScore", "governanceScore", "environmentScore"]]

tickers = list(esgs["Company"])
for t in not_listed:
    if t in tickers:
        tickers.remove(t)
        esgs = esgs.drop(esgs[esgs["Company"] == t].index)

ticker_library = pd.read_csv(os.path.join("data", "tickers.csv"))

ciks = get_ciks(tickers)

ret = get_texts(ciks, tickers)
docs = ret["docs"]

cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=1000)
word_count_vector = cv.fit_transform(docs)

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

feature_names = cv.get_feature_names()

df_doc_word = pd.DataFrame(columns=feature_names, index=tickers)

for i, ticker in tqdm(enumerate(tickers)):
    tf_idf_vector = tfidf_transformer.transform(cv.transform([docs[i]]))
    
    coo_matrix = tf_idf_vector.tocoo()
    # coo_matrix: A sparse matrix in which coo_matrix.col stores word_idx, coo_matrix.data stores tfidf score
    
    tuples = zip(coo_matrix.col, coo_matrix.data)
    for word_idx, tfidf in tuples:
        df_doc_word.at[ticker, feature_names[word_idx]] = tfidf

df_doc_word = df_doc_word.fillna(0)

df_tfidf = pd.DataFrame(columns=["social_beta", "governance_beta", "environment_beta"], index=feature_names)

for typ in ["social", "governance", "environment"]:
    score = esgs["{}Score".format(typ)]
    slopes = []
    for word in feature_names:
        tfidfs = df_doc_word[word].values.astype(float)
        slope, intercept, *_ = linregress(tfidfs, score)
        slopes.append(slope)
    df_tfidf["{}_beta".format(typ)] = slopes

cols = df_tfidf.columns
alpha = 0.3

for col in cols: 
    betas = df_tfidf[col]
    score_type = col.split('_')[0]
    
    upper_score = np.quantile(betas, 1 - alpha)
    lower_score = np.quantile(betas, alpha)
    is_good = np.where(betas < lower_score, 1, 0) + np.where(betas > upper_score, -1, 0)
    
    df_tfidf["{}_isGood".format(score_type)] = is_good

df_tfidf.to_csv("data/tfidf_scores/{}.csv".format(sector))

3it [00:01,  2.15it/s]

Scraping CIK 0000019617


100%|██████████| 1/1 [00:08<00:00,  8.49s/it]


Scraping CIK 0000019617


100%|██████████| 1/1 [00:11<00:00, 11.34s/it]
13it [01:29,  3.07s/it]

Scraping CIK 0000914208


100%|██████████| 1/1 [00:06<00:00,  6.23s/it]


Scraping CIK 0000914208


100%|██████████| 1/1 [00:08<00:00,  8.45s/it]
30it [02:26,  1.20it/s]

Scraping CIK 0000759944


100%|██████████| 1/1 [00:06<00:00,  6.92s/it]


Scraping CIK 0000759944


100%|██████████| 1/1 [00:10<00:00, 10.30s/it]
35it [03:16,  4.63s/it]

Scraping CIK 0000320335


100%|██████████| 1/1 [00:06<00:00,  6.95s/it]


Scraping CIK 0000320335


100%|██████████| 1/1 [00:10<00:00, 10.37s/it]
37it [04:03, 12.65s/it]

Scraping CIK 0000899051


100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Scraping CIK 0000899051


100%|██████████| 1/1 [00:09<00:00,  9.19s/it]
38it [04:57, 24.74s/it]

Scraping CIK 0001126328


100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


Scraping CIK 0001126328


100%|██████████| 1/1 [00:08<00:00,  8.50s/it]
44it [06:02,  8.71s/it]

Scraping CIK 0001374310


100%|██████████| 1/1 [00:06<00:00,  6.99s/it]


Scraping CIK 0001374310


100%|██████████| 1/1 [00:09<00:00,  9.59s/it]
64it [07:12,  6.75s/it]
64it [00:19,  3.21it/s]


## Appendix

In [91]:
import shutil

In [None]:
for cik in energy_cik[1:]:
    pkl_path = os.path.join("data", "10k", cik, "pickle")
    shutil.rmtree(pkl_path)
    pkl_path = os.path.join("data", "10q", cik, "pickle")
    shutil.rmtree(pkl_path)