### Imports

In [None]:
import pandas as pd
import os
import requests
from time import sleep
import re
from bs4 import BeautifulSoup
from tqdm import tqdm

### Regex Function Defined 

In [2]:
def NEAR_regex(list_of_words,max_words_between=5,partial=False,cases_matter=False):
    r'''
    Build a regex that matches text where all the given words appear near
    each other, in any order.

    Parameters
    ----------
    list_of_words : list
        A list of "words", each element is a string.

        The returned regex looks for places where word1 is near word2, or
        word2 is near word1. It works with more than two words: one
        alternative is generated for every ordering (permutation) of the
        list, so the pattern grows factorially with the number of words.

        Each element may itself be a regex fragment, e.g. '(risk|risks)'.

    max_words_between : int, optional
        How many "words" are allowed between consecutive words from
        list_of_words. The default is 5, but you should consider this
        carefully.

        "Words" in between are chunks of non-space characters:
        "DON don don- don12 2454" is 5 words.

        Matches never span a line break ("\n" or "\r"), in either mode.

    partial : Boolean, optional
        If True, will accept longer words than you give. For example, if one
        word in your list is "how", it will match "howdy". Only trailing
        ASCII letters are allowed as the extension. Partial makes more sense
        with longer words.
        The default is False.

    cases_matter : Boolean, optional but IMPORTANT
        If True, the words are used exactly as given, so "Hi" will not catch
        "hi".

        If False, every word is lowercased before the regex is built, so the
        regex only works if the string being searched is all lowercase.

        The default is False.

    Warning / Feature
    -------
    Words are inserted into the pattern verbatim (they are NOT escaped).
    That is what allows alternation groups such as '(risk|risks)', but it
    also means regex metacharacters in a word are interpreted as regex.
    With cases_matter=False the lowercasing is applied to the whole
    fragment, so an escape such as \W would silently become \w.

    Parenthesised groups in a word are capturing groups, so re.findall
    returns tuples of groups rather than the matched text. len() of the
    result is still the number of matches.

    Unsure about speed
    -------
    This is not a very "fast" function, but it should be robust.

    Suggested use
    -------
    a_string_you_have = 'Jack and Jill went up the hill'

    # 1. define words and set up the regex
    words = ['jack','hill']
    rgx = NEAR_regex(words)

    # 2. convert the string to lowercase before searching!
    a_string_you_have = a_string_you_have.lower()

    # 3. len+findall+rgx = counts the number of times the word groups are close
    count = len(re.findall(rgx,a_string_you_have))
    print(count)

    Returns
    -------
    A string which is a regex that can be used to look for cases where all the
    input words are near each other.
    '''

    from itertools import permutations

    # Lowercase a copy of the words unless the caller wants case preserved;
    # the caller's list is never modified.
    if cases_matter:
        words = list(list_of_words)
    else:
        words = [w.lower() for w in list_of_words]

    # In partial mode every word may be followed by extra letters ("how" -> "howdy").
    tail = r'[A-Za-z]*' if partial else ''

    start = r'(?:\b'
    # Up to max_words_between space-separated chunks, none containing a line break.
    gap = tail + r'\b(?: +[^ \n\r]*){0,' + str(max_words_between) + r'} *\b'
    end = tail + r'\b)'

    # One alternative per ordering: start + word + gap (+ word + gap)... + end
    return '|'.join(start + gap.join(permu) + end for permu in permutations(words))

### Read in the csv located in the inputs folder

In [3]:
# Load the S&P 500 firm table from the inputs folder
sp500 = pd.read_csv(filepath_or_buffer='inputs/sp500_with_url.csv')

### RISKS

supply chain risk version 1
- large gap between words (10,000)
- result: good results

supply chain risk version 2
- small gap between words (1,000)
- more topic words
- result: great results!

supply chain risk version 3
- large gap between words (10,000)
- unhelpful topic and risk words 
- result: many inaccurate results

In [4]:


# Every risk measure, keyed by the output column it fills. Each entry holds:
#   words             - [topic-word group, risk-word group] passed to NEAR_regex
#   max_words_between - how far apart the two groups may be
#   partial           - whether longer words (e.g. "risky") also count
risk_measures = {
    # RISK1: SUPPLY CHAIN (1st measure - avg)
    'RISK1: supply chain': {
        'words': ['(supply chain|supply|production|materials|capacity|inventory)',
                  '(risk|risks|concern|concerns)'],
        'max_words_between': 10000,
        'partial': True,
    },
    # RISK1: SUPPLY CHAIN (2nd measure - good): smaller gap, more topic words
    'RISK1: supply chain (2)': {
        'words': ['(supply chain|supply|production|materials|capacity|inventory|product|products)',
                  '(risk|risks|bad|cautious|worry|concern|concerns)'],
        'max_words_between': 1000,
        'partial': False,
    },
    # RISK1: SUPPLY CHAIN (3rd measure - bad): deliberately unhelpful words
    'RISK1: supply chain (3)': {
        'words': ['(supply chain|supply|production|materials|has|and|it)',
                  '(risk|risks|concern|concerns|bad|cautious|worry|found|in|at)'],
        'max_words_between': 10000,
        'partial': False,
    },
    # RISK2: LITIGATION
    'RISK2: Litigation': {
        'words': ['(litigation|law|legal|lawsuit|lawsuits|class action|sue|sues|fine|fines|fined|pending)',
                  '(risk|risks|concern|concerns)'],
        'max_words_between': 1000,
        'partial': True,
    },
    # RISK3: INFLATION (NEAR_regex lowercases the words, so 'USD' is searched as 'usd')
    'RISK3: Inflation': {
        'words': ['(inflation|inflationary|economy|hyperinflation|deflation|inflate|inflated|dollar|USD)',
                  '(risk|risks|concern|concerns)'],
        'max_words_between': 1000,
        'partial': True,
    },
}

# The patterns do not depend on the firm, so build and compile them once
# instead of once per firm inside the loop.
risk_patterns = {
    column: re.compile(NEAR_regex(spec['words'],
                                  max_words_between=spec['max_words_between'],
                                  partial=spec['partial']))
    for column, spec in risk_measures.items()
}

for index, row in tqdm(sp500.iterrows(),total=len(sp500)): # loops through dataframe
    location = 'text_files/' + row['Symbol'] + '.html' # downloaded HTML file for this firm
    if not os.path.isfile(location):
        continue # no file for this firm: its risk columns stay missing (NaN)
    with open(location, 'r') as file:
        html = file.read()

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever parser
    # is installed; pin one (e.g. 'html.parser') if results must be reproducible
    # across machines.
    lower = BeautifulSoup(html).get_text().lower() # visible text only, lowercased for NEAR_regex
    no_punc = re.sub(r'\W',' ',lower) # keep only letters, digits and underscores
    cleaned = re.sub(r'\s+',' ',no_punc).strip() # collapse runs of whitespace into single spaces

    # Count the hits for each measure and store them in this firm's row only
    for column, pattern in risk_patterns.items():
        sp500.loc[index,column] = len(pattern.findall(cleaned))
    
#sp500.head(10)


100%|██████████| 505/505 [01:18<00:00,  6.44it/s]


### Summary Statistics

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of sp500
sp500.describe()

Unnamed: 0,CIK,RISK1: supply chain,RISK1: supply chain (2),RISK1: supply chain (3),RISK2: Litigation,RISK3: Inflation
count,505.0,504.0,504.0,504.0,504.0,504.0
mean,773118.8,0.339286,0.704365,1.123016,0.670635,0.396825
std,550264.2,0.57271,1.288387,0.423873,1.313128,0.795908
min,1800.0,0.0,0.0,1.0,0.0,0.0
25%,93751.0,0.0,0.0,1.0,0.0,0.0
50%,875320.0,0.0,0.0,1.0,0.0,0.0
75%,1132979.0,1.0,1.0,1.0,1.0,1.0
max,1792044.0,4.0,11.0,5.0,10.0,6.0


Necessary steps:

- read in the data file
- merge it with the dataframe that includes our new risk variables

In [15]:
#add 2019 accounting data
# 2019 accounting data (Stata file hosted on GitHub)
url = 'https://github.com/LeDataSciFi/ledatascifi-2021/blob/main/data/2019%20ccm_cleaned.dta?raw=true'
acct_2019 = pd.read_stata(url)

# Left-merge onto the sp500 rows, matching Symbol to tic.
# validate raises if either key is duplicated; indicator adds a _merge column
# showing which rows found an accounting match.
sp500_with_acct = sp500.merge(
    acct_2019,
    how='left',
    left_on='Symbol',
    right_on='tic',
    indicator=True,
    validate='one_to_one',
)

Necessary steps:
- create output folder
- save merged dataset to a csv within the output folder 

In [17]:
# Make sure the output folder exists (no error if it is already there)
os.makedirs('output', exist_ok=True)

# Write the merged dataframe to csv, leaving out the row index
sp500_with_acct.to_csv(
    'output/sp500_accting_plus_textrisks.csv',
    index=False,
)