In [2]:
# import our libraries
from fake_useragent import UserAgent
import requests
import pandas as pd
from bs4 import BeautifulSoup
import nltk
import datetime as dt
import sys
import PyPDF2
import io
import os
import re
import openpyxl

# Base URL of the SEC website (scheme and host only, no trailing slash)
base = r"https://www.sec.gov"

In [50]:
# Search SEC Edgar according to cik, doc_type, and date
def edgar_search_filings(cik, doc, dateb):
    """Query the EDGAR company browser and return the parsed result page.

    Parameters
    ----------
    cik : value sent as the ``CIK`` query parameter.
    doc : value sent as the ``type`` query parameter.
    dateb : value sent as the ``dateb`` query parameter.

    Returns
    -------
    The response body parsed by BeautifulSoup with ``'html.parser'``.
    """
    # Endpoint = site root + EDGAR browse path
    endpoint = r"https://www.sec.gov" + r"/cgi-bin/browse-edgar"

    # Present a Chrome user-agent string produced by fake_useragent
    request_headers = {'User-Agent': str(UserAgent().chrome)}

    # Query string: company lookup, at most 100 rows per page
    query = dict(
        action='getcompany',
        CIK=cik,
        type=doc,
        dateb=dateb,
        owner='exclude',
        start='',
        output='',
        count='100',
    )

    # NOTE(review): no timeout is passed, so a stalled connection blocks here.
    page = requests.get(url=endpoint, params=query, headers=request_headers)
    return BeautifulSoup(page.content, 'html.parser')

In [51]:
# Get the filing urls
def get_filing_urls(soup):
    """Collect absolute URLs of the filing index pages listed on a search page.

    Parameters
    ----------
    soup : parsed EDGAR search-result page (as returned by
        ``edgar_search_filings``).

    Returns
    -------
    list of str
        ``https://www.sec.gov`` + the ``href`` of every ``<a id="documentsbutton">``
        inside the ``tableFile2`` table, in page order. An empty list is
        returned when the page has no such table (e.g. the search produced
        no results) instead of failing on ``None``.
    """
    # Base URL
    base = r"https://www.sec.gov"

    table = soup.find('table', class_='tableFile2')
    if table is None:
        # No results table on the page: nothing to follow.
        return []

    return [base + document['href']
            for document in table.find_all('a', id='documentsbutton')]

In [52]:
# Basic function to fetch an SEC page given a URL and return a BS4 object
def sec_search(url):
    """Download ``url`` and return its body parsed by BeautifulSoup.

    The request carries a Chrome user-agent string produced by
    fake_useragent; the body is parsed with ``'html.parser'``.
    """
    request_headers = {'User-Agent': str(UserAgent().chrome)}

    # NOTE(review): no timeout is passed, so a stalled connection blocks here.
    page = requests.get(url=url, headers=request_headers)
    return BeautifulSoup(page.content, 'html.parser')

In [53]:
# Function to get the URL of the document we are searching for
def get_doc(soup, doc):
    """Return the absolute URL of the requested document on a filing index page.

    Scans the data rows of the ``tableFile`` table whose summary is
    'Document Format Files'. In each row the 4th cell holds the document
    type and the 3rd cell holds the link.

    Parameters
    ----------
    soup : parsed filing index page.
    doc : str
        Document type to look for. Matching is a case-insensitive substring
        test against the row's type text.

    Returns
    -------
    str or None
        ``https://www.sec.gov`` + href of the FIRST matching row that has a
        link, or ``None`` when the table is missing or no row matches.
        (Previously the loop kept going and the last match won, so a later
        row carrying the same type could replace the first one.)
    """
    # Base URL
    base = r"https://www.sec.gov"

    table = soup.find('table', class_='tableFile', summary='Document Format Files')
    if table is None:
        return None

    wanted = doc.lower()

    # Skip the header row
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')

        # Rows too short to carry both a link cell and a type cell are ignored
        if len(data) < 4:
            continue

        # Check the type first so unrelated rows without a link cannot fail
        if wanted not in data[3].get_text().lower():
            continue

        anchor = data[2].find('a')
        if anchor is None or not anchor.get('href'):
            continue

        return base + anchor['href']

    # No row of the requested type carried a link
    return None

In [54]:
# Function to get the date associated with the document filing
def get_date(soup):
    """Parse the date shown on a filing index page.

    Reads the text of the first ``div.info`` inside the second
    ``div.formGrouping`` of ``soup`` and converts it with ``pd.to_datetime``.

    NOTE(review): which date that element holds (filing date vs. period of
    report) depends on the page layout, which is not visible here -- confirm.
    """
    groupings = soup.find_all('div', class_='formGrouping')
    info_div = groupings[1].find('div', class_='info')
    return pd.to_datetime(info_div.get_text())

In [55]:
# Function that scrapes all documents associated with a company, and the dates associated with those documents
def company_filing_search(cik, doc, dateb):
    """Collect document URLs and their dates for one company and form type.

    Runs the EDGAR search, visits every filing index page it lists, and for
    each page that contains a document of type ``doc`` records the document
    URL and the date read from that page.

    Returns
    -------
    (documents, dates) : two parallel lists; pages without a matching
        document are skipped in both.
    """
    listing_soup = edgar_search_filings(cik, doc, dateb)
    index_urls = get_filing_urls(listing_soup)

    documents, dates = [], []

    for index_url in index_urls:
        # Parsed (BS4) filing index page
        filing_soup = sec_search(index_url)

        doc_url = get_doc(filing_soup, doc)
        if doc_url is None:
            # This filing has no document of the requested type
            continue

        # Inline-XBRL viewer links ("/ix?doc=/Archives/...") are reduced to the
        # plain archive URL; the replace is a no-op for any other link.
        documents.append(doc_url.replace("ix?doc=/", ""))
        dates.append(get_date(filing_soup))

    return documents, dates
        

In [56]:
# From notre dame study
# code they give to load and use their dictionary


def load_masterdictionary(file_path, print_flag=False, f_log=None, get_other=False):
    """Load the master dictionary CSV into ``MasterDictionary`` records.

    Parameters
    ----------
    file_path : path of the comma-separated dictionary file; the first line
        is consumed as a header.
    print_flag : when true, print loading progress and a summary.
    f_log : optional writable object with a ``name`` attribute; when truthy a
        summary of the call is written to it. Write errors are reported with
        ``print`` and otherwise ignored.
    get_other : when true, return the extra structures listed below.

    Returns
    -------
    ``_master_dictionary`` (word -> MasterDictionary) when ``get_other`` is
    false; otherwise the tuple ``(_master_dictionary, _md_header,
    _sentiment_categories, _sentiment_dictionaries, _stopwords,
    _total_documents)``.
    """
    started_at = dt.datetime.now()

    # One (initially empty) word dictionary per sentiment category
    _sentiment_categories = ['negative', 'positive', 'uncertainty', 'litigious',
                             'strong_modal', 'weak_modal', 'constraining']
    _sentiment_dictionaries = {category: dict() for category in _sentiment_categories}
    _master_dictionary = {}

    # Slightly modified common stopwords.
    # Dropped from traditional: A, I, S, T, DON, WILL, AGAINST
    # Added: AMONG
    _stopwords = ['ME', 'MY', 'MYSELF', 'WE', 'OUR', 'OURS', 'OURSELVES', 'YOU', 'YOUR', 'YOURS',
                  'YOURSELF', 'YOURSELVES', 'HE', 'HIM', 'HIS', 'HIMSELF', 'SHE', 'HER', 'HERS', 'HERSELF',
                  'IT', 'ITS', 'ITSELF', 'THEY', 'THEM', 'THEIR', 'THEIRS', 'THEMSELVES', 'WHAT', 'WHICH',
                  'WHO', 'WHOM', 'THIS', 'THAT', 'THESE', 'THOSE', 'AM', 'IS', 'ARE', 'WAS', 'WERE', 'BE',
                  'BEEN', 'BEING', 'HAVE', 'HAS', 'HAD', 'HAVING', 'DO', 'DOES', 'DID', 'DOING', 'AN',
                  'THE', 'AND', 'BUT', 'IF', 'OR', 'BECAUSE', 'AS', 'UNTIL', 'WHILE', 'OF', 'AT', 'BY',
                  'FOR', 'WITH', 'ABOUT', 'BETWEEN', 'INTO', 'THROUGH', 'DURING', 'BEFORE',
                  'AFTER', 'ABOVE', 'BELOW', 'TO', 'FROM', 'UP', 'DOWN', 'IN', 'OUT', 'ON', 'OFF', 'OVER',
                  'UNDER', 'AGAIN', 'FURTHER', 'THEN', 'ONCE', 'HERE', 'THERE', 'WHEN', 'WHERE', 'WHY',
                  'HOW', 'ALL', 'ANY', 'BOTH', 'EACH', 'FEW', 'MORE', 'MOST', 'OTHER', 'SOME', 'SUCH',
                  'NO', 'NOR', 'NOT', 'ONLY', 'OWN', 'SAME', 'SO', 'THAN', 'TOO', 'VERY', 'CAN',
                  'JUST', 'SHOULD', 'NOW', 'AMONG']

    _total_documents = 0
    with open(file_path) as handle:
        _md_header = handle.readline()  # first line is the header
        print()  # a blank line is emitted regardless of print_flag
        for raw_line in handle:
            fields = raw_line.rstrip('\n').split(',')
            # Capture the key before the constructor runs: it may rewrite
            # blank entries of `fields` in place.
            key = fields[0]
            entry = MasterDictionary(fields, _stopwords)
            _master_dictionary[key] = entry
            for category in _sentiment_categories:
                if getattr(entry, category):
                    _sentiment_dictionaries[category][key] = 0
            _total_documents += _master_dictionary[fields[0]].doc_count
            if print_flag and len(_master_dictionary) % 5000 == 0:
                print(f'\r ...Loading Master Dictionary {len(_master_dictionary):,}', end='', flush=True)

    if print_flag:
        print('\r', end='')  # clear line
        print(f'\nMaster Dictionary loaded from file:\n  {file_path}\n')
        print(f'  master_dictionary has {len(_master_dictionary):,} words.\n')

    if f_log:
        try:
            f_log.write('\n\n  FUNCTION: load_masterdictionary(file_path, print_flag, f_log, get_other)\n')
            f_log.write(f'\n    file_path  = {file_path}')
            f_log.write(f'\n    print_flag = {print_flag}')
            f_log.write(f'\n    f_log      = {f_log.name}')
            f_log.write(f'\n    get_other  = {get_other}')
            f_log.write(f'\n\n    {len(_master_dictionary):,} words loaded in master_dictionary.\n')
            f_log.write(f'\n    Sentiment:')
            for category in _sentiment_categories:
                f_log.write(f'\n      {category:13}: {len(_sentiment_dictionaries[category]):8,}')
            f_log.write(f'\n\n  END FUNCTION: load_masterdictionary: {(dt.datetime.now()-started_at)}')
        except Exception as e:
            print('Log file in load_masterdictionary is not available for writing')
            print(f'Error = {e}')

    if not get_other:
        return _master_dictionary
    return (_master_dictionary, _md_header, _sentiment_categories,
            _sentiment_dictionaries, _stopwords, _total_documents)


class MasterDictionary:
    """One row of the master dictionary file.

    Parameters
    ----------
    cols : list of str
        The 16 fields of a row: word, sequence number, word count, word
        proportion, average proportion, std dev of proportion, doc count,
        the seven sentiment flags (negative, positive, uncertainty,
        litigious, strong_modal, weak_modal, constraining), syllables and
        source. Blank fields are replaced by ``'0'`` IN PLACE, i.e. the
        caller's list is modified, as it always was.
    _stopwords : collection of upper-case words; sets ``self.stopword``.

    Raises
    ------
    ValueError
        When the row is too short or a numeric field cannot be parsed.
        (The previous behaviour was to print a message and call ``quit()``,
        which ends the interpreter session on a single bad row.)
    """

    def __init__(self, cols, _stopwords):
        # Treat blank fields as zero (in-place, see class docstring)
        for ptr, col in enumerate(cols):
            if col == '':
                cols[ptr] = '0'
        try:
            self.word = cols[0].upper()
            self.sequence_number = int(cols[1])
            self.word_count = int(cols[2])
            self.word_proportion = float(cols[3])
            self.average_proportion = float(cols[4])
            self.std_dev_prop = float(cols[5])
            self.doc_count = int(cols[6])
            self.negative = int(cols[7])
            self.positive = int(cols[8])
            self.uncertainty = int(cols[9])
            self.litigious = int(cols[10])
            self.strong_modal = int(cols[11])
            self.weak_modal = int(cols[12])
            self.constraining = int(cols[13])
            self.syllables = int(cols[14])
            self.source = cols[15]
        except (IndexError, TypeError, ValueError) as err:
            # Show only the leading fields; slicing is safe on short rows
            raise ValueError(
                f'ERROR in class MasterDictionary: cannot parse row starting {cols[:2]!r}'
            ) from err
        self.stopword = self.word in _stopwords

# Load the master dictionary once for the whole notebook. The names bound here
# are module-level; `sentiment_dictionaries` is read by
# create_sentiment_sets_counts().
start = dt.datetime.now()
print(f'\n\n{start.strftime("%c")}\nPROGRAM NAME: {sys.argv[0]}\n')
md = (r'LoughranMcDonald_MasterDictionary_2021.csv')
# Raw string: the previous literal relied on the invalid escapes '\T' and '\L'
# being kept as-is. The `with` block closes (and therefore flushes) the log.
# NOTE(review): hard-coded Windows-style path; on POSIX systems this is a
# single file name in the working directory -- consider making it configurable.
with open(r'D:\Temp\Load_MD_Logfile.txt', 'w') as f_log:
    master_dictionary, md_header, sentiment_categories, sentiment_dictionaries, stopwords, total_documents = \
        load_masterdictionary(md, True, f_log, True)
print(f'\n\nRuntime: {(dt.datetime.now()-start)}')
print(f'\nNormal termination.\n{dt.datetime.now().strftime("%c")}\n')



Wed Nov 22 13:23:28 2023
PROGRAM NAME: /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/ipykernel_launcher.py


 ...Loading Master Dictionary 85,000
Master Dictionary loaded from file:
  LoughranMcDonald_MasterDictionary_2021.csv

  master_dictionary has 86,531 words.



Runtime: 0:00:03.503234

Normal termination.
Wed Nov 22 13:23:32 2023



In [57]:
# Function that scrapes text from a file, whether it is html, txt, or pdf
def get_text(file):
    """Download a document and return its text content.

    Parameters
    ----------
    file : str
        URL of the document. URLs whose last three characters are ``pdf``
        (any case) are read with PyPDF2; everything else is parsed as
        HTML/text with BeautifulSoup.

    Returns
    -------
    str
        For PDFs, every extracted line followed by a single space; otherwise
        ``soup.get_text()``. If extraction fails, a message naming the file
        is printed and ``""`` is returned (best effort, so one unreadable
        document does not stop a batch).
    """
    # Get file
    ua = UserAgent()
    chrome_agent = {'User-Agent': str(ua.chrome)}
    response = requests.get(url=file, headers=chrome_agent)

    try:
        # If PDF
        if file[-3:].lower() == 'pdf':
            # The in-memory buffer is closed by the context manager. The old
            # code called close() on an undefined name, which raised a
            # NameError that the bare except turned into "" for every PDF.
            with io.BytesIO(response.content) as buffer:
                pdf_reader = PyPDF2.PdfReader(buffer)
                return ''.join(
                    sentence + ' '
                    for page in pdf_reader.pages
                    # guard against pages for which no text is returned
                    for sentence in (page.extract_text() or '').split('\n')
                )

        # If HTML or TXT
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()

    except Exception as err:
        # Narrower than a bare except: KeyboardInterrupt still propagates
        print(f"Could not extract text from {file}: {err}")
        return ""

In [58]:
## Create word sets and counts
def create_sentiment_sets_counts():
    """Build fresh per-category word sets and zeroed counters.

    Reads the module-level ``sentiment_dictionaries`` and lower-cases every
    word it holds for each category.

    Returns
    -------
    (sentiment_sets, sentiment_counts)
        ``sentiment_sets`` maps each category name to a set of lower-case
        words; ``sentiment_counts`` maps each category name to ``0``.
    """
    categories = (
        "negative",
        "positive",
        "uncertainty",
        "litigious",
        "strong_modal",
        "weak_modal",
        "constraining",
    )

    sentiment_sets = {
        category: {word.lower() for word in sentiment_dictionaries[category].keys()}
        for category in categories
    }
    sentiment_counts = dict.fromkeys(categories, 0)

    return sentiment_sets, sentiment_counts

In [59]:
# Function that gets sentiment scores
def get_sentiment_scores(filing, normalized=False):
    """Count sentiment-category words in one filing.

    Parameters
    ----------
    filing : URL of the filing; its text is obtained with ``get_text``.
    normalized : when true, each count is divided by the number of tokens.

    Returns
    -------
    dict
        Category name -> count (or count / token count when ``normalized``).
        When the text yields no tokens at all -- empty OR whitespace-only --
        the divisor is 1, so the result is all zeros instead of a
        ZeroDivisionError.
    """
    # Get the text from a filing
    text = get_text(filing)

    # Get the words from the text
    words = nltk.word_tokenize(text)

    # Create new sentiment sets and counts
    sentiment_sets, sentiment_counts = create_sentiment_sets_counts()

    # Increment a category's count for every token found in its word set
    for word in words:
        token = word.lower()
        for sentiment, vocabulary in sentiment_sets.items():
            if token in vocabulary:
                sentiment_counts[sentiment] += 1

    # If we want to normalize the score
    if normalized:
        # Guard on the token count itself, not on text == ""
        total_count = len(words) or 1
        for sentiment in list(sentiment_counts):
            sentiment_counts[sentiment] = sentiment_counts[sentiment] / total_count

    return sentiment_counts

In [60]:
# Function that combines scraping and sentiment analysis
def company_reports_sentiment_analysis(cik, docs=['10-k', '10-q'], end_date='20231113'):
    """Scrape a company's filings and tabulate their sentiment word counts.

    Parameters
    ----------
    cik : company identifier passed on to ``company_filing_search``.
    docs : form types to process, in order.
    end_date : passed on to ``company_filing_search`` as its date argument.

    Returns
    -------
    pandas.DataFrame
        One row per filing with columns ``date``, ``link`` and the seven
        sentiment categories. The values are whatever
        ``get_sentiment_scores(filing)`` returns with its default arguments.
    """
    sentiment_columns = (
        "negative",
        "positive",
        "uncertainty",
        "litigious",
        "strong_modal",
        "weak_modal",
        "constraining",
    )

    # Column name -> list of values; key order fixes the DataFrame column order
    collected = {column: [] for column in ("date", "link") + sentiment_columns}

    for doc in docs:

        # Scrape the documents and dates
        print("Scraping " + doc + "s")
        company_filings, filing_dates = company_filing_search(cik, doc, end_date)

        # Score every filing of this form type
        print("Analyzing " + doc + "s sentiment")
        for filing in company_filings:
            print(filing)
            scores = get_sentiment_scores(filing)
            for column in sentiment_columns:
                collected[column].append(scores[column])

        # Dates and links are added per form type, after its scores
        collected['date'].extend(filing_dates)
        collected['link'].extend(company_filings)

    return pd.DataFrame(collected)

In [61]:
# Run the Sentiment Analysis for the 30 Companies in the Dow

# Load the CIKs of each company in the Dow into a dictionary (name -> CIK)
dow_ciks = pd.read_excel("Dow_ciks.xlsx", sheet_name='Sheet1')
cik_dict = dict(zip(list(dow_ciks['Name']), list(dow_ciks['CIK'])))

# Company name -> DataFrame of results. Start empty rather than from a copy of
# cik_dict: with the copy, any company not yet processed (for example after the
# loop is interrupted) was left mapped to its raw CIK instead of a DataFrame.
company_df_dict = {}

# For each company
for company in cik_dict:

    # Analyze all documents for that company
    print('-'*100)
    print("Getting data for " + company)
    cik = str(cik_dict[company])
    df = company_reports_sentiment_analysis(cik, docs=['10-k', '10-q'])

    # Tag every row with the company it belongs to
    df['Company'] = company
    df['CIK'] = cik

    # Update the dictionary of dfs
    company_df_dict[company] = df
    

----------------------------------------------------------------------------------------------------
Getting data for American Express Co
Scraping 10-ks
Analyzing 10-ks sentiment
https://www.sec.gov/Archives/edgar/data/4962/000000496223000006/axp-20221231.htm
https://www.sec.gov/Archives/edgar/data/4962/000000496222000008/axp-20211231.htm
https://www.sec.gov/Archives/edgar/data/4962/000000496221000013/axp-20201231.htm
https://www.sec.gov/Archives/edgar/data/4962/000000496220000030/axp-20191231.htm
https://www.sec.gov/Archives/edgar/data/4962/000000496219000018/axp201810k.htm
https://www.sec.gov/Archives/edgar/data/4962/000000496218000032/axp201710k.htm
https://www.sec.gov/Archives/edgar/data/4962/000119312517047588/d321397d10k.htm
https://www.sec.gov/Archives/edgar/data/4962/000119312516469798/d131774d10k.htm
https://www.sec.gov/Archives/edgar/data/4962/000119312515059931/d862737d10k.htm
https://www.sec.gov/Archives/edgar/data/4962/000119312514066777/d656045d10k.htm
https://www.sec.gov

https://www.sec.gov/Archives/edgar/data/4962/000000496200000037/
https://www.sec.gov/Archives/edgar/data/4962/000000496299000048/
https://www.sec.gov/Archives/edgar/data/4962/000000496299000034/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
https://www.sec.gov/Archives/edgar/data/4962/
-----------------------

https://www.sec.gov/Archives/edgar/data/318154/000110465906032813/a06-11194_110q.htm
https://www.sec.gov/Archives/edgar/data/318154/000095012905010807/v13945e10vq.htm
https://www.sec.gov/Archives/edgar/data/318154/000095012905007789/v11163e10vq.htm
https://www.sec.gov/Archives/edgar/data/318154/000095012905004666/v08553e10vq.htm
https://www.sec.gov/Archives/edgar/data/318154/000095012904008750/v03018e10vq.htm
https://www.sec.gov/Archives/edgar/data/318154/000095012904005694/v00588e10vq.htm
https://www.sec.gov/Archives/edgar/data/318154/000095012904002691/v98517e10vq.htm
https://www.sec.gov/Archives/edgar/data/318154/000119312503069500/d10q.htm
https://www.sec.gov/Archives/edgar/data/318154/000119312503026265/d10q.htm
https://www.sec.gov/Archives/edgar/data/318154/000089843003002755/d10q.htm
https://www.sec.gov/Archives/edgar/data/318154/000089843002003918/d10q.htm
https://www.sec.gov/Archives/edgar/data/318154/000089843002003028/d10q.htm
https://www.sec.gov/Archives/edgar/data/318154/0

https://www.sec.gov/Archives/edgar/data/320193/000119312511192493/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312511104388/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312511010144/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312510162840/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312510088957/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312510012085/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312509153165/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312509087629/d10qa.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312509085781/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312509009937/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312508156421/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312508097759/d10q.htm
https://www.sec.gov/Archives/edgar/data/320193/000119312508017426/d10q.htm
https://www.sec.gov/Arch

https://www.sec.gov/Archives/edgar/data/12927/000001292716000143/a201606jun3010-q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292716000113/a201603mar3110-q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292715000068/a201509sep3010-q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292715000055/a201506jun3010-q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292715000019/a201503mar3110q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292714000057/a201409sep3010q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292714000042/a201406jun3010q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292714000022/a201403mar3110q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292713000041/a201309sep3010q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292713000035/a201306jun3010q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292713000023/a201303mar3110q.htm
https://www.sec.gov/Archives/edgar/data/12927/000001292712000

https://www.sec.gov/Archives/edgar/data/18230/000095013199003526/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
https://www.sec.gov/Archives/edgar/data/18230/
Scraping 10-qs
Analyzing 10-qs sentiment
https://www.sec.gov/Archives/edgar/data/18230/000001823023000056/cat-20230930.htm
https://www.sec.gov/Archives/edgar/data/18230/000001823023000047/cat-20230630.htm
https://www.sec.gov/Archives/edgar/data/18230/000001823023000022/cat-20230331.htm
https://www.sec.gov/Archives/edgar/d

Object 0 0 not defined.


https://www.sec.gov/Archives/edgar/data/18230/000001823020000155/q1cat10q3312020finalwexhibit.pdf


Object 0 0 not defined.


https://www.sec.gov/Archives/edgar/data/18230/000001823019000281/q32019cat10qfinalwexhibits.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823019000224/q22019cat10qfinal.pdf


Object 0 0 not defined.


https://www.sec.gov/Archives/edgar/data/18230/000001823019000153/cat10q3312019_2.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823018000293/q32018cat10qfinal.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823018000229/q22018cat10qfinal.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823018000136/q12018cat10qfinal.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823017000305/q32017cat10qfinala01.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823017000245/cat_10qx6302017.htm
https://www.sec.gov/Archives/edgar/data/18230/000001823017000145/q12017cat10qfinal.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823016000715/cat10q93016.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823016000630/cat10q6302016.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823016000512/cat10q3312016.pdf
https://www.sec.gov/Archives/edgar/data/18230/000001823015000324/cat10q9302015.pdf
https://www.sec.gov/Archives/edgar/data/18230/0000018230150002

https://www.sec.gov/Archives/edgar/data/858877/000089161802004345/f84358e10vk.htm
https://www.sec.gov/Archives/edgar/data/858877/000109581101505065/f75710e10-k.txt
https://www.sec.gov/Archives/edgar/data/858877/000109581100003692/f65797e10-k.txt
https://www.sec.gov/Archives/edgar/data/858877/000089161800000470/
https://www.sec.gov/Archives/edgar/data/858877/000089161899004365/
https://www.sec.gov/Archives/edgar/data/858877/
https://www.sec.gov/Archives/edgar/data/858877/
https://www.sec.gov/Archives/edgar/data/858877/
https://www.sec.gov/Archives/edgar/data/858877/
Scraping 10-qs
Analyzing 10-qs sentiment
https://www.sec.gov/Archives/edgar/data/858877/000085887723000013/csco-20230429.htm
https://www.sec.gov/Archives/edgar/data/858877/000085887723000005/csco-20230128.htm
https://www.sec.gov/Archives/edgar/data/858877/000085887722000025/csco-20221029.htm
https://www.sec.gov/Archives/edgar/data/858877/000085887722000009/csco-20220430.htm
https://www.sec.gov/Archives/edgar/data/858877/0000

https://www.sec.gov/Archives/edgar/data/93410/000009341013000003/cvx-123112x10kdoc.htm
https://www.sec.gov/Archives/edgar/data/93410/000095012312002976/f60351e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000095012311017688/f56670e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000095012310016846/f54086e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000089161809000054/f50714e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000095013408005470/f37829a1e10vkza.htm
https://www.sec.gov/Archives/edgar/data/93410/000095013408003672/f37829e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000095014907000074/f27542e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000095014906000076/f16935e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000095013405004137/f04196e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000095013404003128/f96742e10vk.htm
https://www.sec.gov/Archives/edgar/data/93410/000095014903000567/f87939e10vk.htm
https://www.sec.go

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://www.sec.gov/Archives/edgar/data/93410/000009341000000006/
https://www.sec.gov/Archives/edgar/data/93410/
https://www.sec.gov/Archives/edgar/data/93410/
https://www.sec.gov/Archives/edgar/data/93410/
https://www.sec.gov/Archives/edgar/data/93410/
https://www.sec.gov/Archives/edgar/data/93410/
https://www.sec.gov/Archives/edgar/data/93410/
Scraping 10-qs
Analyzing 10-qs sentiment
https://www.sec.gov/Archives/edgar/data/93410/000009341023000088/cvx-20230930.htm
https://www.sec.gov/Archives/edgar/data/93410/000009341023000079/cvx-20230630.htm
https://www.sec.gov/Archives/edgar/data/93410/000009341023000066/cvx-20230331.htm
https://www.sec.gov/Archives/edgar/data/93410/000009341022000075/cvx-20220930.htm
https://www.sec.gov/Archives/edgar/data/93410/000009341022000066/cvx-20220630.htm
https://www.sec.gov/Archives/edgar/data/93410/000009341022000028/cvx-20220331.htm
https://www.sec.gov/Archives/edgar/data/93410/000009341022000026/cvx-20220331.htm
https://www.sec.gov/Archives/edgar/da

https://www.sec.gov/Archives/edgar/data/354950/000035495019000010/hd_10kx02032019.htm
https://www.sec.gov/Archives/edgar/data/354950/000035495018000019/hd_10k01282018.htm
https://www.sec.gov/Archives/edgar/data/354950/000035495017000005/hd-01292017x10xk.htm
https://www.sec.gov/Archives/edgar/data/354950/000035495016000060/hd-1312016x10xk.htm
https://www.sec.gov/Archives/edgar/data/354950/000035495015000008/hd-212015x10xk.htm
https://www.sec.gov/Archives/edgar/data/354950/000035495014000008/hd-222014x10xk.htm
https://www.sec.gov/Archives/edgar/data/354950/000035495013000008/hd-232013x10xk.htm
https://www.sec.gov/Archives/edgar/data/354950/000035495012000003/hd-1292012x10xk.htm
https://www.sec.gov/Archives/edgar/data/354950/000119312511076501/d10k.htm
https://www.sec.gov/Archives/edgar/data/354950/000119312510067178/d10k.htm
https://www.sec.gov/Archives/edgar/data/354950/000095014409002875/x17422e10vk.htm
https://www.sec.gov/Archives/edgar/data/354950/000104746908004077/a2183971z10-k.htm

https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
https://www.sec.gov/Archives/edgar/data/354950/
----------------------------------------------------------------------------------------------------
Getting data for Honeywell International Inc
Scraping 10-ks


KeyboardInterrupt: 

In [6]:
# Write the company sentiment data to a directory (one CSV per company)
folder_name = 'company_sentiment_count'
os.makedirs(folder_name, exist_ok=True)
folder_path = os.path.join(os.getcwd(), folder_name)

for company, df in company_df_dict.items():
    # company_df_dict may still hold placeholder values (raw CIKs) for
    # companies whose analysis never ran; only DataFrames can be written.
    if not isinstance(df, pd.DataFrame):
        print(f"Skipping {company}: no sentiment data collected")
        continue

    # Net tone in [-1, 1]. When a filing has neither positive nor negative
    # words the division is 0/0, which pandas stores as NaN rather than raising.
    df['sentiment_score'] = (df['positive'] - df['negative'])/(df['positive'] + df['negative'])

    # File name: company name without punctuation, spaces turned into underscores
    cleaned_company = re.sub(r'[^a-zA-Z0-9\s]', '', company)
    file_name = cleaned_company.replace(' ', '_')

    csv_file_path = os.path.join(folder_path, file_name+'.csv')
    df.to_csv(csv_file_path, index=False)

In [3]:
# Read the per-company sentiment CSVs back in, keyed by company name
folder_path = os.path.join(os.getcwd(), 'company_sentiment_count')
dow_ciks = pd.read_excel("Dow_ciks.xlsx", sheet_name='Sheet1')
companies = list(dow_ciks['Name'])

company_sentiment_dfs = {}
for company in companies:
    # Rebuild the file name: punctuation stripped, spaces turned into underscores
    cleaned_company = re.sub(r'[^a-zA-Z0-9\s]', '', company)
    file_name = '_'.join(cleaned_company.split(' ')) + '.csv'

    df = pd.read_csv(f"{folder_path}/{file_name}")
    company_sentiment_dfs[company] = df