## In this notebook, we join each report's processed Item 7 text with its corresponding summary label, in preparation for model training.

In [None]:
# Make Google Drive visible to this Colab runtime; the data paths used below
# live under this mount point.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
# import packages
import pandas as pd
import numpy as np
import json
import glob
import os.path
import re
#import openai
from tqdm import tqdm

In [None]:
# Directory holding the processed filings: HTML files in which all tables
# have already been identified.
file_path = "/content/drive/MyDrive/w210_capstone_project/data/SEC_Edgar_Annual_Financial_Filings_2021/working2/"

# Collect every .htm file that sits directly inside that directory.
html_pattern = os.path.join(file_path, '*.htm')
file_list = glob.glob(html_pattern)

# get file and label indices for join
def get_indices(path, kind):
  """Return the id prefix of a report or label file name.

  The id is the part of the file's base name that precedes the first
  underscore; it is the key used to join reports with labels.

  Args:
    path: Path to a report or label file.
    kind: Either "report" or "label". Both kinds are parsed the same way; the
      argument is kept so existing calls keep working.

  Returns:
    The base name of `path` up to (not including) the first "_".

  Raises:
    ValueError: If `kind` is neither "report" nor "label".
  """
  if kind not in ("report", "label"):
    # Fail with a clear message instead of hitting an unbound local below.
    raise ValueError(f'kind must be "report" or "label", got {kind!r}')
  # Work from the base name rather than stripping a hard-coded directory
  # prefix: the directory names themselves contain underscores, so a path that
  # did not match the prefix exactly would yield part of a directory name.
  return os.path.basename(path).split("_")[0]

# Read every processed filing into memory as an (id, html_text) pair.
reports = []
for report_file in file_list:
  report_id = get_indices(report_file, "report")
  # read() returns the contents unchanged. Joining readlines() with "\n" would
  # double every line break, because each line keeps its own terminator.
  # The encoding is stated explicitly so the result does not depend on the
  # runtime's locale.
  with open(report_file, 'r', encoding='utf-8') as html_file:
    reports.append((report_id, html_file.read()))

len(reports)

191

In [None]:
# get labels
label_path = "/content/drive/MyDrive/w210_capstone_project/data/SEC_Edgar_Annual_Financial_Filings_2021/summaries/"

# '*' matches anything in the directory; keep regular files only, since
# open() would fail on a sub-directory.
label_list = [p for p in glob.glob(os.path.join(label_path, '*')) if os.path.isfile(p)]

# Read each summary once as an (id, text) pair. The encoding is stated
# explicitly so the result does not depend on the runtime's locale.
label_texts = []
for label_file in label_list:
  label_id = get_indices(label_file, "label")
  with open(label_file, 'r', encoding='utf-8') as summary_file:
    label_texts.append((label_id, summary_file.read()))

# ids of every summary, in the same order as label_texts; used later to flag
# which reports have a label
labels = [pair[0] for pair in label_texts]

len(label_texts)

50

In [None]:
def item7_text_only(text):
  """Reduce a processed filing's HTML to whitespace-normalized plain text.

  Steps, in order:
    1. Replace each `<openai>digits</openai>` marker (the tables identified
       earlier) with a space.
    2. Remove the Item 7A section. The match is greedy: everything from the
       opening `<div id='Item 7A'>` through the LAST `</div>` in the text is
       removed, so any content after the Item 7A div is dropped as well.
    3. Split on closing tags and blank out the remaining opening tags.
    4. Collapse all runs of whitespace to single spaces and strip the ends.

  Args:
    text: HTML source of a processed filing, as a string.

  Returns:
    The remaining text as a single line; "" if nothing is left.
  """
  # Raw strings keep the regex escapes (\s, \S) out of Python's own string
  # escape handling, which warns on unrecognized sequences.
  # remove tables identified earlier
  stripped = re.sub(r'<openai>[ 0-9]+</openai>', ' ', text)
  # keep only item 7 (greedy on purpose-compatible grounds: see docstring)
  stripped = re.sub(r"<div id='Item 7A'>[\S\s]+</div>", ' ', stripped)
  # break HTML by ending HTML tag i.e. </xxx>
  pieces = re.split(r'</[A-Za-z]+>', stripped)
  # replace starting HTML tag i.e. <xxx>
  pieces = [re.sub(r'<[^<>]+>', ' ', piece) for piece in pieces]
  # concat back into one string
  return re.sub(r'\s+', ' ', ' '.join(pieces)).strip()

In [None]:
def cleanse(text):
  """Return `text` with every newline character replaced by a single space."""
  return text.replace('\n', ' ')

In [None]:
# Strip the markup from every filing and flatten every summary onto one line,
# keeping each text paired with its id.
cleaned_reports = []
for report_id, html in reports:
  cleaned_reports.append({'id': report_id, 'report': item7_text_only(html)})

cleaned_labels = []
for label_id, summary in label_texts:
  cleaned_labels.append({'id': label_id, 'label': cleanse(summary)})

In [None]:
# One row per report: id, word count, cleaned text, and whether a summary
# label exists for that id.
labeled_ids = set(labels)  # set lookup instead of scanning the list for every report
report_rows = [
    {
        "id": rec['id'],
        # split() with no argument counts whitespace-delimited words, so an
        # empty report has length 0 (split(" ") would report 1 for "").
        "report_length": len(rec['report'].split()),
        "report": rec['report'],
        "has_label": rec['id'] in labeled_ids,
    }
    for rec in cleaned_reports
]
# Naming the columns keeps them present even when there are no reports, so the
# later join on "id" still works.
report_df = pd.DataFrame(report_rows, columns=["id", "report_length", "report", "has_label"])
report_df.head()

Unnamed: 0,id,report_length,report,has_label
0,1001601,4617,Overview Following a review of its Bitcoin min...,True
1,1002517,12011,The following Management’s Discussion and Anal...,True
2,1013462,10884,"Business Ansys, a Delaware corporation formed ...",True
3,1013857,4251,"BUSINESS OVERVIEW We develop, market, license,...",False
4,1015739,1,,False


In [None]:
# One row per summary label: id, word count, and the flattened label text.
label_rows = [
    {
        "id": rec['id'],
        # split() with no argument counts whitespace-delimited words.
        # split(" ") would also count the empty strings between consecutive
        # spaces, inflating the length.
        "label_length": len(rec['label'].split()),
        "label": rec['label'],
    }
    for rec in cleaned_labels
]
# Naming the columns keeps them present even when there are no labels, so the
# later join on "id" still works.
label_df = pd.DataFrame(label_rows, columns=["id", "label_length", "label"])
label_df.head()

Unnamed: 0,id,label_length,label
0,8670,1142,"AUTOMATIC DATA PROCESSING, INC. (“ADPI”) Auto..."
1,50471,907,"Park City Group, Inc. (“PCGI”) The Company is ..."
2,78749,756,"AGILYSYS, Inc. (“AI”) Agilysys has been a lead..."
3,317788,927,"Digital Turbine, Inc. (“DTI”) Digital Turbine,..."
4,320340,933,Intelligent Systems Corporation (“ISC”) ISC’s...


In [None]:
# Left join on id: every summary label is kept, with the matching report's
# columns attached where a report with the same id exists.
df = label_df.merge(report_df, on='id', how='left')
df.head(50)

Unnamed: 0,id,label_length,label,report_length,report,has_label
0,8670,1142,"AUTOMATIC DATA PROCESSING, INC. (“ADPI”) Auto...",7231,"Tabular dollars are presented in millions, exc...",True
1,50471,907,"Park City Group, Inc. (“PCGI”) The Company is ...",3859,The following Management’s Discussion and Anal...,True
2,78749,756,"AGILYSYS, Inc. (“AI”) Agilysys has been a lead...",5664,In “Management’s Discussion and Analysis of Fi...,True
3,317788,927,"Digital Turbine, Inc. (“DTI”) Digital Turbine,...",12886,The following discussion should be read in con...,True
4,320340,933,Intelligent Systems Corporation (“ISC”) ISC’s...,3481,Executive Summary Our consolidated operations ...,True
5,713425,880,"American Software, Inc.. (“ASI”) ASI operates...",6742,The following discussion and analysis should b...,True
6,723531,670,"Paychex, Inc. (“PI”) PI is a leading human re...",6794,20 Fiscal 2021 Business Highlights Highlights ...,True
7,1810806,1471,Unity is the world’s leading platform for crea...,8898,Please read the following discussion and analy...,True
8,1806837,868,Vertex is a leading provider of enterprise tax...,14093,comprise 72.5% of our 2020 software subscripti...,True
9,1794515,1292,ZoomInfo is a leading go-to-market intelligenc...,17675,The following discussion and analysis of our f...,True


In [None]:
# Save the joined table in the same directory as the processed filings.
pickle_path = os.path.join(file_path, "item7_text3.pkl")
df.to_pickle(pickle_path)

In [None]:
# Sanity check: labelled rows whose report is no longer than its summary.
is_labelled = df["has_label"] == True
report_not_longer = df['report_length'] <= df['label_length']
df.loc[is_labelled & report_not_longer, ["id", "report_length", "label_length"]]

Unnamed: 0,id,report_length,label_length
20,789019,439,1094
32,843006,1,628
