## In this notebook, we call GPT 3.5 to generate summaries of the 10-K reports as a baseline for comparison. The general process is:
- Load in processed Item 7 data where all the tables were removed
- Split each report evenly into chunks for multiple GPT 3.5 API calls (since the API has a maximum token limit per call)
- Ask GPT 3.5 to generate a summary for each chunk 
- Concatenate the chunk summaries to form the summary of Item 7 as a whole
- Calculate ROUGE scores and BERTScore for the GPT 3.5-generated summaries against the ground-truth labels

In [None]:
# mount Google Drive at /content/drive (Colab only); the data and API key
# paths used below live under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install openai
!pip install rouge
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.2-py3-none-any.whl (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.1 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 KB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting multidict<7.0,>=4.5
  Downloadi

In [None]:
# import packages
import pandas as pd
import numpy as np
import glob
import os.path
import re
import openai
from tqdm import tqdm
import math
from rouge import Rouge
from bert_score import score
import torch
import time
from openai.error import InvalidRequestError
import pickle

In [None]:
# load the processed Item 7 data
file_path = "/content/drive/MyDrive/w210_capstone_project/data/SEC_Edgar_Annual_Financial_Filings_2021/working2/"
df = pd.read_pickle(os.path.join(file_path, 'item7_text2.pkl'))

# flatten every ground-truth label onto a single line
newline_pattern = re.compile('\n')
df['label'] = df['label'].map(lambda label_text: newline_pattern.sub(' ', label_text))

# drop the reports whose ids are excluded from the evaluation set
labels_to_remove = ['823546', '827876', '1725579', '829323', '843006', '849399']
keep_mask = ~df['id'].isin(labels_to_remove)
df = df[keep_mask]

df

Unnamed: 0,id,label_length,label,report_length,report,has_label
0,8670,1142,"AUTOMATIC DATA PROCESSING, INC. (“ADPI”) Auto...",7231,"Tabular dollars are presented in millions, exc...",True
1,50471,907,"Park City Group, Inc. (“PCGI”) The Company is ...",3859,The following Management’s Discussion and Anal...,True
2,78749,756,"AGILYSYS, Inc. (“AI”) Agilysys has been a lead...",5664,In “Management’s Discussion and Analysis of Fi...,True
3,317788,927,"Digital Turbine, Inc. (“DTI”) Digital Turbine,...",12886,The following discussion should be read in con...,True
4,320340,933,Intelligent Systems Corporation (“ISC”) ISC’s...,3481,Executive Summary Our consolidated operations ...,True
5,713425,880,"American Software, Inc.. (“ASI”) ASI operates...",6742,The following discussion and analysis should b...,True
6,723531,670,"Paychex, Inc. (“PI”) PI is a leading human re...",6794,20 Fiscal 2021 Business Highlights Highlights ...,True
7,1810806,1471,Unity is the world’s leading platform for crea...,8898,Please read the following discussion and analy...,True
8,1806837,868,Vertex is a leading provider of enterprise tax...,14093,comprise 72.5% of our 2020 software subscripti...,True
9,1794515,1292,ZoomInfo is a leading go-to-market intelligenc...,17675,The following discussion and analysis of our f...,True


In [None]:
# summary statistics of the ground-truth label lengths
df['label_length'].describe()

count      29.000000
mean     1237.758621
std       372.097688
min       670.000000
25%       929.000000
50%      1186.000000
75%      1471.000000
max      2158.000000
Name: label_length, dtype: float64

In [None]:
# import private OpenAI API key (stored on the first line of the key file)
with open('/content/drive/MyDrive/openai.txt') as f:
    lines = f.readlines()
# readlines() keeps the line terminator; strip it so the key is not sent
# with a trailing newline/whitespace
openai.api_key = lines[0].strip()

In [None]:
def generate_summary_with_gpt3point5(text, max_num_tokens, token_ratio):
    """Summarize one chunk of text with a single gpt-3.5-turbo chat completion.

    Token counts are estimates: the number of space-separated words multiplied
    by ``token_ratio`` and rounded up. No tokenizer is involved.

    Args:
        text: the chunk to summarize.
        max_num_tokens: token budget shared by the input text and the completion.
        token_ratio: estimated number of tokens per word.

    Returns:
        The message content of the first returned choice.
    """
    estimated_prompt_tokens = math.ceil(token_ratio * len(text.split(' ')))
    # whatever the estimated input does not use is left for the completion
    completion_budget = int(max_num_tokens - estimated_prompt_tokens)

    prompt = f'Summarize the following text in no more than {completion_budget} tokens: "{text}"'
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=completion_budget,
        n=1,
        stop=None,
        temperature=0.7,
    )
    output = response['choices'][0]['message']['content']

    # log the estimated (not the billed) token usage of this call
    estimated_summary_tokens = math.ceil(token_ratio * len(output.split(' ')))
    estimated_total = estimated_prompt_tokens + estimated_summary_tokens
    print(f'num_input_tokens: {estimated_prompt_tokens}, '
          f'num_output_tokens: {estimated_summary_tokens}, '
          f'total: {estimated_total}')
    return output

In [None]:
def truncate_text_for_gpt3point5(row):
  """Summarize one report by splitting it into even chunks and summarizing each chunk.

  The number of chunks is chosen so that the estimated tokens of the report plus
  the estimated tokens of its ground-truth label fit into ``max_num_tokens`` per
  API call. If the API rejects a call because the model's context length is
  exceeded, the per-call budget is lowered by 100 tokens and the whole report
  is processed again from the first chunk.

  Args:
    row: a record with the keys 'id', 'report', 'label_length' (words in the
      ground-truth label) and 'report_length' (words in the report).

  Returns:
    The chunk summaries joined with single spaces.

  Raises:
    InvalidRequestError: re-raised unchanged when the API rejects a request for
      any reason other than the context length, or when the budget cannot be
      lowered any further.
  """
  file_id = row['id']
  text = row['report']
  # split once; the word list is reused for every chunk and every retry
  words = text.split(" ")

  # number of words in ground truth labels
  output_length = row['label_length']
  # number of words in report
  input_length = row['report_length']
  # estimated tokens per word, used to convert word counts into token counts
  token_ratio = 1000.0 / 750
  input_token_length = math.ceil(input_length * token_ratio)
  label_token_length = math.ceil(output_length * token_ratio)

  max_num_tokens = 4000
  while True:
    num_api_calls = math.ceil((input_token_length + label_token_length) / max_num_tokens)
    # chunk boundaries (word indices) that cut the report into equal parts
    split_word_indices = np.asarray(np.linspace(0, len(words), num_api_calls + 1), dtype = "int")

    print(f"Processing file {file_id}, input length: {input_length}, output length: {output_length}")

    summaries = []
    try:
      # call API chunk by chunk
      for start, end in zip(split_word_indices[:-1], split_word_indices[1:]):
        chunk_text = " ".join(words[start:end])
        summaries.append(generate_summary_with_gpt3point5(chunk_text, max_num_tokens, token_ratio))
        # wait due to OpenAI rate limit
        time.sleep(15)
    except InvalidRequestError as e:
      if "This model's maximum context length" not in (e.user_message or ""):
        # not a context-length problem: surface the original error untouched
        raise
      max_num_tokens -= 100
      if max_num_tokens <= 0:
        # nothing left to reduce; give up instead of dividing by zero above
        raise
      print(f'reduce max_num_tokens by 100: {max_num_tokens}')
      continue

    # return generated summaries of all chunks
    return " ".join(summaries)

In [None]:
# directory that holds the GPT-generated summaries, one <id>.txt file per report
summary_path = "/content/drive/MyDrive/w210_capstone_project/data/SEC_Edgar_Annual_Financial_Filings_2021/working2/gpt"
# exist_ok replaces the separate exists() check and its check-then-create race
os.makedirs(summary_path, exist_ok=True)

In [None]:
# Generate one summary file per report. Reports whose output file already
# exists are skipped, so the cell can be re-run to resume an interrupted run.
# iterrows() has no length, so the total is passed to tqdm explicitly.
for i, row in tqdm(df.iterrows(), total=len(df)):
  output_name = f'{summary_path}/{row["id"]}.txt'
  if os.path.exists(output_name):
    continue
  summary = truncate_text_for_gpt3point5(row)
  with open(output_name, 'w') as f:
    f.write(summary)

0it [00:00, ?it/s]

Processing file 1794515, input length: 17675, output length: 1292
num_input_tokens: 3367, num_output_tokens: 252, total: 3619
num_input_tokens: 3367, num_output_tokens: 196, total: 3563
num_input_tokens: 3367, num_output_tokens: 180, total: 3547
num_input_tokens: 3367, num_output_tokens: 488, total: 3855
reduce max_num_tokens by 100: 3900
Processing file 1794515, input length: 17675, output length: 1292
num_input_tokens: 3367, num_output_tokens: 175, total: 3542
num_input_tokens: 3367, num_output_tokens: 142, total: 3509
num_input_tokens: 3367, num_output_tokens: 188, total: 3555
num_input_tokens: 3367, num_output_tokens: 164, total: 3531
num_input_tokens: 3367, num_output_tokens: 148, total: 3515
num_input_tokens: 3367, num_output_tokens: 214, total: 3581
num_input_tokens: 3367, num_output_tokens: 242, total: 3609


10it [04:32, 27.26s/it]

Processing file 1803696, input length: 12509, output length: 1676
num_input_tokens: 3335, num_output_tokens: 191, total: 3526
num_input_tokens: 3336, num_output_tokens: 359, total: 3695
reduce max_num_tokens by 100: 3900
Processing file 1803696, input length: 12509, output length: 1676
num_input_tokens: 3335, num_output_tokens: 252, total: 3587
num_input_tokens: 3336, num_output_tokens: 486, total: 3822
num_input_tokens: 3336, num_output_tokens: 178, total: 3514
num_input_tokens: 3336, num_output_tokens: 176, total: 3512
num_input_tokens: 3336, num_output_tokens: 218, total: 3554


11it [07:45, 47.94s/it]

Processing file 1786352, input length: 9861, output length: 1232
num_input_tokens: 3287, num_output_tokens: 198, total: 3485
num_input_tokens: 3287, num_output_tokens: 247, total: 3534
num_input_tokens: 3287, num_output_tokens: 127, total: 3414
num_input_tokens: 3288, num_output_tokens: 150, total: 3438


12it [09:22, 55.35s/it]

Processing file 1773383, input length: 9128, output length: 1433
num_input_tokens: 3043, num_output_tokens: 256, total: 3299
num_input_tokens: 3043, num_output_tokens: 163, total: 3206
num_input_tokens: 3043, num_output_tokens: 247, total: 3290
num_input_tokens: 3043, num_output_tokens: 212, total: 3255


13it [11:05, 63.97s/it]

Processing file 1768267, input length: 14492, output length: 1275
num_input_tokens: 3220, num_output_tokens: 184, total: 3404
reduce max_num_tokens by 100: 3900
Processing file 1768267, input length: 14492, output length: 1275
num_input_tokens: 3220, num_output_tokens: 176, total: 3396
reduce max_num_tokens by 100: 3800
Processing file 1768267, input length: 14492, output length: 1275
num_input_tokens: 3220, num_output_tokens: 178, total: 3398
reduce max_num_tokens by 100: 3700
Processing file 1768267, input length: 14492, output length: 1275
num_input_tokens: 3220, num_output_tokens: 164, total: 3384
reduce max_num_tokens by 100: 3600
Processing file 1768267, input length: 14492, output length: 1275
num_input_tokens: 3220, num_output_tokens: 186, total: 3406
num_input_tokens: 3220, num_output_tokens: 162, total: 3382
num_input_tokens: 3222, num_output_tokens: 164, total: 3386
num_input_tokens: 3220, num_output_tokens: 219, total: 3439
num_input_tokens: 3220, num_output_tokens: 150, to

14it [15:13, 101.25s/it]

Processing file 1764925, input length: 10361, output length: 1911
num_input_tokens: 2763, num_output_tokens: 315, total: 3078
num_input_tokens: 2763, num_output_tokens: 152, total: 2915
num_input_tokens: 2763, num_output_tokens: 251, total: 3014
num_input_tokens: 2763, num_output_tokens: 203, total: 2966
num_input_tokens: 2764, num_output_tokens: 263, total: 3027


15it [17:25, 108.11s/it]

Processing file 1739942, input length: 12401, output length: 1534
num_input_tokens: 3307, num_output_tokens: 259, total: 3566
num_input_tokens: 3307, num_output_tokens: 122, total: 3429
num_input_tokens: 3307, num_output_tokens: 231, total: 3538
num_input_tokens: 3307, num_output_tokens: 346, total: 3653
num_input_tokens: 3308, num_output_tokens: 319, total: 3627


16it [19:41, 114.84s/it]

Processing file 1739936, input length: 7734, output length: 1562
num_input_tokens: 2578, num_output_tokens: 183, total: 2761
num_input_tokens: 2579, num_output_tokens: 382, total: 2961
num_input_tokens: 2578, num_output_tokens: 207, total: 2785
num_input_tokens: 2579, num_output_tokens: 190, total: 2769


17it [21:26, 112.21s/it]

Processing file 736012, input length: 3048, output length: 1076
num_input_tokens: 2032, num_output_tokens: 192, total: 2224
num_input_tokens: 2032, num_output_tokens: 284, total: 2316


18it [22:20, 96.62s/it] 

Processing file 746210, input length: 4762, output length: 929
reduce max_num_tokens by 100: 3900
Processing file 746210, input length: 4762, output length: 929
reduce max_num_tokens by 100: 3800
Processing file 746210, input length: 4762, output length: 929
reduce max_num_tokens by 100: 3700
Processing file 746210, input length: 4762, output length: 929
num_input_tokens: 2116, num_output_tokens: 154, total: 2270
num_input_tokens: 2116, num_output_tokens: 248, total: 2364
num_input_tokens: 2118, num_output_tokens: 270, total: 2388


19it [23:40, 91.94s/it]

Processing file 769397, input length: 10542, output length: 1065
num_input_tokens: 3514, num_output_tokens: 219, total: 3733
num_input_tokens: 3515, num_output_tokens: 290, total: 3805
num_input_tokens: 3514, num_output_tokens: 195, total: 3709
num_input_tokens: 3515, num_output_tokens: 348, total: 3863


20it [25:30, 97.18s/it]

Processing file 789019, input length: 439, output length: 1094
num_input_tokens: 586, num_output_tokens: 202, total: 788


21it [25:54, 76.14s/it]

Processing file 796343, input length: 8232, output length: 1195
num_input_tokens: 2744, num_output_tokens: 183, total: 2927
num_input_tokens: 2744, num_output_tokens: 120, total: 2864
num_input_tokens: 2744, num_output_tokens: 211, total: 2955
num_input_tokens: 2744, num_output_tokens: 194, total: 2938


22it [27:33, 82.62s/it]

Processing file 807863, input length: 10350, output length: 1186
num_input_tokens: 3450, num_output_tokens: 274, total: 3724
reduce max_num_tokens by 100: 3900
Processing file 807863, input length: 10350, output length: 1186
num_input_tokens: 3450, num_output_tokens: 302, total: 3752
reduce max_num_tokens by 100: 3800
Processing file 807863, input length: 10350, output length: 1186
num_input_tokens: 2760, num_output_tokens: 243, total: 3003
num_input_tokens: 2760, num_output_tokens: 134, total: 2894
num_input_tokens: 2760, num_output_tokens: 295, total: 3055
num_input_tokens: 2760, num_output_tokens: 420, total: 3180
num_input_tokens: 2760, num_output_tokens: 232, total: 2992


23it [30:57, 118.32s/it]

Processing file 807882, input length: 5128, output length: 1447
num_input_tokens: 2279, num_output_tokens: 288, total: 2567
num_input_tokens: 2279, num_output_tokens: 376, total: 2655
num_input_tokens: 2280, num_output_tokens: 191, total: 2471


24it [32:29, 110.62s/it]

Processing file 813672, input length: 6765, output length: 1632
num_input_tokens: 3007, num_output_tokens: 396, total: 3403
num_input_tokens: 3007, num_output_tokens: 206, total: 3213
num_input_tokens: 3007, num_output_tokens: 171, total: 3178


25it [33:49, 101.57s/it]

Processing file 814547, input length: 10527, output length: 2158
num_input_tokens: 2807, num_output_tokens: 195, total: 3002
num_input_tokens: 2807, num_output_tokens: 316, total: 3123
num_input_tokens: 2808, num_output_tokens: 194, total: 3002
num_input_tokens: 2807, num_output_tokens: 268, total: 3075
num_input_tokens: 2808, num_output_tokens: 342, total: 3150


26it [36:07, 112.51s/it]

Processing file 814549, input length: 5744, output length: 1826
num_input_tokens: 2552, num_output_tokens: 334, total: 2886
num_input_tokens: 2554, num_output_tokens: 590, total: 3144
reduce max_num_tokens by 100: 3900
Processing file 814549, input length: 5744, output length: 1826
num_input_tokens: 2552, num_output_tokens: 170, total: 2722
num_input_tokens: 2554, num_output_tokens: 604, total: 3158
num_input_tokens: 2554, num_output_tokens: 398, total: 2952


27it [39:19, 136.05s/it]

Processing file 816761, input length: 7356, output length: 1031
num_input_tokens: 3270, num_output_tokens: 323, total: 3593
num_input_tokens: 3270, num_output_tokens: 288, total: 3558
num_input_tokens: 3270, num_output_tokens: 212, total: 3482


28it [40:44, 121.04s/it]

Processing file 727634, input length: 5073, output length: 787
num_input_tokens: 3382, num_output_tokens: 204, total: 3586
num_input_tokens: 3383, num_output_tokens: 266, total: 3649


29it [41:38, 86.15s/it] 


In [None]:
# load in GPT generated summaries after creating them
summary_list = glob.glob(os.path.join(summary_path,'*.txt'))
len(summary_list)

29

In [None]:
summary_df = []

def get_id(x):
  """Return the report id encoded in a summary file path.

  The id is the file name up to its first ".", e.g. ".../gpt/8670.txt" -> "8670".
  Only the last path component is inspected, so the result does not depend on
  which directory the file lives in or on dots in directory names.

  Args:
    x: path of a summary file.

  Returns:
    The id as a string.
  """
  return os.path.basename(x).split(".")[0]

# Read every summary file into one record per report.
summary_records = []
for s in summary_list:
  with open(s, 'r') as f:
    # read() returns the text exactly as stored; joining readlines() with "\n"
    # doubled every line break because each line already ends with one
    summary_records.append({"id": get_id(s), "gpt_summary": f.read()})

# explicit columns keep the frame well-formed even when no files were found
summary_df = pd.DataFrame(summary_records, columns=["id", "gpt_summary"])
summary_df.head()

Unnamed: 0,id,gpt_summary
0,8670,"ADP, a leading global provider of cloud-based ..."
1,50471,This text provides management's discussion and...
2,78749,"Agilysys, a leader in hospitality software, pr..."
3,317788,"Digital Turbine, Inc. is a mobile content disc..."
4,320340,CoreCard Software provides technology solution...


In [None]:
# Attach the GPT summary to each report (left join keeps every report).
# Dropping an existing gpt_summary column first keeps the cell idempotent:
# re-running it would otherwise produce gpt_summary_x / gpt_summary_y columns.
df = pd.merge(df.drop(columns=["gpt_summary"], errors="ignore"), summary_df, how = "left", on = "id")
df.head()

Unnamed: 0,id,label_length,label,report_length,report,has_label,gpt_summary
0,8670,1142,"AUTOMATIC DATA PROCESSING, INC. (“ADPI”) Auto...",7231,"Tabular dollars are presented in millions, exc...",True,"ADP, a leading global provider of cloud-based ..."
1,50471,907,"Park City Group, Inc. (“PCGI”) The Company is ...",3859,The following Management’s Discussion and Anal...,True,This text provides management's discussion and...
2,78749,756,"AGILYSYS, Inc. (“AI”) Agilysys has been a lead...",5664,In “Management’s Discussion and Analysis of Fi...,True,"Agilysys, a leader in hospitality software, pr..."
3,317788,927,"Digital Turbine, Inc. (“DTI”) Digital Turbine,...",12886,The following discussion should be read in con...,True,"Digital Turbine, Inc. is a mobile content disc..."
4,320340,933,Intelligent Systems Corporation (“ISC”) ISC’s...,3481,Executive Summary Our consolidated operations ...,True,CoreCard Software provides technology solution...


In [None]:
def calculate_metrics(ref_sentences, cand_sentences):
    """Score candidate summaries against reference summaries.

    Args:
        ref_sentences: reference texts; must provide ``.tolist()``.
        cand_sentences: candidate texts, aligned one-to-one with
            ``ref_sentences``; must provide ``.tolist()``.

    Returns:
        A 4-tuple ``(rouge-1 f, rouge-2 f, rouge-l f, bert_f1)``. The ROUGE
        values come from ``get_scores(..., avg=True)``; the last entry is the
        value returned by ``get_bert_score``.
    """
    # Rouge takes the candidates (hypotheses) first and the references second
    averaged_rouge = Rouge().get_scores(cand_sentences, ref_sentences, avg=True)
    rouge_f_scores = tuple(
        averaged_rouge[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    )

    bert_f1 = get_bert_score(cand_sentences.tolist(), ref_sentences.tolist())

    return rouge_f_scores + (bert_f1,)

def get_bert_score(cands, refs):
    """Return the mean BERTScore F1 of the candidates against the references.

    Args:
        cands: list of candidate texts.
        refs: list of reference texts, same length as ``cands``.

    Returns:
        The F1 values averaged over all pairs, as a Python float.

    Raises:
        AssertionError: if ``cands`` and ``refs`` differ in length.
    """
    assert len(cands) == len(refs)
    # score() also returns precision and recall; only F1 is reported
    _, _, F1 = score(cands, refs, lang='en')
    return torch.mean(F1, dim=0).item()

In [None]:
# The ground-truth labels are the references and the GPT summaries are the
# candidates. Keyword arguments keep the two roles from being swapped; the
# earlier positional call passed them in reverse order.
calculate_metrics(ref_sentences=df['label'], cand_sentences=df['gpt_summary'])

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(0.40758411306882947,
 0.20169378930291199,
 0.3852132938473419,
 0.8516900539398193)

In [None]:
# persist the merged frame (reports, labels and GPT summaries) in the summary directory
df.to_pickle(os.path.join(summary_path, "gpt_summaries_df.pkl"))