In [None]:
import pandas as pd
import time
import os
from rouge import Rouge
from bert_score import BERTScorer
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import pipeline, set_seed, GPT2Tokenizer
from torchsummary import summary
from transformers import BartTokenizer,BartModel, BartConfig, BartForConditionalGeneration
from transformers.modeling_outputs import Seq2SeqLMOutput
import bert_score
import nltk
import re
from nltk.tokenize import sent_tokenize
import warnings
import numpy as np

In [None]:
# Read the .pkl file into a pandas DataFrame
# NOTE: unpickling can execute arbitrary code, so only load this file if it comes from a trusted source.
# Later cells read the 'report', 'Output' and 'label' columns from a copy of this frame.
file_path = 'item7_text_v5.pkl'  # relative path, resolved against the kernel's working directory
df = pd.read_pickle(file_path)

In [None]:
def combine_interest_expense(row):
    """
    Merges the 'Interest expense' and 'Interest Expense' entries and drops the 'Not Important' key.

    The two values are concatenated ('Interest expense' first) and stored, converted with
    str(), under 'Interest Expense'. A spelling that is absent is treated as an empty
    string. The input dictionary is not modified: a new dictionary is returned, so the
    function can be applied again (for example when a notebook cell is re-run) without
    raising KeyError and without altering dictionaries shared with other DataFrames.

    Args:
        row (dict): A dictionary containing the keys to be processed. Values are
            expected to be strings.
    Returns:
        dict: A new dictionary with the combined interest expense value and without
        the 'Not Important' key.
    """
    # Shallow copy so the caller's dictionary is left untouched
    merged = dict(row)

    # Remove the lowercase spelling (if any) and concatenate it in front of the other one
    lowercase_value = merged.pop('Interest expense', '')
    combined_expense = lowercase_value + merged.get('Interest Expense', '')
    merged['Interest Expense'] = str(combined_expense)

    # Remove the 'Not Important' key, if present
    merged.pop('Not Important', None)

    return merged

def combine_business_overview(row):
    """
    Merges the 'Business Overview' and 'Business overview' entries into 'Business Overview'.

    The two values are concatenated ('Business Overview' first) and stored, converted with
    str(), under 'Business Overview'. A spelling that is absent is treated as an empty
    string. The input dictionary is not modified: a new dictionary is returned, so the
    function can be applied again (for example when a notebook cell is re-run) without
    raising KeyError.

    Args:
        row (dict): A dictionary containing the keys to be processed. Values are
            expected to be strings.
    Returns:
        dict: A new dictionary with the combined business overview value and without
        the 'Business overview' key.
    """
    # Shallow copy so the caller's dictionary is left untouched
    merged = dict(row)

    # Remove the lowercase spelling (if any) and append it to the capitalised one
    lowercase_value = merged.pop('Business overview', '')
    combined_business = merged.get('Business Overview', '') + lowercase_value
    merged['Business Overview'] = str(combined_business)

    return merged

def combine_operations(row):
    """
    Merges the 'Results of Operations' and 'Results of operations' entries into 'Results of Operations'.

    The two values are concatenated ('Results of Operations' first) and stored, converted
    with str(), under 'Results of Operations'. A spelling that is absent is treated as an
    empty string. The input dictionary is not modified: a new dictionary is returned, so
    the function can be applied again (for example when a notebook cell is re-run) without
    raising KeyError.

    Args:
        row (dict): A dictionary containing the keys to be processed. Values are
            expected to be strings.
    Returns:
        dict: A new dictionary with the combined results of operations value and without
        the 'Results of operations' key.
    """
    # Shallow copy so the caller's dictionary is left untouched
    merged = dict(row)

    # Remove the lowercase spelling (if any) and append it to the capitalised one
    lowercase_value = merged.pop('Results of operations', '')
    combined_operation = merged.get('Results of Operations', '') + lowercase_value
    merged['Results of Operations'] = str(combined_operation)

    return merged


In [None]:
# Work on a copy of the loaded frame and normalise the duplicated section keys
filtered_df = df.copy()

# Run the three key-merging helpers back to back on every report dictionary
filtered_df['report'] = (
    filtered_df['report']
    .apply(combine_interest_expense)
    .apply(combine_business_overview)
    .apply(combine_operations)
)

In [None]:
# Tally, for every section key, how many reports hold non-blank text under it
non_blank_counts = {}
for report in filtered_df['report']:
    for key, value in report.items():
        # Whitespace-only text counts as blank
        if value.strip() == "":
            continue
        non_blank_counts[key] = non_blank_counts.get(key, 0) + 1

print("Counts of non-blank values for each key:")
for key, count in non_blank_counts.items():
    print(f"{key}: {count}")

# Flatten each 'Output' dictionary into one text blob: "<key>: <value>" paragraphs,
# each followed by a blank line; sections whose value is the empty string are skipped
filtered_df['report_output'] = ""
for index, output in filtered_df['Output'].items():
    paragraphs = [key + ": " + value + "\n\n" for key, value in output.items() if value != ""]
    filtered_df.at[index, 'report_output'] = "".join(paragraphs)

# Length of each flattened report in whitespace-separated words
filtered_df['output_length'] = filtered_df['report_output'].map(lambda text: len(text.split()))

# Score every flattened report against its reference label with ROUGE and BERTScore
rouge = Rouge()
scorer = BERTScorer(lang="en")

rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
bert_precision, bert_recall, bert_f1 = [], [], []

# Walk labels and generated reports in lockstep, one pair per DataFrame row
for label, report_output in zip(filtered_df['label'], filtered_df['report_output']):
    # The averaged result is indexed by metric name; the 'f' entry of each metric is kept
    rouge_scores = rouge.get_scores(report_output, label, avg=True)
    for metric_name, score_list in (
        ('rouge-1', rouge1_scores),
        ('rouge-2', rouge2_scores),
        ('rouge-l', rougeL_scores),
    ):
        score_list.append(rouge_scores[metric_name]['f'])

    # A single candidate/reference pair is scored per call; .item() extracts the scalar
    P, R, F1 = scorer.score([report_output], [label])
    for score_tensor, score_list in ((P, bert_precision), (R, bert_recall), (F1, bert_f1)):
        score_list.append(score_tensor.item())

# Only the ROUGE values and the BERTScore F1 are stored on the DataFrame;
# precision and recall stay in their notebook-level lists
for column, values in (
    ('rouge1_final', rouge1_scores),
    ('rouge2_final', rouge2_scores),
    ('rougeL_final', rougeL_scores),
    ('bertscore_f1_final', bert_f1),
):
    filtered_df[column] = values

In [None]:
# Remove underscore sentences
def remove_underscore_sentences(text):
    """
    Drops every sentence that contains an underscore.

    The text is cut on the literal separator '. ', fragments containing at least one
    '_' are discarded, and the remaining fragments are glued back with '. '.

    Args:
        text (str): The text to clean.
    Returns:
        str: The text without the underscore-containing fragments.
    """
    kept = []
    for fragment in text.split('. '):
        if re.search(r'_+', fragment) is None:
            kept.append(fragment)
    return '. '.join(kept)


# Sections that are kept in the cleaned 'Output' dictionaries
keys = [
    'Business Overview',
    'Results of Operations',
    'Revenues',
    'Gross Profit Margin',
    'Operating Income',
    'Interest Expense',
    'Liquidity',
    'Debt'
]

# Clean the 'Output' column in a single pass: build a fresh dictionary per row that keeps
# only the sections listed in `keys`, each with its underscore-containing sentences removed.
# New dictionaries are built instead of editing the stored ones in place, so dictionaries
# that other DataFrames may still reference are not modified and the cell can be re-run.
# NOTE(review): 'report_output' and the ROUGE/BERTScore columns are computed from 'Output'
# in an earlier cell, i.e. before this cleaning, so they do not reflect it.
filtered_df['Output'] = filtered_df['Output'].apply(
    lambda report: {key: remove_underscore_sentences(report[key]) for key in keys if key in report}
)


In [None]:
# create a folder named "outputs" (no error if it already exists)
os.makedirs("outputs", exist_ok=True)

# NOTE(review): `new_filtered_df` is not defined in any cell shown above this one;
# confirm where it is created before relying on Restart & Run All.
# loop through each row in the dataframe
for index, row in new_filtered_df.iterrows():
    # create the filename for the text file
    filename = f"outputs/{row['id']}_final_model.txt"
    # open the file in write mode; UTF-8 is set explicitly so non-ASCII characters in the
    # report do not depend on the platform's default encoding
    with open(filename, "w", encoding="utf-8") as f:
        # write the report_output string to the file
        f.write(row['report_output'])

In [None]:
# Fetch NLTK's 'punkt' resource. Presumably needed by sent_tokenize, which is imported at
# the top but not called in the cells shown here; TODO confirm this download is still required.
nltk.download('punkt')

In [None]:
def initial_model(model_checkpoint_name):
    """
    Loads the tokenizer and the seq2seq model for a checkpoint.

    Args:
        model_checkpoint_name (str): Checkpoint identifier passed to both from_pretrained calls.
    Returns:
        tuple: (tokenizer, model), in that order.
    """
    # The tokenizer is loaded first, then the model, both from the same checkpoint
    return (
        AutoTokenizer.from_pretrained(model_checkpoint_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_name),
    )

# Load the pretrained BART model and tokenizer
# These notebook-level names are read as globals further down: `tokenizer` inside
# generate_summary, and both `tokenizer` and `model` inside process_report_bartbaseline.
tokenizer, model = initial_model("facebook/bart-large-cnn")

In [None]:
def process_input(tokenizer, my_text, max_length=1024):
    """
    Tokenizes text into PyTorch tensors, truncating to at most `max_length` tokens.

    Args:
        tokenizer: Callable tokenizer, invoked as
            tokenizer(text, return_tensors="pt", truncation=True, max_length=...).
        my_text (str): The text to encode.
        max_length (int): Truncation limit handed to the tokenizer. Defaults to 1024,
            the value that was previously hard-coded.
    Returns:
        The tokenizer's return value, unchanged. generate_summary reads its
        'input_ids' and 'attention_mask' entries.
    """
    return tokenizer(
        my_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length
    )

def generate_summary(input_ids, model, tokenizer=None, max_length=1024, num_beams=64):
    """
    Generates a summary for one encoded input and decodes it to text.

    Args:
        input_ids: Mapping with 'input_ids' and 'attention_mask' entries, as returned
            by process_input.
        model: Object exposing generate(input_ids=..., attention_mask=..., ...).
        tokenizer: Object exposing decode(...). When omitted, the notebook-level
            `tokenizer` variable is used, which was the previous (implicit) behaviour.
        max_length (int): Upper bound passed to model.generate. Defaults to 1024,
            the value that was previously hard-coded.
        num_beams (int): Beam count passed to model.generate. Defaults to 64,
            the value that was previously hard-coded.
    Returns:
        str: The decoded first generated sequence, without special tokens.
    Raises:
        NameError: If no tokenizer is passed and no notebook-level `tokenizer` exists.
    """
    # Resolve the tokenizer before the expensive generate() call so a missing one fails fast
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined")

    generated_tokens = model.generate(
        input_ids=input_ids['input_ids'],
        attention_mask=input_ids['attention_mask'],
        max_length=max_length,
        no_repeat_ngram_size=2,
        num_beams=num_beams
    )
    # Only the first generated sequence is decoded
    decoded_summary = tokenizer.decode(
        generated_tokens[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return decoded_summary


def split_text(text, lines_per_chunk=25):
    """
    Breaks text into chunks of at most `lines_per_chunk` sentences.

    Sentences are obtained by cutting on the literal separator '. '; the sentences
    of each chunk are joined back with the same separator.

    Args:
        text (str): The text to split.
        lines_per_chunk (int): Maximum number of sentences per chunk.
    Returns:
        list: The chunks, in their original order.
    """
    sentences = text.split('. ')
    chunks = []
    for start in range(0, len(sentences), lines_per_chunk):
        window = sentences[start:start + lines_per_chunk]
        chunks.append('. '.join(window))
    return chunks


def process_report_bartbaseline(report):
    """
    Summarises the selected sections of one report with the baseline model.

    Each non-empty section is cut into chunks with split_text, every chunk is encoded
    with process_input and summarised with generate_summary (using the notebook-level
    `tokenizer` and `model`), and the chunk summaries are joined with a single space.
    Sections that are missing or empty map to an empty string.

    Args:
        report (dict): Section name -> section text.
    Returns:
        dict: Section name -> summary, for every selected section, in the fixed
        order below. The dictionary is also printed.
    """
    selected_sections = [
        'Business Overview',
        'Results of Operations',
        'Revenues',
        'Gross Profit Margin',
        'Operating Income',
        'Interest Expense',
        'Liquidity',
        'Debt'
    ]

    output = {}
    for section in selected_sections:
        content = report.get(section, '')
        if not content:
            # Nothing to summarise for this section
            output[section] = ''
            continue
        chunk_summaries = [
            generate_summary(process_input(tokenizer, chunk), model)
            for chunk in split_text(content)
        ]
        output[section] = ' '.join(chunk_summaries)

    filtered_output = {section: output[section] for section in selected_sections}
    print(filtered_output)
    return filtered_output

In [None]:

# Apply the function to each row in the 'report' column and save the result in a new column 'output_baseline'
# NOTE(review): `new_filtered_df` is not defined in any cell shown above; confirm where it is created.
new_filtered_df['output_baseline'] = new_filtered_df['report'].apply(process_report_bartbaseline)

In [None]:
# Flatten each baseline summary dictionary into one text blob: "<key>: <value>" paragraphs,
# each followed by a blank line; sections whose summary is the empty string are skipped
new_filtered_df['output_baseline_report'] = ""

for index, output in new_filtered_df['output_baseline'].items():
    paragraphs = [key + ": " + value + "\n\n" for key, value in output.items() if value != ""]
    new_filtered_df.at[index, 'output_baseline_report'] = "".join(paragraphs)

In [None]:
# Length of each baseline report in whitespace-separated words
new_filtered_df['baseline_output_length'] = new_filtered_df['output_baseline_report'].map(
    lambda text: len(text.split())
)

In [None]:

# Score the baseline reports: compares new_filtered_df['output_baseline_report'] with 'label'
rouge = Rouge()
scorer = BERTScorer(lang="en")

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
# All three BERTScore lists start empty here, so this cell neither depends on an earlier
# scoring cell having defined them nor keeps appending to values from a previous run
bert_precision = []
bert_recall = []
bert_f1 = []

for idx, row in new_filtered_df.iterrows():
    label = row['label']
    report_output = row['output_baseline_report']

    # Calculate Rouge scores (the 'f' entry of each averaged metric is kept)
    rouge_scores = rouge.get_scores(report_output, label, avg=True)
    rouge1_scores.append(rouge_scores['rouge-1']['f'])
    rouge2_scores.append(rouge_scores['rouge-2']['f'])
    rougeL_scores.append(rouge_scores['rouge-l']['f'])

    # Calculate BERTScores for this single candidate/reference pair
    P, R, F1 = scorer.score([report_output], [label])
    bert_precision.append(P.item())
    bert_recall.append(R.item())
    bert_f1.append(F1.item())

# Add the scores to the DataFrame (precision and recall are collected but not stored)
new_filtered_df['rouge1_baseline'] = rouge1_scores
new_filtered_df['rouge2_baseline'] = rouge2_scores
new_filtered_df['rougeL_baseline'] = rougeL_scores
new_filtered_df['bertscore_f1_baseline'] = bert_f1

In [None]:

# create a folder named "outputs" (no error if it already exists)
os.makedirs("outputs", exist_ok=True)

# loop through each row in the dataframe
for index, row in new_filtered_df.iterrows():
    # create the filename for the text file
    filename = f"outputs/{row['id']}_baseline_model.txt"
    # open the file in write mode; UTF-8 is set explicitly so non-ASCII characters in the
    # report do not depend on the platform's default encoding
    with open(filename, "w", encoding="utf-8") as f:
        # write the baseline report string to the file
        f.write(row['output_baseline_report'])

In [None]:
# Column-wise mean of the four baseline metrics
mean_scores = new_filtered_df[
    ['rouge1_baseline', 'rouge2_baseline', 'rougeL_baseline', 'bertscore_f1_baseline']
].mean()

print("Mean Scores:")
# One line per metric: heading followed by its mean
for heading, column in (
    ("ROUGE-1: ", 'rouge1_baseline'),
    ("ROUGE-2: ", 'rouge2_baseline'),
    ("ROUGE-L: ", 'rougeL_baseline'),
    ("BERTScore F1: ", 'bertscore_f1_baseline'),
):
    print(heading, mean_scores[column])