In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
import ast

In [None]:
# Load the risk-matrix dataset from Drive and display its (rows, columns) shape.
df = pd.read_parquet('/content/drive/MyDrive/DeepLearningProject/risk_matrix_clean_final.parquet')
df.shape

(488, 6)

In [None]:
# Preview the first five rows of the loaded frame.
df.head()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Some rows hold placeholder text such as '' or '.' in item1a instead of real
# content; those entries have to be removed before the text can be used.
df.tail()

Unnamed: 0,ticker,year,item1a,item7,narrative,risk_score
483,NBTB,2025,of this Form 10-K.,,Legal & Regulatory,1
484,PCG,2025,,,Legal & Regulatory,4
485,PFS,2025,.,,Legal & Regulatory,4
486,PPBI,2025,.,,Legal & Regulatory,4
487,WT,2025,”\nincluded in this Report.,,Legal & Regulatory,3


In [None]:
# Blank out text that is too short to carry any meaning
def clean_short_or_empty(text):
    """Return ``np.nan`` for strings that are empty, purely numeric or very short.

    The check is made on the whitespace-stripped string: it is rejected when it
    is empty, consists only of digits, or has fewer than 5 characters.

    Anything else is handed back exactly as received - an accepted string is
    returned unstripped, and non-string values (``None``, NaN, numbers, ...)
    pass through untouched.
    """
    if not isinstance(text, str):
        return text

    core = text.strip()
    is_meaningful = bool(core) and not core.isdigit() and len(core) >= 5
    return text if is_meaningful else np.nan

# Replace meaningless item1a entries with NaN, element by element.
# item7 is left untouched: its cleaning line below is disabled.
df['item1a'] = df['item1a'].apply(clean_short_or_empty)
# df['item7']  = df['item7'].apply(clean_short_or_empty)

In [None]:
# Count how many item1a entries are missing after the cleaning step.
df.item1a.isna().sum()

np.int64(59)

In [None]:
# Replace None entries with np.nan across the whole frame so that missing values
# use a single representation.
df = df.replace({None: np.nan})

In [None]:
# Rows in which both item1a and item7 are missing.
nan_values = df[(df.item1a.isna()) & (df.item7.isna())]

In [None]:
# Inspect the rows that have no text in either column.
nan_values

Unnamed: 0,ticker,year,item1a,item7,narrative,risk_score
24,AGCO,2025,,,Legal & Regulatory,4
280,FCN,2025,,,Legal & Regulatory,4
377,HRMY,2025,,,Legal & Regulatory,4
433,MA,2025,,,Legal & Regulatory,4
474,MTSI,2024,,,Legal & Regulatory,4
484,PCG,2025,,,Legal & Regulatory,4
485,PFS,2025,,,Legal & Regulatory,4
486,PPBI,2025,,,Legal & Regulatory,4


In [None]:
# Remove every row that has neither item1a nor item7 text, i.e. the rows
# collected in `nan_values`, identified by their index labels.
df = df.drop(index=nan_values.index)

In [None]:
# Save the cleaned frame to Drive as parquet; index=False keeps the row index out of the file.
df.to_parquet('/content/drive/MyDrive/DeepLearningProject/risk_matrix_clean_final_cleaned.parquet', index=False)

In [None]:
# Shape after dropping the rows without any text.
df.shape

(480, 6)

In [None]:
# Distribution of risk_score values; the counts reveal the class imbalance in
# the extracted dataset.
df.risk_score.value_counts()

Unnamed: 0_level_0,count
risk_score,Unnamed: 1_level_1
4,303
5,145
2,20
3,9
1,3


## Field Preprocessing

In [None]:
# Reload the cleaned dataset that was written to Drive earlier in the notebook.
df = pd.read_parquet('/content/drive/MyDrive/DeepLearningProject/risk_matrix_clean_final_cleaned.parquet')

In [None]:
# Combine item1a and item7, the two natural-language-heavy columns, into a
# single `text` column.
# Missing values are replaced by "" before joining: adding the columns with a
# plain `+` yields NaN for every row in which either column is missing, which
# would discard the text that is present. The final strip removes the
# separator left dangling when one side is empty.
df['text'] = (
    df['item1a'].fillna("")
    .str.cat(df['item7'].fillna(""), sep=" ")
    .str.strip()
)

In [None]:
# Keep an independent snapshot of the frame at this stage.
df_copy = df.copy()

In [None]:
# Rebuild the working frame from the snapshot (presumably so the cells below can
# be re-run from a known state without reloading from Drive).
df = df_copy.copy()

In [None]:
# Zero-based class label: risk scores 1..5 become labels 0..4.
df['label'] = df['risk_score'].astype(int).sub(1)

In [None]:
# Look at the first ten narrative categories before one-hot encoding the column.
df['narrative'].head(10).tolist()

['Legal & Regulatory',
 'Financial Distress',
 'Financial Distress',
 'Financial Distress',
 'Legal & Regulatory',
 'Legal & Regulatory',
 'Legal & Regulatory',
 'Financial Distress',
 'Supply Chain',
 'Financial Distress']

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical `narrative` column into dense columns named
# "narrative_<category>".
encoder = OneHotEncoder(sparse_output=False)
narr_encoded = encoder.fit_transform(df[['narrative']])
narr_columns = list(encoder.get_feature_names_out(['narrative']))

# Create DataFrame from encoded values (it gets a fresh positional index)
narr_encoded_df = pd.DataFrame(narr_encoded, columns=narr_columns)

# Drop one-hot columns left over from an earlier run of this cell, so that
# re-running it replaces them instead of appending duplicate columns.
df = df.drop(columns=narr_columns, errors="ignore")

# Add to original dataframe; resetting the index makes both frames line up
# row by row on the same positional index.
df = pd.concat([df.reset_index(drop=True), narr_encoded_df], axis=1)


In [None]:
# Build the single free-text column used for further processing: item1a, one
# space, then item7. A missing item1a contributes an empty string and a missing
# item7 a single space, so `text` is a string in every row.
df["text"] = df["item1a"].fillna("") + " " + df["item7"].fillna(" ")

In [None]:
# Preview the frame with the label, one-hot and combined text columns added.
df.head()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# The raw item1a / item7 columns are no longer needed once `text` exists.
df = df.drop(columns=['item1a', 'item7'])

In [None]:
# Store the final preprocessed frame on Drive as parquet (row index not written).
df.to_parquet('/content/drive/MyDrive/DeepLearningProject/risk_matrix_preprocessed.parquet', index=False)

## Text embedding using FinBERT

In [None]:
# Mount Google Drive again at the start of this section; when it is already
# mounted the call only reports that and changes nothing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
import ast
import os
from transformers import AutoTokenizer
from tqdm import tqdm
import torch
import gc

In [None]:
# Load the preprocessed dataset (label, one-hot narrative columns and combined text).
df = pd.read_parquet('/content/drive/MyDrive/DeepLearningProject/risk_matrix_preprocessed.parquet')

In [None]:
# Directory on Drive that receives the per-batch tokenizer outputs; it is
# created when missing and left alone when it already exists.
save_path = '/content/drive/MyDrive/DeepLearningProject/Fibert_Batches'
os.makedirs(save_path, exist_ok=True)

In [None]:
# Tokenise the combined text with the FinBERT tokenizer, four rows at a time,
# and write each chunk to disk straight away so that the tensors for the whole
# dataset never have to sit in RAM together.
# Two files are produced per chunk: the token ids and the matching attention
# mask. Both are named after the index of the chunk's first row.

tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-pretrain")
texts = df["text"].fillna("").tolist()
batch_size = 4

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i + batch_size]

    # Every row is cut or padded to exactly 512 tokens and returned as PyTorch tensors.
    encoded = tokenizer(
        batch_texts,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # Token ids and attention mask go into separate files.
    torch.save(encoded["input_ids"], f"{save_path}/input_ids_batch_{i}.pt")
    torch.save(encoded["attention_mask"], f"{save_path}/attention_mask_batch_{i}.pt")

    # Release the chunk before the next one is tokenised.
    del encoded
    gc.collect()
    torch.cuda.empty_cache()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

 39%|███▉      | 47/120 [08:39<23:39, 19.44s/it]

In [None]:
# Resume the tokenisation that stopped part-way through the previous cell.
# The remaining rows are processed one at a time (process_batch = 1) to keep
# RAM usage as low as possible, so every file written here holds a single row
# and is named after that row's index.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-pretrain")
texts = df["text"].fillna("").tolist()
batch_size = 4       # chunk size used by the interrupted run
process_batch = 1    # rows per file from here on

# The interrupted run's progress bar stopped at 47 finished chunks, i.e. chunks
# 0..46 (rows 0..187). Chunk 47 (rows 188..191) was still in flight, so resume
# from it: starting at chunk 48 would leave those four rows without a complete
# pair of files. Re-doing chunk 47 is harmless even if it had been written.
start_batch = 47
start_index = start_batch * batch_size

for i in tqdm(range(start_index, len(texts), process_batch)):
    batch_texts = texts[i:i + process_batch]

    # Same tokenizer settings as the interrupted run: cut or pad to 512 tokens.
    encoded = tokenizer(
        batch_texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    # Save input_ids and attention_mask separately
    torch.save(encoded["input_ids"], f"{save_path}/input_ids_batch_{i}.pt")
    torch.save(encoded["attention_mask"], f"{save_path}/attention_mask_batch_{i}.pt")

    # Clear memory
    del encoded
    gc.collect()
    torch.cuda.empty_cache()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
100%|██████████| 288/288 [10:58<00:00,  2.29s/it]
