In [1]:
import sys
from pathlib import Path

sys.path.append(str(Path("..").resolve()))

from utils.document_classification import *
from langchain.schema import Document

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [2]:
def load_data(root_dir):
    documents = []

    root_dir = Path(root_dir)

    for label_dir in root_dir.iterdir():
        if not label_dir.is_dir():
            continue

        label = label_dir.name
        
        # load the file and get the document object
        for file_path in label_dir.rglob("*"):
            if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
                continue
            docs = load_file(str(file_path))

            for doc in docs:
                doc.metadata["label"] = label
                doc.metadata["source"] = str(file_path)
                documents.append(doc)

    return documents

In [3]:
loaded_data = load_data("data")
data_df = pd.DataFrame(
    {
        "page_content": [remove_redundant_space(format_string(process_single_quotes(doc.page_content))) for doc in loaded_data],
        "label": [doc.metadata["label"] for doc in loaded_data],
    }
)
data_df.head()

Skipping file due to load error: data/Research Paper/13_Understanding LSTM Networks -- colahs blog.pdf (Odd-length string)


Unnamed: 0,page_content,label
0,united states securities and exchange commissi...,SEC Filing
1,qualcomm incorporated form 10-q for the quarte...,SEC Filing
2,risk factors summary:our business is subject t...,SEC Filing
3,risks related to regulatory and legal challeng...,SEC Filing
4,part i.financial information item 1.condensed ...,SEC Filing


In [4]:
data_df.groupby("label").size()

label
Earnings Call Transcript    1185
News Article                2926
Press Release                 24
Research Paper              1265
SEC Filing                  4091
dtype: int64

In [5]:
# Split data into training and testing sets
train_x, test_x, train_y, test_y = train_test_split(
    data_df["page_content"], 
    data_df["label"], 
    test_size=0.2, 
    random_state=42
)

# Vectorize text data using TF-IDF
vectorizer = TfidfVectorizer(min_df = 5, max_df = .80, norm = 'l2', sublinear_tf = True, stop_words='english')
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

print("Training data shape:", train_x_vectors.shape)
print("Testing data shape:", test_x_vectors.shape)

# Train a logistic regression model
model = LogisticRegression(C = 0.5, penalty = 'l2', solver='liblinear')
model.fit(train_x_vectors, train_y)

# Evaluate model performance
train_accuracy = model.score(train_x_vectors, train_y)
test_accuracy = model.score(test_x_vectors, test_y)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

Training data shape: (7592, 27612)
Testing data shape: (1899, 27612)
Training Accuracy: 0.9934
Testing Accuracy: 0.9889


In [6]:
# Balanced error rate (BER) calculation
def balanced_error_rate(y_true, y_pred):
    from sklearn.metrics import confusion_matrix
    import numpy as np

    cm = confusion_matrix(y_true, y_pred)
    per_class_errors = []

    for i in range(len(cm)):
        tn = np.sum(cm) - np.sum(cm[i, :]) - np.sum(cm[:, i]) + cm[i, i]
        fp = np.sum(cm[:, i]) - cm[i, i]
        fn = np.sum(cm[i, :]) - cm[i, i]
        tp = cm[i, i]

        if (tp + fn) == 0:
            fnr = 0
        else:
            fnr = fn / (tp + fn)

        if (tn + fp) == 0:
            fpr = 0
        else:
            fpr = fp / (tn + fp)

        per_class_errors.append((fnr + fpr) / 2)

    ber = np.mean(per_class_errors)
    return ber

In [7]:
balanced_error_rate = balanced_error_rate(test_y, model.predict(test_x_vectors))
print(f"Balanced Error Rate (BER): {balanced_error_rate:.4f}")

Balanced Error Rate (BER): 0.1062


In [8]:
# Save the trained model and vectorizer
import joblib
joblib.dump(model, "Document_Classifier.joblib")
joblib.dump(vectorizer, "TFIDF_Vectorizer.joblib")

['TFIDF_Vectorizer.joblib']