In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

### helper functions
largely for plotting and calculating error metrics

In [2]:
def make_intercept(x):

    with_intercept = np.zeros((len(x), len(x[0])+1), dtype = x.dtype)
    with_intercept[:, 0] = 1
    with_intercept[:, 1:] = x

    return with_intercept

def plot_training(list1, name_of_list1, list2, name_of_list2, save_path):
    x_values = np.linspace(1, len(list1), len(list1))
    plt.plot(x_values, list1, label=name_of_list1)
    plt.plot(x_values, list2, label=name_of_list2)
    plt.legend()
    plt.savefig(save_path)
    plt.close()

def print_results(error_rates, reg, result_path, weights=None):
    """
    Prints and saves final metrics. Optionally prints top and bottom weights.
    Does not require vector_words.
    """
    import numpy as np

    # Write the results into the file
    with open(result_path, "w") as text_file:
        error_rate, false_negative, false_positive = error_rates

        # Print regularization and error rates to console
        print(f"Regularization = {reg}, error_rate, false_negative, false_positive: "
              f"{error_rate}, {false_negative}, {false_positive}")

        # Always write these to the file
        text_file.write(f"Regularization = {reg}, error_rate, false_negative, false_positive: "
                        f"{error_rate}, {false_negative}, {false_positive}\n")

        # Optionally print top/bottom weights if 'weights' is provided
        if weights is not None:
            # Sort indices of weights in ascending order
            max_indices = np.argsort(weights)

            print("Max values:")
            # If you consider weights[0] as 'bias', you can print it explicitly:
            print("  bias:", weights[0])

            # Print top 5 positive weights (excluding the 0th if it's bias)
            for i in range(1, 6):
                idx = max_indices[-i]
                print(f"  Index {idx}, weight {weights[idx]}")

            print("Min values:")
            for i in range(5):
                idx = max_indices[i]
                print(f"  Index {idx}, weight {weights[idx]}")

def plot_confusion_matrix(labels, predictions, save_path):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # Ensure directory exists

    #Ground truth ie correct labels
    true_classes = labels[:, 0] 

    # Compute and plot confusion matrix
    cm = confusion_matrix(true_classes, predictions)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues', values_format='d')
    plt.title("Confusion Matrix")
    plt.savefig(save_path)
    plt.close()

def plot_roc_curve(labels, soft_probs, save_path, csv_path, model_name):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # Ensure directory exists
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    
    # Extract ground truth and predicted probabilities
    true_classes = labels[:, 0]  # First column is '1' for positive class
    pred_prob_class1 = soft_probs[:, 0]  # Class 1 probabilities
    
    # Compute ROC and AUC
    fpr, tpr, _ = roc_curve(true_classes, pred_prob_class1)
    roc_auc = auc(fpr, tpr)
    
    # Save ROC values to CSV
    df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'model': model_name, 'auc': roc_auc})
    if os.path.exists(csv_path):
        df.to_csv(csv_path, mode='a', header=False, index=False)  # Append without headers
    else:
        df.to_csv(csv_path, mode='w', header=True, index=False)  # Create new file with headers
    
    # Plot ROC Curve
    plt.figure()
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})', linewidth=2)
    plt.plot([0, 1], [0, 1], 'r--')  # Random baseline
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig(save_path)
    plt.close()


### neural network

In [3]:
class Neural_Net:

    def __init__(self, input_dim, hidden_layer1_dim, hidden_layer2_dim, output_dim, regularization=0.5, lr=1e-3):
        self.reg = regularization
        self.start_lr = lr
        self.lr = lr
        self.weights = {}
        self.weights["layer_1"] = np.random.normal(size=(input_dim, hidden_layer1_dim))
        self.weights["bias_1"] = np.zeros(hidden_layer1_dim)
        self.weights["layer_2"] = np.random.normal(size=(hidden_layer1_dim, hidden_layer2_dim))
        self.weights["bias_2"] = np.zeros(hidden_layer2_dim)
        self.weights["layer_3"] = np.random.normal(size=(hidden_layer2_dim, output_dim))
        self.weights["bias_3"] = np.zeros(output_dim)

        print("self.weights[layer_1].shape", self.weights["layer_1"].shape)
        print("self.weights[bias_1].shape", self.weights["bias_1"].shape)
        print("self.weights[layer_2].shape", self.weights["layer_2"].shape)
        print("self.weights[bias_2].shape", self.weights["bias_2"].shape)
        print("self.weights[layer_3].shape", self.weights["layer_3"].shape)
        print("self.weights[bias_3].shape", self.weights["bias_3"].shape)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def softmax(self, x):
        max_value = np.max(x, axis=1, keepdims=True)
        exp_matrix = np.exp(np.subtract(x, max_value))  # subtract max for numerical stability
        sum_vector = np.sum(exp_matrix, axis=1, keepdims=True)
        softmax_x = exp_matrix / sum_vector
        return softmax_x
    
    def forward(self, x, labels):
        layer1 = np.matmul(x, self.weights["layer_1"]) + self.weights["bias_1"]
        sig1 = self.sigmoid(layer1)
        layer2 = np.matmul(sig1, self.weights["layer_2"]) + self.weights["bias_2"]
        sig2 = self.sigmoid(layer2)
        layer3 = np.matmul(sig2, self.weights["layer_3"]) + self.weights["bias_3"]
        soft3 = self.softmax(layer3)
        loss_list = [np.log(np.matmul(soft3[i], label) + 1e-10) for i, label in enumerate(labels)]
        loss = -np.mean(loss_list)
        return sig1, sig2, soft3, loss
    
    def backward(self, x, labels):
        sig1, sig2, soft3, _ = self.forward(x, labels)
        batch_lenth = len(labels)

        dO = soft3 - labels
        dB3 = np.sum(dO, axis=0) / batch_lenth
        dL3 = np.matmul(sig2.T, dO) / batch_lenth + 2 * self.reg * self.weights["layer_3"]

        dsig2 = sig2 * (1 - sig2) * np.matmul(dO, self.weights["layer_3"].T)
        dB2 = np.sum(dsig2, axis=0) / batch_lenth
        dL2 = np.matmul(sig1.T, dsig2) / batch_lenth + 2 * self.reg * self.weights["layer_2"]

        dsig = sig1 * (1 - sig1) * np.matmul(dsig2, self.weights["layer_2"].T)
        dB1 = np.sum(dsig, axis=0) / batch_lenth
        dL1 = np.matmul(x.T, dsig) / batch_lenth + 2 * self.reg * self.weights["layer_1"]
        
        return dL1, dB1, dL2, dB2, dL3, dB3
    
    def batch_gradient_descent(self, x, labels, batch_dim, epoch):
        indices = np.random.permutation(np.arange(len(labels)))
        for i in range(0, len(x), batch_dim):
            batch_indices = indices[i:i + batch_dim]
            data_batch = x[batch_indices]
            label_batch = labels[batch_indices]
            dL1, dB1, dL2, dB2, dL3, dB3 = self.backward(data_batch, label_batch)

            self.weights["layer_1"] -= dL1 * self.lr
            self.weights["bias_1"] -= dB1 * self.lr
            self.weights["layer_2"] -= dL2 * self.lr
            self.weights["bias_2"] -= dB2 * self.lr
            self.weights["layer_3"] -= dL3 * self.lr
            self.weights["bias_3"] -= dB3 * self.lr

    def train(self, train_x, train_labels, val_x, val_labels, total_epochs, batch_dim=400):
        if len(train_x) % batch_dim != 0:
            raise ValueError("dataset must be divisible by batch_dim")

        train_loss, train_error_rates = [None], [[None, None, None]]
        val_loss, val_error_rates = [None], [[None, None, None]]
        
        for epoch in range(total_epochs):
            self.batch_gradient_descent(train_x, train_labels, batch_dim, epoch)

            _, _, soft3, train_loss1 = self.forward(train_x, train_labels)
            train_loss.append(train_loss1)

            pred_list_train, train_e = self.predict(soft3, train_labels)
            train_error_rates.append(train_e)

            _, _, soft3, val_loss1 = self.forward(val_x, val_labels)
            val_loss.append(val_loss1)
            pred_list, val_e = self.predict(soft3, val_labels)
            val_error_rates.append(val_e)

            self.lr = self.start_lr * 1 / (epoch / 100 + 1)

        return (np.array(val_loss[1:]), np.array(train_loss[1:]), 
                np.array(val_error_rates[1:]), np.array(train_error_rates[1:]))
    
    def predict(self, soft2, labels):
        prediction_list = [0 if soft2[i][0] < 0.5 else 1 for i in range(len(labels))]
        errorlist = [
            [0, 0, 0] if labels[i][0] == [prediction_list[i]] else [1, labels[i][0], 1 - labels[i][0]]
            for i in range(len(labels))
        ]
        return prediction_list, np.sum(errorlist, axis=0) / len(errorlist)



### read embeddings
Run appropriate cell based on which dataset we're running for 

In [20]:
folder_name = "alldata/note"

#### 1. Pre-trained Bert hierarchical full text embeddings

In [21]:
suffix = "BertFullTextHier"
file_name = "semantic_BertHier_FullText.csv"
file_path = os.path.join(os.getcwd(), folder_name, file_name)
df = pd.read_csv(file_path)

#### 2. Finetuned Bert full text embeddings (Cross Entropy Loss)

In [10]:
suffix = "10krows_BertFinetuned1"
file_name = "fineTuned_semantic_Bert_FullText.csv"
file_path = os.path.join(os.getcwd(), folder_name, file_name)
df = pd.read_csv(file_path)
df.drop(df.columns[-1], axis=1, inplace=True)

#### 3. Finetuned Bert full text embeddings (Cosine Similarity Loss)

In [13]:
suffix = "10krows_BertFinetuned2"
file_name = "fineTuned_semantic_Bert_Cosine.csv"
file_path = os.path.join(os.getcwd(), folder_name, file_name)
df = pd.read_csv(file_path)
df.drop(df.columns[-1], axis=1, inplace=True)

Sample and ensure balanced dataset

In [14]:
# Sample 5000 rows from each class
df0 = df[df['Y'] == 0].sample(n=5000, random_state=42)
df1 = df[df['Y'] == 1].sample(n=5000, random_state=42)
sampled_df = pd.concat([df0, df1]).reset_index(drop=True)
print(sampled_df['Y'].value_counts())

Y
0    5000
1    5000
Name: count, dtype: int64


In [15]:
df = sampled_df
len(df)

10000

#### 4. OPENAI 500 rows full text embedding

In [120]:
suffix = "OpenAIfulltext"
file_name = "OPENAI_merged_500embedding.csv"
file_path = os.path.join(os.getcwd(), folder_name, file_name)
df = pd.read_csv(file_path)

#### 5. OPENAI 500 rows summary embedding

In [125]:
suffix = "OpenAIsummary"
file_name = "OPENAI_merged_500_summary_embedding.csv"
file_path = os.path.join(os.getcwd(), folder_name, file_name)
df = pd.read_csv(file_path)

In [22]:
df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,Y
0,0.186606,0.085287,-0.136578,0.114657,0.078886,-0.039044,0.111851,0.070599,0.084644,-0.01935,...,0.015664,-0.259481,0.042405,0.033597,0.030835,0.149914,0.223261,0.122565,-0.01984,1
1,0.206593,0.026372,0.024923,0.170108,0.071886,-0.061216,0.113776,0.092842,0.062146,0.004418,...,-0.015256,-0.130282,-0.003898,0.104951,-0.032151,0.072405,0.173873,0.109439,-0.069683,1


#### appropriately update file structure to save in corresponding folders

In [23]:
#Create folder names based on suffix
results_folder = f"results_{suffix}"
saved_weights_folder = f"saved_weights_{suffix}"

#Ensure directories exist
os.makedirs(results_folder, exist_ok=True)
os.makedirs(saved_weights_folder, exist_ok=True)

In [24]:
# The last column is Y, so let's separate features (all but last column) and labels (last column)
data = df.iloc[:, :-1].values
labels_raw = df.iloc[:, -1].values  # shape (n_samples,)

# Convert each label into a 2D form: [label, 1 - label]
labels = np.array([[lbl, 1 - lbl] for lbl in labels_raw])

### train model
- Saved weights parameter -> use/ not use cached weights from prior run
- batch_dim depends on dataset size
- openAI datasets don't converge with learning rates 0.5, 0.1

In [25]:
# We know our feature dimension is simply the number of columns (except Y)
feature_size = data.shape[1]

# Hyperparameters
hidden_layer1_dim = 50
hidden_layer2_dim = 50
learning_rate = 0.01
regularization_terms = [0.0025] 
max_epoch = 1000
batch_dim = 400  #fulltext hier Bert or any dataset with 10k rows
#batch_dim = 1

saved_parameters = False  # or False
k_folds = 5  # Number of folds for cross-validation

for reg in regularization_terms:
    NN = Neural_Net(feature_size, hidden_layer1_dim, hidden_layer2_dim, 2, reg, learning_rate)

    if saved_parameters:
        
        #Load the *full checkpoint* (weights + logs)
        loaded_dict = np.load(f"{saved_weights_folder}/checkpoint_{reg}.npy", allow_pickle=True).item()
        NN.weights = loaded_dict["weights"]
        val_loss = loaded_dict["val_loss"]
        train_loss = loaded_dict["train_loss"]
        val_error_rates = loaded_dict["val_error_rates"]
        train_error_rates = loaded_dict["train_error_rates"]

        #Evaluate on validation set
        val_set, val_labels = data, labels  # Use full dataset for evaluation
        _, _, soft3_val, val_loss_val = NN.forward(val_set, val_labels)
        pred_list_val, val_e = NN.predict(soft3_val, val_labels)

    else:

        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        X, Y = np.array(data), np.array(labels)

        # Store performance across folds
        all_val_losses, all_train_losses = [], []
        all_val_errors, all_train_errors = [], []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
            print(f"Fold {fold+1}/{k_folds}")

            # Split data into train and validation for this fold
            X_train, Y_train = X[train_idx], Y[train_idx]
            X_val, Y_val = X[val_idx], Y[val_idx]
            #batch_dim = len(X_train)

            # Train on the fold
            val_loss, train_loss, val_error_rates, train_error_rates = NN.train(
                X_train, Y_train,
                X_val, Y_val,
                max_epoch, batch_dim
            )

            # Store results from this fold
            all_val_losses.append(val_loss)
            all_train_losses.append(train_loss)
            all_val_errors.append(val_error_rates)
            all_train_errors.append(train_error_rates)

            # Ensure we have a final validation set after cross-validation
            val_set, val_labels = X_val, Y_val  # Save last fold's val set

        #Average results across folds
        val_loss = np.mean(all_val_losses, axis=0)
        train_loss = np.mean(all_train_losses, axis=0)
        val_error_rates = np.mean(all_val_errors, axis=0)
        train_error_rates = np.mean(all_train_errors, axis=0)

        #Evaluate on validation set after all folds
        _, _, soft3_val, _ = NN.forward(val_set, val_labels)
        pred_list_val, _ = NN.predict(soft3_val, val_labels)

        #Save trained weights & logs
        np.save(f"{saved_weights_folder}/weights_{reg}.npy", NN.weights)

        save_dict = {
            "weights": NN.weights,
            "val_loss": val_loss,
            "train_loss": train_loss,
            "val_error_rates": val_error_rates,
            "train_error_rates": train_error_rates
        }
        np.save(f"{saved_weights_folder}/checkpoint_{reg}.npy", save_dict)

    #Shared code: plots & confusion matrix & ROC
    plot_training(
        val_loss, "val_loss",
        train_loss, "train_loss",
        f"{results_folder}/NeuralNet_reg{reg}_loss.png"
    )

    plot_training(
        val_error_rates[:, 0], "val_error",
        train_error_rates[:, 0], "train_error",
        f"{results_folder}/NeuralNet_reg{reg}_errors.png"
    )

    plot_confusion_matrix(
        val_labels,
        pred_list_val,
        f"{results_folder}/NeuralNet_reg{reg}_confusion.png"
    )

    plot_roc_curve(
        val_labels,
        soft3_val,
        f"{results_folder}/NeuralNet_reg{reg}_roc.png",
        f"{results_folder}/NeuralNet_reg{reg}_roc.csv",
        suffix,
    )

    # Print final validation error rates
    error_rate, false_negative, false_positive = val_error_rates[-1]
    print(f"Regularization: {reg}")
    print(f"Error Rate: {error_rate:.3f}")
    print(f"False Negative: {false_negative:.3f}")
    print(f"False Positive: {false_positive:.3f}")

    # Write results to a txt file
    with open(f"{results_folder}/NeuralNet_reg{reg}.txt", "w") as text_file:
        text_file.write(f"Regularization: {reg}\n")
        text_file.write(f"Error Rate: {error_rate:.3f}\n")
        text_file.write(f"False Negative: {false_negative:.3f}\n")
        text_file.write(f"False Positive: {false_positive:.3f}\n")


self.weights[layer_1].shape (768, 50)
self.weights[bias_1].shape (50,)
self.weights[layer_2].shape (50, 50)
self.weights[bias_2].shape (50,)
self.weights[layer_3].shape (50, 2)
self.weights[bias_3].shape (2,)
Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
Regularization: 0.0025
Error Rate: 0.366
False Negative: 0.123
False Positive: 0.244
