In [1]:
import sys
sys.path.append('..')

In [None]:
import pandas as pd
import numpy as np
import numpy.typing as npt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,label_binarize
from src.pipeline import preprocessor

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F


In [4]:
# Function to load and split the data
def load_data(path='../data/Tweets.csv', test_size=0.1, random_state=0):
    """Read the tweets CSV and return a stratified train/test split.

    Parameters
    ----------
    path : str
        CSV file to read. Defaults to the location this notebook has always used.
    test_size : float
        Fraction of the rows held out for the test split.
    random_state : int
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    X_train, y_train, X_test, y_test
        The feature frames have ``airline_sentiment`` and
        ``airline_sentiment_confidence`` removed. The targets are
        single-column DataFrames containing ``airline_sentiment``.
    """
    target_col = 'airline_sentiment'
    # Columns that must not reach the model as features: the target itself
    # and the confidence column that accompanies it.
    non_feature_cols = [target_col, 'airline_sentiment_confidence']

    df = pd.read_csv(path).drop(columns=['tweet_id'])

    # Stratify on the target so both splits keep the same class proportions.
    df_train, df_test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[[target_col]],
        random_state=random_state,
    )

    X_train = df_train.drop(columns=non_feature_cols)
    y_train = df_train[[target_col]]

    X_test = df_test.drop(columns=non_feature_cols)
    y_test = df_test[[target_col]]

    return X_train, y_train, X_test, y_test

In [5]:
# Split the raw data into train / test features and targets
X_train, y_train, X_test, y_test = load_data()

# Fit the preprocessing pipeline on the training split only, then apply the
# fitted pipeline to the test split.
train_features = preprocessor.fit_transform(X_train)
test_features = preprocessor.transform(X_test)

# The pipeline output exposes .toarray() (presumably a scipy sparse matrix);
# densify it so it can be wrapped in float32 tensors.
X_train_processed = torch.tensor(train_features.toarray(), dtype=torch.float32)
X_test_processed = torch.tensor(test_features.toarray(), dtype=torch.float32)

In [6]:
# Encode the targets in two forms.

# 1) One-hot float32 tensors (these are what the training call passes as `y`).
ohe = OneHotEncoder(sparse_output=False)
ohe.fit(y_train)
y_train_ohe = torch.tensor(ohe.transform(y_train), dtype=torch.float32)
y_test_ohe = torch.tensor(ohe.transform(y_test), dtype=torch.float32)

# 2) Integer class ids for the scikit-learn metric functions.
le = LabelEncoder()
le.fit(y_train.to_numpy().ravel())
y_train_labels = le.transform(y_train.to_numpy().ravel())
y_test_labels = le.transform(y_test.to_numpy().ravel())

In [7]:
class FNN(nn.Module):
    """Feed-forward classifier with two hidden layers of 64 ReLU units.

    Each hidden layer is followed by dropout (p=0.2).

    Output convention
    -----------------
    * training mode (``model.train()``): raw, unnormalised logits. This is
      what ``nn.CrossEntropyLoss`` expects, because that loss applies
      log-softmax internally. Feeding it already-softmaxed values squashes
      the gradients and slows or stalls learning.
    * eval mode (``model.eval()``): class probabilities (softmax over dim 1),
      so inference code that takes ``argmax`` or computes AUC keeps working
      unchanged.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    output_dim : int
        Number of classes.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # No Softmax layer here: normalisation is applied in forward() only
        # when the model is in eval mode (see class docstring).
        self.fnn = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(64, output_dim),
        )

    def forward(self, x):
        """Run a batch of shape (n_samples, input_dim) through the network.

        Non-tensor inputs (lists, numpy arrays) are converted to a float
        tensor first. Returns logits in training mode and probabilities in
        eval mode, both of shape (n_samples, output_dim).
        """
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, dtype=torch.float)
        logits = self.fnn(x)
        if self.training:
            return logits
        return torch.softmax(logits, dim=1)

In [8]:
def fit_neural_network(
        model, optimizer, criterion,
        X: torch.Tensor, y: torch.Tensor, epochs: int, batch_size: int,
        X_val=None, y_val=None, checkpoint_path: str = 'best_model.pt'):
    """Train ``model`` with mini-batch gradient descent and keep the best checkpoint.

    After every epoch the model is scored on a validation set. Whenever the
    accuracy improves on the best value seen so far, the whole model object is
    written to ``checkpoint_path`` with ``torch.save``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; it is switched between train and eval mode here.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    criterion : callable
        Loss called as ``criterion(model(X_batch), y_batch)``.
    X, y : torch.Tensor
        Training inputs and targets, indexed along dimension 0.
    epochs : int
        Number of passes over the training data.
    batch_size : int
        Number of samples per mini-batch (the last batch may be smaller).
    X_val, y_val : optional
        Validation inputs (tensor) and integer class labels (array-like).
        When omitted they fall back to the notebook-level
        ``X_test_processed`` / ``y_test_labels`` so existing calls keep
        working. NOTE(review): selecting the checkpoint on the test split
        makes the reported test metrics optimistic; prefer passing a
        separate validation split.
    checkpoint_path : str
        Where the best model is saved.

    Returns
    -------
    None
    """
    # Backward-compatible defaults: the original implementation read these
    # notebook globals directly.
    if X_val is None:
        X_val = X_test_processed
    if y_val is None:
        y_val = y_test_labels

    n_samples = X.size(0)
    max_acc = 0.0
    for epoch in range(epochs):
        model.train()
        print(f"epoch {epoch+1}/{epochs}")

        # Reshuffle the samples at the start of every epoch.
        permutation_idx = torch.randperm(n_samples)
        X_shuffled = X[permutation_idx]
        y_shuffled = y[permutation_idx]

        for start in range(0, n_samples, batch_size):
            X_batch = X_shuffled[start:start + batch_size]
            y_batch = y_shuffled[start:start + batch_size]

            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()

        # Validation is inference only: disable dropout and skip building
        # the autograd graph.
        model.eval()
        with torch.no_grad():
            y_pred = torch.argmax(model(X_val), dim=1).cpu().numpy()
        f1s = f1_score(y_val, y_pred, average='weighted')
        acc = accuracy_score(y_val, y_pred)
        print(f"f1: {f1s:.3f}\tacc: {acc:.3f}")

        if acc > max_acc:
            max_acc = acc
            # Saves the full pickled model (not just the state_dict); the
            # loading cell reads it back with weights_only=False.
            torch.save(model, checkpoint_path)

In [9]:
# Seed torch so weight initialisation, dropout masks and the per-epoch batch
# shuffling are reproducible across kernel restarts.
SEED = 0
LEARNING_RATE = 1e-4
torch.manual_seed(SEED)

# One output unit per class: the width of the one-hot encoded targets.
n_classes = y_train_ohe.shape[1]

model = FNN(X_train_processed.shape[1], n_classes)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train for 500 epochs using mini-batches of 128 samples.
fit_neural_network(
    model,
    optimizer,
    criterion,
    X_train_processed,
    y_train_ohe,
    epochs=500,
    batch_size=128,
)

In [None]:
# Restore the best checkpoint written by the training loop. The file holds a
# fully pickled model object, hence weights_only=False. Unpickling can execute
# arbitrary code, so only load checkpoint files you produced yourself.
model = torch.load("best_model.pt", weights_only=False)
model.eval()

# Inference only: no autograd graph is needed.
with torch.no_grad():
    y_pred = torch.argmax(model(X_test_processed), dim=1).cpu().numpy()

# A bare expression is displayed only when it is the last statement of a
# cell, so the accuracy used to be computed and thrown away. Print both.
print(f"accuracy: {accuracy_score(y_test_labels, y_pred):.4f}")
print(f"weighted f1: {f1_score(y_test_labels, y_pred, average='weighted'):.4f}")

In [None]:
# Confusion matrix of the test-set predictions
cm = confusion_matrix(y_test_labels, y_pred)

# Render it as an annotated seaborn heatmap, labelled with the class names
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_ylabel('Actual Labels')
ax.set_xlabel('Predicted Labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Make sure dropout is disabled even if this cell is run on its own, and skip
# autograd bookkeeping: a forward pass over the whole training set would
# otherwise keep every intermediate activation alive for no reason.
model.eval()
with torch.no_grad():
    # Predicting on the training data
    y_train_pred = torch.argmax(model(X_train_processed), dim=1).cpu().numpy()
    # Predicting on the test data
    y_test_pred = torch.argmax(model(X_test_processed), dim=1).cpu().numpy()

# Calculating accuracy on the training data
train_accuracy = accuracy_score(y_train_labels, y_train_pred)
print(f"Accuracy on training data: {train_accuracy:.4f}")

# Calculating accuracy on the test data
test_accuracy = accuracy_score(y_test_labels, y_test_pred)
print(f"Accuracy on test data: {test_accuracy:.4f}")

# Generating a classification report
print("\nClassification report for the test data:")
print(classification_report(y_test_labels, y_test_pred, target_names=le.classes_))

# Generating a confusion matrix
cm = confusion_matrix(y_test_labels, y_test_pred)

# Visualizing the confusion matrix using seaborn
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_,
            yticklabels=le.classes_)
plt.ylabel('Actual Labels')
plt.xlabel('Predicted Labels')
plt.title('Confusion Matrix')
plt.show()

# # Calculating AUC for multi-class classification
# y_test_prob = clf.predict_proba(X_test_processed)

# auc = roc_auc_score(y_test_processed, y_test_prob, multi_class='ovr')
# print(f"AUC (One-vs-Rest) on test data: {auc:.4f}")


# Area under the curve (AUC)

In [50]:
# Model outputs for the test set as a NumPy array, computed in eval mode and
# without gradient tracking.
model.eval()
with torch.no_grad():
    y_pred_probs = model(X_test_processed).cpu().numpy()

In [55]:


# Binarize the integer test labels into one indicator column per class id
num_classes = 3
y_test_onehot = label_binarize(y_test_labels, classes=list(range(num_classes)))

In [None]:


# ROC AUC over all classes, requested with weighted averaging
auc_ovr_weighted = roc_auc_score(
    y_test_onehot,
    y_pred_probs,
    average="weighted",
    multi_class="ovr",
)
print("Weighted One-vs-Rest AUC:", auc_ovr_weighted)

# Gini Coefficient

In [None]:
# Gini = 2 * AUC - 1
gini_ovr_weighted = auc_ovr_weighted * 2 - 1
print("Weighted One-vs-Rest Gini:", gini_ovr_weighted)

In [None]:
# average=None returns one AUC per class instead of a single aggregate
auc_per_class = roc_auc_score(
    y_test_onehot,
    y_pred_probs,
    multi_class="ovr",
    average=None,
)

# Element-wise Gini for every class: 2 * AUC - 1
gini_per_class = 2 * auc_per_class - 1

for i in range(len(auc_per_class)):
    auc_val = auc_per_class[i]
    gini_val = gini_per_class[i]
    print(f"Class {i}: AUC={auc_val:.3f}, Gini={gini_val:.3f}")