In [None]:
# Libraries
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch
from logging import raiseExceptions

In [None]:
# Pulling Data in from Google Sheets
TRAIN_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSTmtBa7FYVX9Y7ALY7lnsQ9j4A3AeeqPRANZqscbNfhU2wtbbCinlHkLOatGZEcscZTcsRdJLHYY17/pubhtml"
TEST_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSObPeBI-ln_xscvnLARzH11ueaT_YsxPCbYVJF2e1MvmFil7Aq4fbC2eI6u3f0S3xe13VyhmUU1dOi/pubhtml"


def fetch_sheet_rows(url, timeout=30):
    """Download a published sheet page and return the text of its table cells.

    Args:
        url: address of the published (pubhtml) spreadsheet page.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list with one entry per <tr> found in any <table> on the page; each
        entry is the list of texts of that row's <td> cells (rows without <td>
        cells yield an empty list).

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.Timeout: the server did not answer within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were data.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    return [
        [cell.text for cell in row.find_all("td")]
        for table in soup.find_all("table")
        for row in table.find_all("tr")
    ]


train = fetch_sheet_rows(TRAIN_URL)
test = fetch_sheet_rows(TEST_URL)

In [None]:
# Value used in place of a blank ('') cell.
_MISSING_VALUE = -1

# Number of leading feature cells that get cleaned in every row.
_FEATURE_COUNT = 7

# Feature positions that hold Yes/No answers, and how each answer is encoded.
_YES_NO_COLUMNS = (1, 4)
_YES_NO_CODES = {'Yes': 1, 'No': 0, '': _MISSING_VALUE}


def _clean_feature_row(features):
  """Encode the first seven cells of one feature row in place and return it."""
  for col in range(_FEATURE_COUNT):
    value = features[col]
    if col in _YES_NO_COLUMNS:
      if value not in _YES_NO_CODES:
        raise ValueError(
            f"Unexpected value {value!r} in Yes/No column {col}; "
            "expected 'Yes', 'No' or ''")
      features[col] = _YES_NO_CODES[value]
    elif value == '':
      features[col] = _MISSING_VALUE
  return features


def DataCleaner(data,isTraining):
  """Turn raw spreadsheet rows into feature rows (and labels for training data).

  The first cell of every row (the id) is dropped. For training data the next
  seven cells are the features and the cell at index 8 is the outcome; for
  other data every cell after the id is kept as a feature. In the first seven
  feature positions a blank cell ('') becomes -1, and positions 1 and 4 are
  encoded as 'Yes' -> 1, 'No' -> 0. All other cells are passed through
  unchanged, so the caller is responsible for the final numeric conversion.

  Args:
    data: sequence of rows, each a sequence of cell values.
    isTraining: truthy when the rows contain the outcome column.

  Returns:
    `(features, labels)` when `isTraining` is truthy, where labels are 1 for
    'Extrovert' and 0 for anything else; otherwise just `features`.

  Raises:
    ValueError: a Yes/No column holds something other than 'Yes', 'No' or ''.
    IndexError: a row has fewer cells than the layout above requires.
  """
  output_df = []
  y_output = []

  for row in data:
    if isTraining:
      # Slicing copies the cells, so the caller's rows are never modified.
      output_df.append(_clean_feature_row(row[1:8]))
      y_output.append(row[8])
    else:
      output_df.append(_clean_feature_row(row[1:]))

  if isTraining:
    # Changing the Outcome Variable to Binary
    y_output = [1 if label == 'Extrovert' else 0 for label in y_output]
    return output_df, y_output

  # Just returning output if not training data
  return output_df

In [None]:
# Dropping the label row from both datasets
train, test = train[1:], test[1:]

# Cleaning both datasets; only the training rows also yield outcome labels
train_x, train_y = DataCleaner(train, True)
test_x = DataCleaner(test, False)

# Converting the cleaned lists to float numpy arrays
train_x = np.asarray(train_x).astype(float)
train_y = np.asarray(train_y).astype(float)
test_x = np.asarray(test_x).astype(float)

In [None]:
class SimpleNeuralNet(nn.Module):
    """Fully connected classifier: input_size -> 128 -> 64 -> num_classes.

    Both hidden layers are followed by ReLU, dropout (p=0.5) is applied after
    the second hidden layer, and the output is log-probabilities over classes.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.dropout = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        """Return log_softmax scores computed along dim 1 of the input batch."""
        hidden = F.relu(self.fc1(x))
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        logits = self.fc3(hidden)
        return F.log_softmax(logits, dim=1)

In [None]:
# Initializing my Model, Criterion, Optimizer
model = SimpleNeuralNet(input_size=train_x.shape[1], num_classes=2)

# Class weights: inverse class frequency, rescaled so the weights sum to 1
class_counts = torch.bincount(torch.tensor(train_y, dtype=torch.long))
total_samples = len(train_y)
inverse_frequency = total_samples / class_counts
class_weights = inverse_frequency / inverse_frequency.sum()

# The model outputs log_softmax scores, so NLLLoss is the matching criterion
criterion = nn.NLLLoss(weight=class_weights.float())

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Feature tensor (float32) and label tensor for the training data
X_tensor = torch.tensor(train_x, dtype=torch.float32)
y_tensor = torch.tensor(train_y)

# Dataset and shuffled mini-batch loader
train_dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(train_dataset, batch_size=50, shuffle=True)

In [None]:
# Training loop
num_epochs = 1000

for epoch in range(num_epochs):
    model.train()  # training mode, so the dropout layer is active
    batch_losses = []

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        # The criterion needs integer class indices as targets
        loss = criterion(outputs, y_batch.long())
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Report the mean batch loss for this epoch
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

In [None]:
# Values with batch_size at 2:
# Epoch 1, Loss: 0.1983
# Epoch 2, Loss: 0.1780
# Epoch 3, Loss: 0.1790
# Epoch 4, Loss: 0.1742
# Epoch 5, Loss: 0.1731

# Values with batch_size at 10:
# Epoch 1, Loss: 0.2108
# Epoch 2, Loss: 0.1930
# Epoch 3, Loss: 0.1880
# Epoch 4, Loss: 0.1803
# Epoch 5, Loss: 0.1765

# Values with batch_size at 25:
# Epoch 1, Loss: 0.2134
# Epoch 2, Loss: 0.1904
# Epoch 3, Loss: 0.1844
# Epoch 4, Loss: 0.1822
# Epoch 5, Loss: 0.1797

# Values with batch_size at 50:
# Epoch 1, Loss: 0.2236
# Epoch 2, Loss: 0.1933
# Epoch 3, Loss: 0.1874
# Epoch 4, Loss: 0.1829
# Epoch 5, Loss: 0.1812

In [None]:
# Generating my predictions now
model.eval()  # evaluation mode, so the dropout layer is inactive

test_tensor = torch.tensor(test_x).float()

with torch.no_grad():
    outputs = model(test_tensor)

# Index of the highest-scoring class for every test row
predicted_classes = torch.argmax(outputs, dim=1)

# Decode with the same mapping used when the training labels were encoded:
# DataCleaner maps 'Extrovert' to 1 and everything else to 0, so class 1 must
# be written back as 'Extrovert' (the previous mapping was inverted).
predicted_classes = ['Extrovert' if pred == 1 else 'Introvert' for pred in predicted_classes.tolist()]

# Getting the ids for the predictions (first cell of every test row)
ids = [row[0] for row in test]

# Combining the arrays to make a dataset to write to my csv for my submission
combined = np.column_stack((ids, predicted_classes))

# Convert to Pandas
final = pd.DataFrame(combined, columns=['id', 'Personality'])
final = final.set_index('id')

# Final output
final.to_csv("submission.csv")