# Preprocessing

In [1]:
import pandas as pd
import torch
import re
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Load the train and test splits from CSV files given as paths relative to the
# working directory.
train_path = 'train.csv'
test_path = 'test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Print the (rows, columns) shape of each frame as a quick sanity check on the load.
print("Train Dataset:", train_data.shape)
print("Test Dataset:", test_data.shape)

# Clean the text
def clean_text(text):
    """Normalize a raw comment string for whitespace tokenization.

    The text is lowercased, every character that is neither an ASCII letter
    nor whitespace is deleted (deleted, not replaced by a space, so "don't"
    becomes "dont"), each run of whitespace is collapsed to a single space,
    and leading/trailing whitespace is removed.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned string; empty if no letters survive.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    collapsed = re.sub(r'\s+', ' ', letters_and_spaces)
    return collapsed.strip()

# Replace missing comments with '' so clean_text always receives a str, then
# overwrite the 'text' column of both frames with the cleaned text.
train_data['text'] = train_data['text'].fillna('').apply(clean_text)
test_data['text'] = test_data['text'].fillna('').apply(clean_text)

# Build vocabulary and tokenize
def build_vocab(sentences, max_vocab_size=50000):
    """Build a token -> index mapping from whitespace-tokenized sentences.

    Tokens are ranked by frequency; the ``max_vocab_size`` most common ones
    receive indices 1, 2, 3, ... in that order. Index 0 is reserved for the
    special ``'<PAD>'`` entry, which is added last.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.
        max_vocab_size: Maximum number of real tokens to keep (the ``'<PAD>'``
            entry is not counted against this limit).

    Returns:
        A dict mapping each kept token to its index, plus ``'<PAD>'`` -> 0.
    """
    token_counts = Counter(
        token for sentence in sentences for token in sentence.split()
    )
    vocab = {}
    for rank, (token, _count) in enumerate(token_counts.most_common(max_vocab_size), start=1):
        vocab[token] = rank
    vocab['<PAD>'] = 0
    return vocab

# Build the vocabulary from the training text only (test_data is not consulted),
# keeping at most 50000 distinct tokens.
vocab = build_vocab(train_data['text'], max_vocab_size=50000)

def text_to_sequence(text, vocab):
    """Encode a string as a list of vocabulary indices.

    The text is split on whitespace and each token is looked up in ``vocab``.
    Tokens missing from ``vocab`` are dropped silently, so the result may be
    shorter than the number of tokens (and may be empty).

    Args:
        text: The string to encode.
        vocab: Mapping from token to index.

    Returns:
        List of indices, in the order the known tokens appear in ``text``.
    """
    token_ids = []
    for token in text.split():
        if token in vocab:
            token_ids.append(vocab[token])
    return token_ids

# Encode the cleaned text of both splits as lists of vocabulary indices.
train_data['sequence'] = train_data['text'].apply(lambda x: text_to_sequence(x, vocab))
test_data['sequence'] = test_data['text'].apply(lambda x: text_to_sequence(x, vocab))

# Labels for multilabel classification: one target column per entry in `labels`.
labels = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']
y = train_data[labels].values
# Binarize: a label is positive only when its value is strictly greater than 0.5
# (a value of exactly 0.5 maps to 0).
# NOTE(review): confirm that strict ">" rather than ">=" is the intended cut-off.
y = (y > 0.5).astype(int)

# Train-validation split: hold out 20% of the rows, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(train_data['sequence'], y, test_size=0.2, random_state=42)

# Dataset class
class CommentDataset(Dataset):
    """Dataset pairing token-index sequences with multi-label targets.

    Every sequence is truncated to ``max_len`` tokens and converted to a
    ``torch.long`` tensor once, at construction time. The labels are held in
    a single ``torch.float32`` tensor. Sequences keep their own (variable)
    lengths; no padding happens here.
    """

    def __init__(self, sequences, labels, max_len=512):
        """Convert the inputs to tensors.

        Args:
            sequences: Iterable of token-index sequences (sliceable, e.g. lists).
            labels: Array-like of targets, one row per sequence.
            max_len: Maximum number of tokens kept from the start of each sequence.
        """
        encoded = []
        for token_ids in sequences:
            encoded.append(torch.tensor(token_ids[:max_len], dtype=torch.long))
        self.sequences = encoded
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` tensor pair stored at position ``idx``."""
        sequence = self.sequences[idx]
        target = self.labels[idx]
        return sequence, target

def collate_fn(batch):
    """Collate ``(sequence, label)`` pairs into padded batch tensors.

    Sequences are right-padded with 0 to the length of the longest sequence
    in the batch and returned batch-first as ``torch.long``; the label
    tensors are stacked along a new leading dimension.

    Args:
        batch: List of ``(sequence_tensor, label_tensor)`` pairs.

    Returns:
        Tuple ``(sequences, labels)`` of batched tensors.
    """
    token_seqs, targets = zip(*batch)
    padded = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    return padded.long(), torch.stack(targets)

# Number of comments per batch, shared by the training and validation loaders.
batch_size = 64

# Wrap the train/validation splits; CommentDataset uses its default max_len here.
train_dataset = CommentDataset(X_train, y_train)
val_dataset = CommentDataset(X_val, y_val)

# Only the training loader shuffles; collate_fn pads each batch to its longest sequence.
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Number of vocabulary entries, including the '<PAD>' entry added by build_vocab.
vocab_size = len(vocab)

Train Dataset: (1804874, 9)
Test Dataset: (97320, 2)


# Model

In [2]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

LSTM with a convolutional front end (Conv1d and average pooling applied before the LSTM)

In [3]:
### Model tests

class LSTMModel(nn.Module):
    """Conv1d + bidirectional LSTM classifier producing 7 logits per input.

    Pipeline: embedding -> Conv1d(kernel_size=3, no padding) -> ReLU ->
    AvgPool1d(2) -> 2-layer bidirectional LSTM -> three linear layers with
    ReLU and dropout(0.2) in between. The output is raw logits; no sigmoid
    is applied inside the model.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (Conv1d input channels).
        hidden_dim: Conv1d output channels and LSTM hidden size per direction.
    """

    # Shortest sequence the conv/pool stack can process. Conv1d with
    # kernel_size=3 and no padding maps length L to L - 2, and AvgPool1d(2)
    # maps that to floor((L - 2) / 2); the LSTM needs at least one time step,
    # hence L >= 4. Kept as a class attribute (not set in __init__) so that
    # instances restored by unpickling a whole saved model also see it.
    MIN_SEQ_LEN = 4

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(embedding_dim, hidden_dim, kernel_size = 3, padding="valid")
        self.avgpool = nn.AvgPool1d(2)
        self.lstm1 = nn.LSTM(hidden_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)

        # hidden_dim * 2: forward and backward final hidden states are concatenated.
        self.fc1 = nn.Linear(hidden_dim*2, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 7)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Compute logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices laid out as (batch, seq_len).
                Batches shorter than ``MIN_SEQ_LEN`` (including seq_len == 0)
                are right-padded with index 0 instead of raising inside the
                convolution/pooling layers.

        Returns:
            Float tensor of shape (batch, 7) holding one logit per label.
        """
        # Guard against batches that are too short for Conv1d(k=3) + AvgPool1d(2).
        deficit = self.MIN_SEQ_LEN - x.size(1)
        if deficit > 0:
            x = torch.cat([x, x.new_zeros(x.size(0), deficit)], dim=1)

        x = self.embedding(x)
        # Conv1d expects channels before time: (batch, emb, seq).
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = self.relu(x)

        x = self.avgpool(x)
        # Back to (batch, seq, channels) for the batch_first LSTM.
        x = x.permute(0, 2, 1)

        x, (ht, hc) = self.lstm1(x)

        # for bidirectional: the last two entries of ht are the top layer's
        # forward and backward final hidden states.
        hidden = torch.cat((ht[-2,:,:], ht[-1,:,:]), dim = 1)
        hidden = self.dropout(hidden)

        output = self.fc1(hidden)
        output = self.dropout(output)
        output = self.relu(output)
        output = self.fc2(output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.fc3(output)
        return output


# Training and Inference

In [None]:
def accuracy(predicted, actual):
    """Element-wise accuracy of thresholded logits against binary targets.

    Logits are passed through a sigmoid and thresholded at 0.5 (strictly
    greater counts as positive). The score is the fraction of individual
    entries that match, taken over every element of ``actual``, so each
    label of each sample counts separately.

    Args:
        predicted: Tensor of raw logits.
        actual: Tensor of 0/1 targets with the same shape as ``predicted``.

    Returns:
        Zero-dimensional tensor holding the fraction of matching entries.
    """
    hard_predictions = torch.sigmoid(predicted).gt(0.5).float()
    num_correct = hard_predictions.eq(actual).float().sum()
    return num_correct / actual.numel()


def focal_binary_cross_entropy(logits, targets, alpha = 1, gamma=2):
    """Mean focal loss built on binary cross-entropy with logits.

    For each element the plain BCE is computed, ``pt = exp(-BCE)`` recovers
    the probability the model assigns to the true class, and the BCE is
    scaled by ``alpha * (1 - pt) ** gamma`` so that confidently correct
    elements contribute less.

    Args:
        logits: Tensor of raw model outputs.
        targets: Float tensor of 0/1 targets with the same shape as ``logits``.
        alpha: Constant multiplier applied to every element.
        gamma: Exponent of the ``(1 - pt)`` modulating factor; 0 gives
            ``alpha`` times the plain BCE.

    Returns:
        Zero-dimensional tensor: the mean focal loss over all elements.
    """
    per_element_bce = nn.functional.binary_cross_entropy_with_logits(
        logits, targets, reduction='none'
    )
    prob_of_truth = torch.exp(-per_element_bce)
    modulating = (1 - prob_of_truth) ** gamma
    return (alpha * modulating * per_element_bce).mean()


def train_model(model, train_dl, val_dl, lr=0.001, num_epochs=3):
    """Train ``model`` with Adam and the focal BCE loss, validating after each epoch.

    After every epoch one summary line is printed with the batch-averaged
    training/validation loss and element-wise accuracy. Relies on the
    module-level ``device``, ``focal_binary_cross_entropy`` and ``accuracy``.

    Args:
        model: Module mapping a batch of token indices to logits; it is
            updated in place and must already live on ``device``.
        train_dl: Iterable of ``(x_batch, y_batch)`` training batches that
            supports ``len()``.
        val_dl: Iterable of ``(x_batch, y_batch)`` validation batches that
            supports ``len()``.
        lr: Learning rate for Adam.
        num_epochs: Number of full passes over ``train_dl``.

    Returns:
        None.

    Raises:
        ZeroDivisionError: If ``train_dl`` or ``val_dl`` has length 0.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0
        val_loss = 0
        train_acc = 0
        val_acc = 0
        for x_batch, y_batch in train_dl:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device).float()

            output = model(x_batch)
            loss = focal_binary_cross_entropy(output, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # The metric needs no gradient history, so detach the logits first.
            train_acc += accuracy(output.detach(), y_batch).item()

        model.eval()
        # Validation only needs forward passes: disabling autograd avoids
        # building (and holding in memory) a graph for every validation batch.
        with torch.no_grad():
            for xval_batch, yval_batch in val_dl:
                xval_batch = xval_batch.to(device)
                yval_batch = yval_batch.to(device).float()

                output = model(xval_batch)
                loss = focal_binary_cross_entropy(output, yval_batch)

                val_loss += loss.item()
                val_acc += accuracy(output, yval_batch).item()

        print(f"Epoch: {epoch+1}, Train Loss: {running_loss/len(train_dl)}, Train Acc: {train_acc/len(train_dl)}, Val Loss: {val_loss/len(val_dl)}, Val Acc: {val_acc/len(val_dl)}")








In [None]:
# Instantiate the classifier with embedding_dim=512 and hidden_dim=512.
model = LSTMModel(vocab_size, 512, 512)
# Move the parameters to `device`; as the cell's last expression, the returned
# module is also displayed, which shows the layer layout.
model.to(device)

LSTMModel(
  (embedding): Embedding(50001, 512)
  (conv1): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=valid)
  (avgpool): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  (lstm1): LSTM(512, 512, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=7, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
# Train for 3 epochs with lr=1e-4, overriding train_model's default lr of 0.001.
train_model(model, train_dl, val_dl, lr=0.0001, num_epochs=3)

Epoch: 1, Train Loss: 0.011916747608985782, Train Acc: 0.9878492992726166, Val Loss: 0.0093505106980443, Val Acc: 0.989038545640304
Epoch: 2, Train Loss: 0.008481274089425263, Train Acc: 0.989717175790823, Val Loss: 0.008422755174751316, Val Acc: 0.9897690068172914
Epoch: 3, Train Loss: 0.0075113286404278655, Train Acc: 0.9903123132314176, Val Loss: 0.008078490154353113, Val Acc: 0.9899652735220373


In [None]:
# CommentDataset requires labels, so pass an all-zero placeholder array; the
# labels are discarded in the loop below. shuffle=False keeps the predictions in
# the same row order as test_data.
test_dataset = CommentDataset(test_data['sequence'], np.zeros((len(test_data), len(labels))))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Switch to evaluation mode (disables dropout) and run inference without autograd.
model.eval()
predictions = []
with torch.no_grad():
    for sequences, _ in test_loader:
        sequences = sequences.to(device)
        outputs = model(sequences)

        # The model returns raw logits; sigmoid turns them into per-label probabilities.
        outputs = torch.sigmoid(outputs)
        predictions.append(outputs.cpu().numpy())

# Stack the per-batch arrays into one array with a row per test comment,
# then threshold the probabilities at 0.5 to get 0/1 predictions.
predictions = np.vstack(predictions)
predictions_binary = (predictions > 0.5).astype(int)

# Save predictions: one column per label, with the test 'id' column placed first.
test_results = pd.DataFrame(predictions_binary, columns=labels)
test_results.insert(0, 'id', test_data['id'])
test_results.to_csv('predictions.csv', index=False)


In [None]:
# Serialize the entire module object (not just its state_dict) to model.pt.
# Loading this file later requires the LSTMModel class definition to be available.
torch.save(model, 'model.pt')

In [4]:
# model.pt holds a whole pickled LSTMModel (written with torch.save(model, ...)),
# so the LSTMModel class must already be defined in this kernel.
# weights_only=False is needed for such full-module checkpoints on PyTorch
# releases where torch.load defaults to weights_only=True; map_location=device
# lets a checkpoint saved on a GPU be restored on a CPU-only machine.
# SECURITY: this unpickles arbitrary Python objects -- only load files you
# created yourself or otherwise trust.
model = torch.load('model.pt', map_location=device, weights_only=False)
# Kept as the cell's last expression so the module layout is displayed.
model.to(device)

  model = torch.load('model.pt')


LSTMModel(
  (embedding): Embedding(50001, 512)
  (conv1): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=valid)
  (avgpool): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  (lstm1): LSTM(512, 512, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=7, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
)

In [5]:
# Repeat test-set inference with the model reloaded from model.pt.
# CommentDataset requires labels, so pass an all-zero placeholder array; the
# labels are discarded in the loop below. shuffle=False keeps the predictions in
# the same row order as test_data.
test_dataset = CommentDataset(test_data['sequence'], np.zeros((len(test_data), len(labels))))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Switch to evaluation mode (disables dropout) and run inference without autograd.
model.eval()
predictions = []
with torch.no_grad():
    for sequences, _ in test_loader:
        sequences = sequences.to(device)
        outputs = model(sequences)

        # The model returns raw logits; sigmoid turns them into per-label probabilities.
        outputs = torch.sigmoid(outputs)
        predictions.append(outputs.cpu().numpy())

# Stack the per-batch arrays into one array with a row per test comment,
# then threshold the probabilities at 0.5 to get 0/1 predictions.
predictions = np.vstack(predictions)
predictions_binary = (predictions > 0.5).astype(int)

# Save predictions: one column per label, with the test 'id' column placed first.
test_results = pd.DataFrame(predictions_binary, columns=labels)
test_results.insert(0, 'id', test_data['id'])
test_results.to_csv('predictions_3.csv', index=False)

# Render the binary prediction table in the notebook output.
display(test_results)

Unnamed: 0,id,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
97315,97315,0,0,0,0,0,0,0
97316,97316,0,0,0,0,0,0,0
97317,97317,1,0,0,0,1,0,0
97318,97318,0,0,0,0,0,0,0


In [6]:
# Rebind test_results to a frame of the raw sigmoid probabilities (rather than
# the thresholded 0/1 values), put the test 'id' column first, and save it.
test_results = pd.DataFrame(predictions, columns=labels)
test_results.insert(0, 'id', test_data['id'])
test_results.to_csv('predictions_prob.csv', index=False)