In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import nltk
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import time

# Download NLTK's 'punkt' tokenizer data.
# NOTE(review): only nltk.word_tokenize would need this; the tokenizer actually
# selected below is torchtext's "basic_english" - confirm whether the download is still required.
nltk.download('punkt')

# Load the training set from a tab-separated file; later cells read its
# 'text' and 'label' columns.
df = pd.read_csv("train.tsv", sep='\t')  # Replace with actual file
# Tokenizer used both to build the vocabulary and to encode texts later on.
tokenizer = get_tokenizer("basic_english")  # Or use nltk.word_tokenize

# Build Vocabulary
def yield_tokens(data):
    """Lazily yield the tokenized form of every text in ``data``.

    Each element of ``data`` is passed to the notebook-level ``tokenizer`` and
    the result is yielded one item at a time, which is the iterator shape
    ``build_vocab_from_iterator`` is fed with below.
    """
    yield from (tokenizer(entry) for entry in data)

# Build the vocabulary from the tokenized training texts, reserving "<pad>"
# (used to pad short sequences) and "<unk>" as special tokens.
vocab = build_vocab_from_iterator(yield_tokens(df['text']), specials=["<pad>", "<unk>"])
# Any token missing from the vocabulary is looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])  # Handle out-of-vocabulary words

[nltk_data] Downloading package punkt to /home/immonej8/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
class ToxicCommentDataset(Dataset):
    """Map-style dataset turning raw texts into fixed-length index tensors.

    Args:
        texts: Sequence of raw strings, indexable by integer position.
        labels: Sequence of numeric labels aligned with ``texts``.
        vocab: Mapping from token to integer index; must contain ``"<pad>"``.
        max_len: Length every encoded sequence is padded or truncated to.
        tokenize: Optional callable ``str -> list[str]``. When omitted, the
            notebook-level ``tokenizer`` is used, as before.

    Each item is a ``(indices, label)`` pair: a ``torch.long`` tensor of
    length ``max_len`` and a scalar ``torch.float32`` tensor.
    """

    def __init__(self, texts, labels, vocab, max_len=100, tokenize=None):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
        # Fall back to the module-level tokenizer so existing call sites that
        # pass only (texts, labels, vocab) keep working unchanged.
        self.tokenize = tokenize if tokenize is not None else tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Encode with the vocabulary given to this instance, not whatever
        # global `vocab` happens to exist when the item is fetched.
        indexed = [self.vocab[token] for token in self.tokenize(text)]

        # Pad/truncate sequence to max_len
        if len(indexed) < self.max_len:
            indexed += [self.vocab["<pad>"]] * (self.max_len - len(indexed))
        else:
            indexed = indexed[:self.max_len]

        return torch.tensor(indexed, dtype=torch.long), torch.tensor(label, dtype=torch.float32)

# Cast the label column to int (overwrites the column on `df` in place).
df['label'] = df['label'].astype(int)

# Wrap the training texts/labels in the dataset; plain lists are passed so
# items are looked up by integer position.
dataset = ToxicCommentDataset(df['text'].tolist(), df['label'].tolist(), vocab)

# Batches of 32 examples, reshuffled on every pass over the data.
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [3]:
class FastTextClassifier(nn.Module):
    """FastText-style classifier: embed tokens, average them, apply a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        num_classes: Output width of the linear layer (1 for binary use).
        pad_idx: Optional index of the padding token. When given, padded
            positions are excluded from the average so that short texts are
            not dominated by the padding embedding. When ``None`` (default)
            the plain mean over all positions is used, exactly as before.

    ``forward`` returns sigmoid probabilities; with ``num_classes=1`` the
    trailing dimension is squeezed away, giving shape ``(batch_size,)``.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, pad_idx=None):
        super().__init__()
        self.pad_idx = pad_idx
        # padding_idx=None is nn.Embedding's default, so parameter names and
        # shapes (and therefore saved state dicts) are unaffected.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        embeds = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        if self.pad_idx is None:
            avg_embeds = embeds.mean(dim=1)  # Mean pooling over sequence
        else:
            # Masked mean: sum only real tokens and divide by their count.
            mask = (x != self.pad_idx).unsqueeze(-1).to(embeds.dtype)
            # clamp avoids 0/0 for rows that consist solely of padding.
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            avg_embeds = (embeds * mask).sum(dim=1) / token_counts
        out = self.fc(avg_embeds)  # Fully connected layer
        return torch.sigmoid(out).squeeze(1)  # Binary classification

# Initialize model: one embedding row per vocabulary entry and a single
# output unit, matching the binary toxic / not-toxic label.
vocab_size = len(vocab)
embed_dim = 100  # Can be 50, 100, or 300
model = FastTextClassifier(vocab_size, embed_dim, num_classes=1)

In [4]:
# Restore previously trained weights. map_location="cpu" lets a checkpoint that
# was written after GPU training load on a CPU-only machine; the model is
# moved to its target device later by train_model() / predict().
model.load_state_dict(torch.load("./trained_model.pth", map_location="cpu", weights_only=True))

  return self.fget.__get__(instance, owner)()


<All keys matched successfully>

In [8]:
# BCELoss takes probabilities, which matches the sigmoid applied at the end of
# FastTextClassifier.forward.
criterion = nn.BCELoss()
# Adam over all model parameters with a fixed learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Function
def train_model(model, train_loader, criterion, optimizer, epochs=5, device="cpu"):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: Module to optimise; moved to ``device`` before training.
        train_loader: Iterable of ``(texts, labels)`` tensor batches.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.
        device: Device the model and every batch are moved to.

    Returns:
        list[float]: Mean batch loss for each epoch, in order. An epoch that
        saw no batches is recorded as ``nan`` instead of raising
        ``ZeroDivisionError``.

    The mean loss and wall-clock duration of every epoch are also printed.
    """
    model.to(device)
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches while iterating so the average does not depend on
        # train_loader implementing __len__.
        num_batches = 0
        model.train()
        start = time.time()
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(avg_loss)

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
        print("Took " + str(round(time.time()-start, 2)) + " seconds")

    return epoch_losses

# Train the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, criterion, optimizer, epochs=10, device=device)
# Persist the weights. This is the same path the earlier cell loads from, so
# the previous checkpoint is overwritten.
torch.save(model.state_dict(), "./trained_model.pth")

Epoch [1/10], Loss: 0.3556
Took 268.8 seconds
Epoch [2/10], Loss: 0.2678
Took 327.5 seconds
Epoch [3/10], Loss: 0.2232
Took 323.39 seconds
Epoch [4/10], Loss: 0.1976
Took 323.3 seconds
Epoch [5/10], Loss: 0.1803
Took 324.11 seconds
Epoch [6/10], Loss: 0.1665
Took 323.55 seconds
Epoch [7/10], Loss: 0.1561
Took 323.88 seconds
Epoch [8/10], Loss: 0.1466
Took 322.7 seconds
Epoch [9/10], Loss: 0.1402
Took 322.13 seconds
Epoch [10/10], Loss: 0.1329
Took 321.51 seconds


In [5]:
def predict(model, text, vocab, tokenizer, max_len=100, device="cpu"):
    """Classify a single text as ``"Toxic"`` or ``"Not Toxic"``.

    The text is tokenized, mapped to vocabulary indices, padded with the
    ``"<pad>"`` index or truncated to exactly ``max_len`` entries, and fed to
    ``model`` as a batch of one. A model output of at least 0.5 is reported
    as toxic.

    Args:
        model: Module returning one probability for a ``(1, max_len)`` input.
        text: Raw string to classify.
        vocab: Token-to-index mapping containing ``"<pad>"``.
        tokenizer: Callable splitting ``text`` into tokens.
        max_len: Fixed sequence length fed to the model.
        device: Device used for the model and the input tensor.

    Returns:
        str: ``"Toxic"`` or ``"Not Toxic"``.
    """
    model.to(device)
    model.eval()

    # Encode, then cut to at most max_len entries.
    token_ids = [vocab[piece] for piece in tokenizer(text)][:max_len]

    # Top up with padding only when the text was shorter than max_len.
    shortfall = max_len - len(token_ids)
    if shortfall > 0:
        token_ids.extend([vocab["<pad>"]] * shortfall)

    # Nesting the list adds the batch dimension: shape (1, max_len).
    batch = torch.tensor([token_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        probability = model(batch).item()

    if probability >= 0.5:
        return "Toxic"
    return "Not Toxic"

# Example prediction on a single-word input; predict() is called with its
# default max_len and device.
sample_text = "shit"
print(predict(model, sample_text, vocab, tokenizer))

Toxic


In [11]:
# Load the evaluation split from a tab-separated file; the 'id', 'text' and
# 'label' columns are used below.
df_test = pd.read_csv("dev.tsv", sep='\t')  # Replace with actual test file
# Convert labels to numerical format
df_test['label'] = df_test['label'].astype(int)

# Split by substring match on the 'id' column.
# NOTE(review): presumably 'ger' / 'fin' mark the language of each example - confirm.
df_ger = df_test[df_test['id'].str.contains('ger')]
df_fin = df_test[df_test['id'].str.contains('fin')]
# Not the last expression of the cell, so this preview is not displayed;
# df_fin is not used again in the code visible here.
df_fin.head()

# Create test dataset (only the 'ger' subset is evaluated)
test_dataset = ToxicCommentDataset(df_ger['text'].tolist(), df_ger['label'].tolist(), vocab)

# Test DataLoader; order is kept fixed since shuffling is off
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader, device="cpu"):
    """Print accuracy, precision, recall and F1 of ``model`` on ``test_loader``.

    The model is moved to ``device`` and put in eval mode. Every batch of
    ``(texts, labels)`` is scored without gradient tracking, and outputs of
    at least 0.5 are counted as the positive class. Nothing is returned; the
    four metrics are printed with four decimals.
    """
    model.to(device)
    model.eval()

    predicted, expected = [], []

    with torch.no_grad():
        for batch_texts, batch_labels in test_loader:
            batch_texts = batch_texts.to(device)
            batch_labels = batch_labels.to(device)

            probabilities = model(batch_texts)
            # Threshold the probabilities into hard 0/1 predictions.
            hard_labels = (probabilities >= 0.5).long()

            predicted.extend(hard_labels.cpu().numpy())
            expected.extend(batch_labels.cpu().numpy())

    # Insertion order of the dict fixes the order of the printed report.
    report = {
        "Test Accuracy": accuracy_score(expected, predicted),
        "Precision": precision_score(expected, predicted),
        "Recall": recall_score(expected, predicted),
        "F1 Score": f1_score(expected, predicted),
    }
    for title, value in report.items():
        print(f"{title}: {value:.4f}")

# Run the evaluation on the test loader built above; no device is passed, so
# evaluate_model's default ("cpu") is used and the model is moved there.
evaluate_model(model, test_loader)

Test Accuracy: 0.7485
Precision: 0.4211
Recall: 0.0160
F1 Score: 0.0308
