In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import nltk
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import time

# Download tokenizer
#nltk.download('punkt_tab')

# Load dataset (assumes a CSV file with 'text' and 'label' columns)
df = pd.read_csv("train.tsv", sep='\t')  # Replace with actual file
#print(df.head())
# Tokenization
tokenizer = get_tokenizer("basic_english")  # Or use nltk.word_tokenize

# Build Vocabulary
def yield_tokens(data):
    for text in data:
        yield tokenizer(text)

vocab_whole = build_vocab_from_iterator(yield_tokens(df['text']), specials=["<pad>", "<unk>"])
vocab_whole.set_default_index(vocab_whole["<unk>"])  # Handle out-of-vocabulary words

In [33]:
def tokenize_with_ngrams(text, n=3):
    """
    Tokenizes text into words using NLTK, then generates character-level n-grams (subwords).
    """
    tokens = []
    
    # Tokenize text into words using NLTK
    words = word_tokenize(text)

    for word in words:
        # Add special boundary markers (like FastText)
        word = f"<s>{word}</s>"

        # Generate n-grams for each word
        ngrams = [word[i:i+n] for i in range(len(word) - n + 1)]

        tokens.extend(ngrams)  # Add n-grams to the list

    return tokens

# Function to generate n-gram tokens from a dataset
def yield_ngram_tokens(data, n=3):
    """
    Generator function to yield n-gram tokens for each text in the dataset.
    """
    for text in data:
        yield tokenize_with_ngrams(text, n)

# Build vocabulary from n-grams
vocab_sub = build_vocab_from_iterator(yield_ngram_tokens(df['text']), specials=["<pad>", "<unk>"])

# Set default index for unknown tokens
vocab_sub.set_default_index(vocab_sub["<unk>"])

In [34]:
class ToxicCommentDataset_sub(Dataset):
    def __init__(self, texts, labels, vocab, max_len=100):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize and convert to numerical indices
        tokenized = tokenize_with_ngrams(text, 3)
        indexed = [self.vocab[token] for token in tokenized]

        # Pad/truncate sequence to max_len
        if len(indexed) < self.max_len:
            indexed += [self.vocab["<pad>"]] * (self.max_len - len(indexed))
        else:
            indexed = indexed[:self.max_len]

        return torch.tensor(indexed, dtype=torch.long), torch.tensor(label, dtype=torch.float32)
    
class ToxicCommentDataset_whole(Dataset):
    def __init__(self, texts, labels, vocab, max_len=100):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize and convert to numerical indices
        tokenized = tokenizer(text)
        indexed = [self.vocab[token] for token in tokenized]

        # Pad/truncate sequence to max_len
        if len(indexed) < self.max_len:
            indexed += [self.vocab["<pad>"]] * (self.max_len - len(indexed))
        else:
            indexed = indexed[:self.max_len]

        return torch.tensor(indexed, dtype=torch.long), torch.tensor(label, dtype=torch.float32)

# Convert labels to numerical format
df['label'] = df['label'].astype(int)

# Create dataset

dataset_sub   = ToxicCommentDataset_sub(df['text'].tolist(), df['label'].tolist(), vocab_sub)
dataset_whole = ToxicCommentDataset_whole(df['text'].tolist(), df['label'].tolist(), vocab_whole)

# DataLoader
train_loader_sub   = DataLoader(dataset_sub, batch_size=32, shuffle=True)
train_loader_whole = DataLoader(dataset_whole, batch_size=32, shuffle=True)

In [35]:
class FastTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(FastTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)
    
    def forward(self, x):
        embeds = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        avg_embeds = embeds.mean(dim=1)  # Mean pooling over sequence
        out = self.fc(avg_embeds)  # Fully connected layer
        return torch.sigmoid(out).squeeze(1)  # Binary classification

# Initialize model
vocab_size_sub = len(vocab_sub)
vocab_size_whole = len(vocab_whole)
embed_dim = 100  # Can be 50, 100, or 300
model_sub = FastTextClassifier(vocab_size_sub, embed_dim, num_classes=1)
model_whole = FastTextClassifier(vocab_size_whole, embed_dim, num_classes=1)

In [36]:
#model.load_state_dict(torch.load("./trained_model.pth", weights_only=True))

In [37]:
criterion_sub = nn.BCELoss()
optimizer_sub = optim.Adam(model_sub.parameters(), lr=0.001)

criterion_whole = nn.BCELoss()
optimizer_whole = optim.Adam(model_whole.parameters(), lr=0.001)

# Training Function
def train_model(model, train_loader, criterion, optimizer, epochs=5, device="cpu"):
    model.to(device)
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        start = time.time()
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}")
        print("Took " + str(round(time.time()-start, 2)) + " seconds")

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model_whole, train_loader_whole, criterion_whole, optimizer_whole, epochs=10, device=device)
torch.save(model_whole.state_dict(), "./trained_model_whole.pth")

train_model(model_sub, train_loader_sub, criterion_sub, optimizer_sub, epochs=10, device=device)
torch.save(model_sub.state_dict(), "./trained_model_sub.pth")

Epoch [1/10], Loss: 0.5450
Took 32.42 seconds
Epoch [2/10], Loss: 0.3563
Took 35.35 seconds
Epoch [3/10], Loss: 0.2654
Took 35.53 seconds
Epoch [4/10], Loss: 0.2216
Took 35.46 seconds
Epoch [5/10], Loss: 0.1962
Took 35.6 seconds
Epoch [6/10], Loss: 0.1783
Took 35.31 seconds
Epoch [7/10], Loss: 0.1654
Took 31.45 seconds
Epoch [8/10], Loss: 0.1560
Took 32.46 seconds
Epoch [9/10], Loss: 0.1476
Took 37.09 seconds
Epoch [10/10], Loss: 0.1395
Took 37.1 seconds
Epoch [1/10], Loss: 0.5694
Took 125.05 seconds
Epoch [2/10], Loss: 0.4848
Took 126.11 seconds
Epoch [3/10], Loss: 0.4630
Took 105.56 seconds
Epoch [4/10], Loss: 0.4510
Took 94.82 seconds
Epoch [5/10], Loss: 0.4427
Took 93.15 seconds
Epoch [6/10], Loss: 0.4364
Took 89.96 seconds
Epoch [7/10], Loss: 0.4310
Took 93.84 seconds
Epoch [8/10], Loss: 0.4266
Took 92.36 seconds
Epoch [9/10], Loss: 0.4230
Took 93.35 seconds
Epoch [10/10], Loss: 0.4194
Took 94.16 seconds


In [43]:
def predict(model, text, vocab, tokenizer, max_len=100, device="cpu"):
    model.to(device)
    model.eval()

    # Tokenize and convert to indices
    tokenized = tokenizer(text)
    indexed = [vocab[token] for token in tokenized]

    # Pad/truncate sequence
    if len(indexed) < max_len:
        indexed += [vocab["<pad>"]] * (max_len - len(indexed))
    else:
        indexed = indexed[:max_len]

    # Convert to tensor
    input_tensor = torch.tensor(indexed, dtype=torch.long).unsqueeze(0).to(device)

    # Get model prediction
    with torch.no_grad():
        output = model(input_tensor).item()

    return "Toxic" if output >= 0.5 else "Not Toxic"

# Example Prediction
sample_text = "<s>shit</s>"
print(predict(model, sample_text, vocab, tokenizer))

Not Toxic


In [None]:
# Load test dataset (assuming a CSV file with 'text' and 'label' columns)
df_test = pd.read_csv("dev.tsv", sep='\t')  # Replace with actual test file
# Convert labels to numerical format
df_test['label'] = df_test['label'].astype(int)

df_eng = df_test[df_test['id'].str.contains('eng')]
df_ger = df_test[df_test['id'].str.contains('ger')]
df_fin = df_test[df_test['id'].str.contains('fin')]

# Create test dataset
test_dataset_whole     = ToxicCommentDataset_whole(df_test['text'].tolist(), df_test['label'].tolist(), vocab_whole)
test_dataset_eng_whole = ToxicCommentDataset_whole(df_eng['text'].tolist(), df_eng['label'].tolist(), vocab_whole)
test_dataset_fin_whole = ToxicCommentDataset_whole(df_fin['text'].tolist(), df_fin['label'].tolist(), vocab_whole)
test_dataset_ger_whole = ToxicCommentDataset_whole(df_ger['text'].tolist(), df_ger['label'].tolist(), vocab_whole)

test_dataset_sub     = ToxicCommentDataset_sub(df_test['text'].tolist(), df_test['label'].tolist(), vocab_sub)
test_dataset_eng_sub = ToxicCommentDataset_sub(df_eng['text'].tolist(), df_eng['label'].tolist(), vocab_sub)
test_dataset_fin_sub = ToxicCommentDataset_sub(df_fin['text'].tolist(), df_fin['label'].tolist(), vocab_sub)
test_dataset_ger_sub = ToxicCommentDataset_sub(df_ger['text'].tolist(), df_ger['label'].tolist(), vocab_sub)

# Test DataLoader
test_loader_whole     = DataLoader(test_dataset_whole, batch_size=32, shuffle=False)
test_loader_eng_whole = DataLoader(test_dataset_eng_whole, batch_size=32, shuffle=False)
test_loader_ger_whole = DataLoader(test_dataset_ger_whole, batch_size=32, shuffle=False)
test_loader_fin_whole = DataLoader(test_dataset_fin_whole, batch_size=32, shuffle=False)

test_loader_sub     = DataLoader(test_dataset_sub, batch_size=32, shuffle=False)
test_loader_eng_sub = DataLoader(test_dataset_eng_sub, batch_size=32, shuffle=False)
test_loader_ger_sub = DataLoader(test_dataset_ger_sub, batch_size=32, shuffle=False)
test_loader_fin_sub = DataLoader(test_dataset_fin_sub, batch_size=32, shuffle=False)

In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader, device="cpu"):
    model.to(device)
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)

            outputs = model(texts)  # Forward pass
            preds = (outputs >= 0.5).long()  # Convert probabilities to binary labels

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute Metrics
    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Test Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

# Run the evaluation
print('Full score whole words:')
evaluate_model(model_whole, test_loader_whole)
print('\nEnglish scores whole words:')
evaluate_model(model_whole, test_loader_eng_whole)
print('\nGerman scores whole words:')
evaluate_model(model_whole, test_loader_ger_whole)
print('\nFinnish scores whole words:')
evaluate_model(model_whole, test_loader_fin_whole)

print('\nFull scores subwords:')
evaluate_model(model_sub, test_loader_sub)
print('\nEnglish scores subwords:')
evaluate_model(model_sub, test_loader_eng_sub)
print('\nGerman scores subwords:')
evaluate_model(model_sub, test_loader_ger_sub)
print('\nFinnish scores subwords:')
evaluate_model(model_sub, test_loader_fin_sub)

Full score whole words:
Test Accuracy: 0.8501
Precision: 0.7620
Recall: 0.8183
F1 Score: 0.7892

English scores whole words:
Test Accuracy: 0.9166
Precision: 0.9114
Recall: 0.8454
F1 Score: 0.8772

German scores whole words:
Test Accuracy: 0.5160
Precision: 0.2947
Recall: 0.6720
F1 Score: 0.4098

Finnish scores whole words:
Test Accuracy: 0.5400
Precision: 0.7339
Recall: 0.6067
F1 Score: 0.6642

Full scores subwords:
Test Accuracy: 0.7701
Precision: 0.7193
Recall: 0.5404
F1 Score: 0.6171

English scores subwords:
Test Accuracy: 0.7842
Precision: 0.7375
Recall: 0.6020
F1 Score: 0.6629

German scores subwords:
Test Accuracy: 0.7255
Precision: 0.3631
Recall: 0.1300
F1 Score: 0.1915

Finnish scores subwords:
Test Accuracy: 0.4400
Precision: 0.8276
Recall: 0.3200
F1 Score: 0.4615
