In [1]:
!pip install pandas numpy torch scikit-learn nltk tqdm

[0m

In [2]:
# Install the specific NLTK version
!pip install nltk==3.8.0

[0m

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from tqdm import tqdm

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Load the data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [7]:
train_data.head()

Unnamed: 0,id,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0


In [8]:
test_data.head()

Unnamed: 0,id,text
0,0,[ Integrity means that you pay your debts.]\n\...
1,1,This is malfeasance by the Administrator and t...
2,2,@Rmiller101 - Spoken like a true elitist. But ...
3,3,"Paul: Thank you for your kind words. I do, in..."
4,4,Sorry you missed high school. Eisenhower sent ...


In [9]:
print(train_data['text'].isnull().sum())

3


In [22]:
# **Fill missing values in the 'text' column**
train_data['text'] = train_data['text'].fillna('')
test_data['text'] = test_data['text'].fillna('')

In [23]:
# Define the target labels
target_labels = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']

In [24]:
# Preprocess the text
def preprocess_text(text):
    if not isinstance(text, str):
        return []
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    tokens = word_tokenize(text, preserve_line=True)  # Tokenize without sentence segmentation
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens

train_data['tokens'] = train_data['text'].apply(preprocess_text)
test_data['tokens'] = test_data['text'].apply(preprocess_text)

In [25]:
# Build vocabulary
all_tokens = [token for tokens in train_data['tokens'] for token in tokens]
token_counts = Counter(all_tokens)
vocab = {token: idx+1 for idx, (token, _) in enumerate(token_counts.items())}  # +1 for padding

In [26]:
# Convert tokens to sequences of indices
def tokens_to_indices(tokens):
    return [vocab.get(token, 0) for token in tokens]  # 0 for unknown words

train_data['sequence'] = train_data['tokens'].apply(tokens_to_indices)
test_data['sequence'] = test_data['tokens'].apply(tokens_to_indices)

In [27]:
# Pad sequences
def pad_sequence(sequence, max_len=400):  
    if len(sequence) > max_len:
        return sequence[:max_len]
    return sequence + [0] * (max_len - len(sequence))

train_data['sequence'] = train_data['sequence'].apply(lambda x: pad_sequence(x, max_len=400))
test_data['sequence'] = test_data['sequence'].apply(lambda x: pad_sequence(x, max_len=400))

In [28]:
# Define the Dataset class
class ToxicCommentDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.sequences[idx], dtype=torch.long),
            torch.tensor(self.targets[idx], dtype=torch.float)
        )

In [29]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(
    train_data['sequence'].tolist(),
    train_data[target_labels].values,
    test_size=0.1,
    random_state=42
)

train_dataset = ToxicCommentDataset(X_train, y_train)
val_dataset = ToxicCommentDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Reduced batch size for better gradient updates
val_loader = DataLoader(val_dataset, batch_size=16)

In [30]:
class BiLSTM_GRU_Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim=256, lstm_hidden_dim=128, gru_hidden_dim=128, cnn_out_channels=128, output_dim=7, dropout_rate=0.5):
        super(BiLSTM_GRU_Model, self).__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # BiLSTM layer
        self.bilstm = nn.LSTM(embedding_dim, lstm_hidden_dim, bidirectional=True, batch_first=True)
        
        # GRU layer
        self.gru = nn.GRU(lstm_hidden_dim * 2, gru_hidden_dim, bidirectional=True, batch_first=True)
        
        # Dropout after GRU
        self.dropout_gru = nn.Dropout(dropout_rate)
        
        # Convolutional layer
        self.conv = nn.Conv1d(in_channels=gru_hidden_dim * 2, out_channels=cnn_out_channels, kernel_size=3, padding=1)
        
        # Dropout after CNN
        self.dropout_cnn = nn.Dropout(dropout_rate)
        
        # Fully connected layer
        self.fc = nn.Linear(cnn_out_channels * 400, output_dim)  # Adjusted for increased max_len
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        gru_out, _ = self.gru(lstm_out)
        gru_out = self.dropout_gru(gru_out)
        
        # Change the shape for convolutional layer
        gru_out = gru_out.permute(0, 2, 1)
        conv_out = self.conv(gru_out)
        conv_out = self.dropout_cnn(conv_out)
        
        # Flatten
        conv_out = conv_out.view(conv_out.size(0), -1)
        out = self.fc(conv_out)
        return self.sigmoid(out)

In [31]:
# Calculate class weights for imbalanced dataset
class_counts = train_data[target_labels].sum(axis=0).values
class_weights = 1.0 / class_counts
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Initialize the model, loss function, and optimizer
model = BiLSTM_GRU_Model(len(vocab) + 1).to(device)
criterion = nn.BCELoss(weight=weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)  # Reduced learning rate for finer updates

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)  # More frequent learning rate reduction

In [33]:
# Training loop
num_epochs = 3  # Increased epochs for more training iterations
best_val_accuracy = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for sequences, targets in tqdm(train_loader):
        sequences, targets = sequences.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Step the scheduler
    scheduler.step()

    # Validation
    model.eval()
    val_loss = 0
    val_preds, val_targets = [], []
    with torch.no_grad():
        for sequences, targets in val_loader:
            sequences, targets = sequences.to(device), targets.to(device)
            outputs = model(sequences)
            loss = criterion(outputs, targets)
            val_loss += loss.item()

            val_preds.extend(outputs.cpu().numpy())
            val_targets.extend(targets.cpu().numpy())

    # Convert lists to numpy arrays
    val_preds = np.concatenate(val_preds, axis=0)
    val_targets = np.concatenate(val_targets, axis=0)

    # Binarize predictions and targets
    val_preds = (val_preds >= 0.5).astype(int)
    val_targets = val_targets.astype(int)

    # Compute metrics
    val_accuracy = accuracy_score(val_targets, val_preds)
    val_f1_score = f1_score(val_targets, val_preds, average='micro')
    val_hamming_loss = hamming_loss(val_targets, val_preds)

    print(
        f"Epoch {epoch + 1}/{num_epochs}, "
        f"Train Loss: {train_loss / len(train_loader):.4f}, "
        f"Validation Loss: {val_loss / len(val_loader):.4f}, "
        f"Validation Accuracy: {val_accuracy:.4f}, "
        f"Validation Hamming Loss: {val_hamming_loss:.4f}, "
        f"Validation F1 Score: {val_f1_score:.4f}"
    )

    # Save the best model based on validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), "best_model.pth")

# Load the best model for predictions
model.load_state_dict(torch.load("best_model.pth"))

100%|██████████| 101525/101525 [35:59<00:00, 47.01it/s]


Epoch 1/3, Train Loss: 0.0000, Validation Loss: 0.0000, Validation Accuracy: 0.9853, Validation Hamming Loss: 0.0147, Validation F1 Score: 0.9853


100%|██████████| 101525/101525 [36:05<00:00, 46.88it/s]


Epoch 2/3, Train Loss: 0.0000, Validation Loss: 0.0000, Validation Accuracy: 0.9900, Validation Hamming Loss: 0.0100, Validation F1 Score: 0.9900


100%|██████████| 101525/101525 [35:55<00:00, 47.11it/s]


Epoch 3/3, Train Loss: 0.0000, Validation Loss: 0.0000, Validation Accuracy: 0.9872, Validation Hamming Loss: 0.0128, Validation F1 Score: 0.9872


<All keys matched successfully>

In [34]:
# Make predictions on the test data
test_sequences = torch.tensor(test_data['sequence'].tolist(), dtype=torch.long).to(device)
model.eval()
predictions = []
with torch.no_grad():
    for i in range(0, len(test_sequences), 16):  # Reduced batch size for better memory usage during inference
        batch = test_sequences[i:i + 16]
        outputs = model(batch)
        predictions.extend(outputs.cpu().numpy())

# Convert predictions to numpy array
predictions = np.array(predictions)

# Binarize predictions
predictions = (predictions >= 0.5).astype(int)

# Create submission DataFrame
pred_df = pd.DataFrame(predictions, columns=target_labels)

# Include 'id' column if present in test data
if 'id' in test_data.columns:
    pred_df.insert(0, 'id', test_data['id'].values)

# Save predictions to CSV
pred_df.to_csv("submission.csv", index=False)