In [11]:
# Cell 1: Install required packages
!pip install transformers
!pip install accelerate -U  # Required for proper model training




In [12]:
import pandas as pd
import torch
import numpy as np
import re
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from string import punctuation
import matplotlib.pyplot as plt



In [13]:
# Download required NLTK data
nltk.download('stopwords')   # English stop-word list read by clean_text()
nltk.download('punkt')       # tokenizer models used by word_tokenize()
# Recent NLTK releases make word_tokenize() look up 'punkt_tab' instead of
# 'punkt'; fetching both keeps the notebook working across NLTK versions.
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
class Config:
    """Single source of truth for the paths and hyper-parameters of this notebook.

    All settings are plain class attributes. ``class_weights`` starts as
    ``None`` and is filled in later by the data-preparation code.
    """

    # ---- Locations on disk -------------------------------------------------
    input_dir = Path('/kaggle/input/gods2025')
    output_dir = Path('/kaggle/working/')
    # Only the best-scoring model weights are written to this file.
    model_save_path = output_dir / 'best_model.pth'
    # Directory holding the per-epoch training checkpoints.
    checkpoint_dir = output_dir / 'checkpoints'

    # ---- Model / optimisation ----------------------------------------------
    model_name = 'roberta-base'  # Changed to RoBERTa
    num_labels = 5
    max_length = 256
    train_batch_size = 16
    valid_batch_size = 32
    epochs = 15
    learning_rate = 2e-5

    # ---- Regularisation and early stopping -----------------------------------
    dropout_prob = 0.1
    weight_decay = 0.01
    patience = 3
    min_delta = 0.001

    # ---- Checkpointing -------------------------------------------------------
    checkpoint_freq = 1  # Save checkpoint every N epochs
    resume_training = True  # Whether to resume from checkpoint if available

    # ---- Labels --------------------------------------------------------------
    # The position of a name in this list is the integer class id.
    class_names = [
        'suicidal-thoughts-and-self-harm',
        'anxiety',
        'depression',
        'relationship-and-family-issues',
        'ptsd-and-trauma'
    ]

    # Class weights (will be calculated)
    class_weights = None

    def get_checkpoint_path(self, epoch):
        """Return the checkpoint file path for ``epoch`` inside ``checkpoint_dir``."""
        file_name = f'checkpoint_epoch_{epoch}.pt'
        return self.checkpoint_dir / file_name


# Shared settings object used by the rest of the notebook.
config = Config()

In [15]:
class CheckpointHandler:
    """Save, restore and prune per-epoch training checkpoints.

    Checkpoints are written to ``config.checkpoint_dir`` using the file names
    produced by ``config.get_checkpoint_path(epoch)``. A small
    ``latest_checkpoint.pt`` file records which epoch was saved last.
    """

    def __init__(self, config):
        """Remember ``config`` and make sure the checkpoint directory exists."""
        self.config = config
        # parents=True so a missing parent directory does not abort the run.
        self.config.checkpoint_dir.mkdir(parents=True, exist_ok=True)

    def save_checkpoint(self, epoch, model, optimizer, scheduler,
                        train_losses, val_losses, best_val_f1):
        """Save a checkpoint of the training state.

        Args:
            epoch: Epoch number stored in the checkpoint and used in its file name.
            model, optimizer, scheduler: Objects whose ``state_dict()`` is saved.
            train_losses, val_losses: Per-epoch loss histories.
            best_val_f1: Best validation F1 observed so far.
        """
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            # Store metrics as plain Python floats: NumPy scalars (e.g. the
            # value returned by sklearn's f1_score) are pickled as NumPy
            # objects, which the restricted unpickler used by default in
            # recent PyTorch releases refuses to load.
            'train_losses': [float(value) for value in train_losses],
            'val_losses': [float(value) for value in val_losses],
            'best_val_f1': float(best_val_f1)
        }
        torch.save(checkpoint, self.config.get_checkpoint_path(epoch))

        # Save latest checkpoint reference
        latest_checkpoint = {
            'latest_epoch': epoch
        }
        torch.save(latest_checkpoint,
                   self.config.checkpoint_dir / 'latest_checkpoint.pt')

    def load_latest_checkpoint(self):
        """Load the latest checkpoint if it exists.

        Returns:
            The checkpoint dict written by ``save_checkpoint``, or ``None``
            when either the latest-checkpoint reference or the checkpoint file
            it points to is missing.
        """
        latest_path = self.config.checkpoint_dir / 'latest_checkpoint.pt'
        if not latest_path.exists():
            return None

        # map_location='cpu' lets a checkpoint written on a GPU machine be
        # opened on a CPU-only one; load_state_dict() copies the values onto
        # whatever device the target model/optimizer already lives on.
        latest = torch.load(latest_path, map_location='cpu')
        checkpoint_path = self.config.get_checkpoint_path(latest['latest_epoch'])

        if not checkpoint_path.exists():
            return None

        return torch.load(checkpoint_path, map_location='cpu')

    def clean_old_checkpoints(self, current_epoch):
        """Delete checkpoints whose epoch is below ``current_epoch - 2``.

        The checkpoints for ``current_epoch`` and the two epochs before it are
        kept.
        """
        for path in self.config.checkpoint_dir.glob('checkpoint_epoch_*.pt'):
            try:
                epoch = int(path.stem.split('_')[-1])
            except ValueError:
                # Not one of our numbered checkpoints; leave the file alone.
                continue
            if epoch < current_epoch - 2:
                path.unlink()


In [16]:
# Cell 3: Data Preparation
# Extra high-frequency words removed on top of NLTK's English stop words
# (customize as needed).
_ADDITIONAL_STOP_WORDS = frozenset({
    'please', 'help', 'think', 'know', 'like', 'would', 'could',
    'may', 'also', 'many', 'much', 'trying', 'try', 'sure',
    'way', 'even', 'really', 'lot', 'back', 'since', 'around',
    'still', 'time', 'always', 'never', 'want', 'wanted', 'needs',
    'need', 'feel', 'feeling', 'felt'
})

# Combined stop-word set; built on first use and then shared by every call.
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the NLTK English stop words plus the additional ones, built once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = (
            frozenset(stopwords.words('english')) | _ADDITIONAL_STOP_WORDS
        )
    return _STOP_WORDS_CACHE


def clean_text(text):
    """
    Enhanced text cleaning function that removes stop words, special characters,
    and unnecessary content.

    Steps, in order: drop the "[Post removed at request of member]"
    placeholder, lowercase, strip URLs / e-mail addresses / phone-number-like
    runs, replace punctuation with spaces, delete digits, collapse whitespace,
    tokenize, drop stop words and tokens of two characters or fewer, and
    finally truncate the joined result to 2000 characters.

    Args:
        text (str): Input text to be cleaned (any object; it is passed
            through ``str()`` first)

    Returns:
        str: Cleaned text
    """
    text = str(text)

    # Remove the moderation placeholder first: once the text is lowercased and
    # its brackets are stripped below, this pattern can no longer match.
    text = re.sub(r'\[Post removed at request of member\]\n?', ' ', text,
                  flags=re.IGNORECASE)

    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove phone numbers. The pattern also consumes surrounding (or purely)
    # whitespace, so substitute a space rather than '' to keep the neighbouring
    # words separated.
    text = re.sub(r'\+?[\d\s-]{10,}', ' ', text)

    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Collapse every whitespace run (newlines included) into a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words and keep only words with length > 2
    stop_words = _get_stop_words()
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Join tokens back into text and limit its length (adjust as needed)
    return ' '.join(tokens)[:2000]

def prepare_data(test_size=0.2, random_state=42):
    """
    Enhanced data preparation function with improved text cleaning.

    Reads ``train.csv`` and ``test.csv`` from ``config.input_dir``, builds a
    cleaned ``text`` column for both, stores balanced class weights on
    ``config.class_weights`` and splits the training rows into stratified
    train / validation parts.

    Args:
        test_size (float): Fraction of the training rows held out for
            validation.
        random_state (int): Seed passed to ``train_test_split``.

    Returns:
        tuple: ``(train_texts, train_labels, val_texts, val_labels,
        test_texts)``, each a list.

    Raises:
        ValueError: If ``train.csv`` has a target that is not listed in
            ``config.class_names``.
    """
    # Load data
    train_df = pd.read_csv(config.input_dir/'train.csv')
    test_df = pd.read_csv(config.input_dir/'test.csv')

    for df in (train_df, test_df):
        # Fill missing content with title or [MISSING]
        df['content'] = df['content'].fillna(df['title'].fillna('[MISSING]'))

        # Clean content and title separately, then combine them
        content_cleaned = df['content'].apply(clean_text)
        title_cleaned = df['title'].fillna('').apply(clean_text)
        combined = content_cleaned + " " + title_cleaned

        # Remove any double spaces that might have been created
        df['text'] = combined.str.strip().replace(r'\s+', ' ', regex=True)

    # Map label names to integer ids, failing with a readable message when the
    # data holds a label the configuration does not know about.
    known = train_df['target'].isin(config.class_names)
    if not known.all():
        unexpected = sorted(set(map(str, train_df.loc[~known, 'target'])))
        raise ValueError(
            f"Unknown target label(s) in train.csv: {unexpected}; "
            f"expected one of {config.class_names}"
        )
    label_ids = {name: idx for idx, name in enumerate(config.class_names)}
    labels = train_df['target'].map(label_ids)

    # Calculate class weights
    config.class_weights = torch.tensor(
        compute_class_weight('balanced', classes=np.unique(labels), y=labels),
        dtype=torch.float32
    )

    # Split data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_df['text'],
        labels.tolist(),
        test_size=test_size,
        stratify=train_df['target'],
        random_state=random_state
    )

    return (train_texts.tolist(), train_labels,
            val_texts.tolist(), val_labels,
            test_df['text'].tolist())

In [17]:
class MentalHealthDataset(Dataset):
    """Torch dataset that tokenizes one text sample per item.

    Each item is a dict with the flattened ``input_ids`` and
    ``attention_mask`` tensors produced by the tokenizer, plus the sample's
    label as a ``torch.long`` tensor under ``labels``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the samples, their labels, the tokenizer and the padded length."""
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples (length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors together with its label."""
        sample_text = str(self.texts[idx])

        # Pad/truncate every sample to exactly max_length tokens.
        encode_options = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **encode_options)

        # Flatten the tokenizer's tensors to 1-D, one entry per token.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [18]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. A loss counts
    as an improvement when it is at most ``best_loss - min_delta``; after
    ``patience`` consecutive non-improving calls ``early_stop`` becomes True.

    Attributes:
        patience: Number of consecutive non-improving epochs tolerated.
        min_delta: Minimum decrease of the loss that counts as an improvement.
        counter: Consecutive non-improving epochs seen so far.
        best_loss: Lowest loss accepted so far (``None`` before the first one).
        early_stop: Set to True once ``counter`` reaches ``patience``.
    """

    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record one epoch's validation loss and update ``early_stop``."""
        # NaN is the only value not equal to itself. Every comparison with NaN
        # is False, so without this check a NaN loss would be treated as an
        # improvement, overwrite best_loss and disable early stopping for good.
        is_nan = val_loss != val_loss

        if not is_nan and self.best_loss is None:
            self.best_loss = val_loss
            return

        if is_nan or val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0



def _train_one_epoch(model, train_loader, criterion, optimizer, scheduler):
    """Run one optimisation pass over ``train_loader``; return the mean batch loss."""
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        inputs = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**inputs)
        # Use the class-weighted criterion rather than the loss the model
        # computes itself.
        loss = criterion(outputs.logits, inputs['labels'])

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    return total_loss / len(train_loader)


def _evaluate(model, val_loader, criterion):
    """Evaluate on ``val_loader``; return (mean batch loss, predictions, labels)."""
    model.eval()
    val_preds = []
    val_labels_list = []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            inputs = {k: v.to(model.device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = criterion(outputs.logits, inputs['labels'])
            val_loss += loss.item()

            val_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            val_labels_list.extend(inputs['labels'].cpu().numpy())

    return val_loss / len(val_loader), val_preds, val_labels_list


def train():
    """Fine-tune RoBERTa on ``train.csv`` and save the best model by macro F1.

    Reads ``train.csv`` from ``config.input_dir``, holds out a stratified 15%
    validation split, trains for up to ``config.epochs`` epochs with a
    class-weighted cross-entropy loss, writes per-epoch checkpoints, keeps the
    weights with the highest validation macro F1 in ``config.model_save_path``
    and stops early when the validation loss stops improving. When
    ``config.resume_training`` is set and a checkpoint exists, training
    continues from it. The loss curves are saved to ``learning_curves.png``
    in ``config.output_dir``.
    """
    print("Checking files...")
    assert (config.input_dir/'train.csv').exists(), "train.csv missing!"
    assert (config.input_dir/'test.csv').exists(), "test.csv missing!"

    # Initialize checkpoint handler
    checkpoint_handler = CheckpointHandler(config)

    # Load and prepare data
    print("Loading and preparing data...")
    train_df = pd.read_csv(config.input_dir/'train.csv')

    # Build the text column the same way prepare_data() does, because
    # predict() gets its test text from prepare_data(): missing content falls
    # back to the title (instead of becoming the literal string "nan") and
    # content / title are cleaned separately before being joined.
    content = train_df['content'].fillna(train_df['title'].fillna('[MISSING]'))
    title = train_df['title'].fillna('')
    train_df['text'] = (
        (content.apply(clean_text) + " " + title.apply(clean_text))
        .str.strip()
        .replace(r'\s+', ' ', regex=True)
    )

    # Split data with stratification
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_df['text'],
        train_df['target'].map(config.class_names.index),
        test_size=0.15,
        stratify=train_df['target'],
        random_state=42
    )

    # Initialize tokenizer and datasets
    tokenizer = RobertaTokenizer.from_pretrained(config.model_name)

    train_dataset = MentalHealthDataset(
        train_texts.tolist(),
        train_labels.tolist(),
        tokenizer,
        config.max_length
    )
    val_dataset = MentalHealthDataset(
        val_texts.tolist(),
        val_labels.tolist(),
        tokenizer,
        config.max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.train_batch_size,
        shuffle=True
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.valid_batch_size
    )

    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = RobertaForSequenceClassification.from_pretrained(
        config.model_name,
        num_labels=config.num_labels,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob
    )
    model = model.to(device)

    # Calculate class weights from the training split only
    class_weights = torch.tensor(
        compute_class_weight('balanced',
                             classes=np.unique(train_labels),
                             y=train_labels),
        dtype=torch.float32
    )

    # Initialize optimizer and scheduler. torch.optim.AdamW replaces the
    # deprecated transformers.AdamW; eps=1e-6 keeps the epsilon that class
    # used by default.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        eps=1e-6,
        weight_decay=config.weight_decay
    )

    total_steps = len(train_loader) * config.epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Initialize training state variables
    start_epoch = 0
    train_losses = []
    val_losses = []
    best_val_f1 = 0

    # Load checkpoint if available and resume_training is True
    if config.resume_training:
        checkpoint = checkpoint_handler.load_latest_checkpoint()
        if checkpoint is not None:
            print(f"Resuming from epoch {checkpoint['epoch']}")
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            # Checkpoints store the 1-based number of the finished epoch,
            # which is also the 0-based index of the next epoch to run.
            start_epoch = checkpoint['epoch']
            train_losses = checkpoint['train_losses']
            val_losses = checkpoint['val_losses']
            best_val_f1 = checkpoint['best_val_f1']

    criterion = torch.nn.CrossEntropyLoss(
        weight=class_weights.to(model.device)
    )

    early_stopping = EarlyStopping(
        patience=config.patience,
        min_delta=config.min_delta
    )
    # Replay the restored loss history so the patience counter continues from
    # where the interrupted run left off instead of starting again at zero.
    for past_val_loss in val_losses:
        early_stopping(past_val_loss)

    epochs_to_run = range(start_epoch, config.epochs)
    if early_stopping.early_stop:
        print("Early stopping triggered!")
        epochs_to_run = range(0)

    # Training loop
    for epoch in epochs_to_run:
        print(f"\nEpoch {epoch+1}/{config.epochs}")
        print("-" * 30)

        # Training phase
        avg_train_loss = _train_one_epoch(
            model, train_loader, criterion, optimizer, scheduler
        )
        train_losses.append(avg_train_loss)

        # Validation phase
        avg_val_loss, val_preds, val_labels_list = _evaluate(
            model, val_loader, criterion
        )
        val_losses.append(avg_val_loss)

        # Calculate metrics (float() so no NumPy scalar ends up in a checkpoint)
        val_acc = accuracy_score(val_labels_list, val_preds)
        val_f1 = float(f1_score(val_labels_list, val_preds, average='macro'))

        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}")
        print(f"Val Accuracy: {val_acc:.4f}")
        print(f"Val F1-score: {val_f1:.4f}")

        # Save best model. This runs before the checkpoint is written so the
        # checkpoint records the up-to-date best score; otherwise a resumed
        # run could overwrite the best model with a worse one.
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), config.model_save_path)
            print("New best model saved!")

        # Save checkpoint if needed
        if (epoch + 1) % config.checkpoint_freq == 0:
            checkpoint_handler.save_checkpoint(
                epoch + 1,
                model,
                optimizer,
                scheduler,
                train_losses,
                val_losses,
                best_val_f1
            )
            print(f"Checkpoint saved for epoch {epoch + 1}")

            # Clean old checkpoints
            checkpoint_handler.clean_old_checkpoints(epoch + 1)

        # Early stopping check
        early_stopping(avg_val_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered!")
            break

    # Plot learning curves
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Learning Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(config.output_dir / 'learning_curves.png')
    plt.close()


In [19]:
# Updated prediction function
def predict():
    """Predict a class for every row of ``test.csv`` and write ``submission.csv``.

    Loads the weights stored at ``config.model_save_path`` into a fresh
    RoBERTa classifier, runs it over the cleaned test texts returned by
    ``prepare_data()`` and saves an ``id`` / ``target`` table to
    ``config.output_dir``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load best model
    model = RobertaForSequenceClassification.from_pretrained(
        config.model_name,
        num_labels=config.num_labels
    )
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(config.model_save_path, map_location=device))
    model = model.to(device)

    # Prepare test data
    tokenizer = RobertaTokenizer.from_pretrained(config.model_name)
    _, _, _, _, test_texts = prepare_data()

    # Create dataset; the labels are placeholders and are never fed to the model
    test_dataset = MentalHealthDataset(test_texts, [0]*len(test_texts), tokenizer, config.max_length)
    test_loader = DataLoader(test_dataset, batch_size=config.valid_batch_size)

    # Predict
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting"):
            inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**inputs)
            predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())

    # Save results
    test_df = pd.read_csv(config.input_dir/'test.csv')
    assert len(predictions) == len(test_df), "prediction count does not match test.csv rows"
    submission = pd.DataFrame({
        'id': test_df['id'],
        'target': [config.class_names[p] for p in predictions]
    })
    submission.to_csv(config.output_dir/'submission.csv', index=False)
    print("Predictions saved to submission.csv")



In [None]:
# Execution: train the model, then write predictions for the test set.
if __name__ == '__main__':
    # parents=True so a missing parent directory does not abort the run.
    config.output_dir.mkdir(parents=True, exist_ok=True)
    train()
    predict()

Checking files...
Loading and preparing data...
