In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    f1_score,
    roc_auc_score,
)
import warnings

# Suppress all warnings
# NOTE: with no category/module filter this hides every warning raised after
# this point, so library deprecation or fitting warnings will not be shown.
warnings.filterwarnings('ignore')

# --- Configuration ---
# Define paths to the data files.
# Location of the training CSV. The main block loads it from here, and
# create_dummy_data() writes a placeholder file to this path when none exists.
TRAIN_CSV_PATH = '/content/jigsaw-toxic-comment-train.csv'

# --- Data Loading and Preparation ---
def create_dummy_data(train_csv_path=None):
    """Create a small placeholder training CSV for local testing.

    The file holds ten hand-written comments with a binary ``toxic`` label.
    It is only written when nothing exists at the destination, so a real
    dataset at the same location is never overwritten.

    Args:
        train_csv_path (str, optional): Where to write the CSV. Defaults to
            the module-level ``TRAIN_CSV_PATH``.
    """
    if train_csv_path is None:
        train_csv_path = TRAIN_CSV_PATH

    print("Creating dummy data for local execution...")
    # A bare filename has an empty dirname and os.makedirs('') raises, so the
    # parent directory is only created when there is one. exist_ok=True keeps
    # the call safe if the directory already exists.
    parent_dir = os.path.dirname(train_csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Create dummy train.csv
    if not os.path.exists(train_csv_path):
        train_data = {
            'comment_text': [
                "This is a toxic comment.",
                "I love this product!",
                "You are an idiot.",
                "What a beautiful day.",
                "Go kill yourself.",
                "Hello world.",
                "This is terrible.",
                "Thank you for your help.",
                "You are the worst.",
                "Nice work!",
            ],
            # Alternating labels keep the two classes perfectly balanced.
            'toxic': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
        }
        train_df = pd.DataFrame(train_data)
        train_df.to_csv(train_csv_path, index=False)
        print(f"'{train_csv_path}' created.")

def load_data(train_csv_path):
    """Read the training CSV into a DataFrame.

    Args:
        train_csv_path (str): Path to the training CSV file.

    Returns:
        pandas.DataFrame or None: The parsed data, or ``None`` when no file
        exists at ``train_csv_path`` (an error message is printed instead of
        raising).
    """
    print("Loading data...")
    try:
        frame = pd.read_csv(train_csv_path)
    except FileNotFoundError:
        # A missing file is reported and signalled with None; any other
        # read error still propagates to the caller.
        print(f"Error: Training CSV not found at '{train_csv_path}'.")
        frame = None
    else:
        print("Data loading complete.")
    return frame

# --- Main Execution Logic ---
if __name__ == "__main__":
    # End-to-end baseline: load the comments, hold out 20% for validation,
    # fit a TF-IDF + logistic regression pipeline and print validation metrics.

    # Create dummy files if they don't exist (for local runs)
    if not os.path.exists(TRAIN_CSV_PATH):
        create_dummy_data()

    # 1. Load Data
    train_df = load_data(TRAIN_CSV_PATH)

    # load_data() returns None when the CSV is missing; in that case it has
    # already printed the error and there is nothing left to do.
    if train_df is not None:
        # Define features (X) and target (y).
        # pandas reads empty CSV cells as NaN and TfidfVectorizer rejects NaN
        # documents, so missing comments are treated as empty strings and
        # every value is coerced to str before vectorization.
        X = train_df['comment_text'].fillna('').astype(str)
        y = train_df['toxic']

        # 2. Split Data into Training and Validation Sets
        print("Splitting data into training and validation sets (80/20 split)...")
        X_train, X_val, y_train, y_val = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42,
            stratify=y  # Keep the class distribution equal in the train/validation splits
        )
        print(f"Training set size: {len(X_train)}")
        print(f"Validation set size: {len(X_val)}")

        # 3. Define the Model Pipeline
        print("Defining the model pipeline...")
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                ngram_range=(1, 2), # Use unigrams and bigrams
                max_features=20000, # Limit the number of features
                stop_words='english'
            )),
            ('clf', LogisticRegression(
                solver='liblinear', # Good for smaller datasets and L1/L2 regularization
                random_state=42,
                C=0.5 # Inverse regularization strength (smaller value = stronger regularization)
            ))
        ])

        # 4. Train the Model
        print("Training the model...")
        pipeline.fit(X_train, y_train)
        print("Model training complete.")

        # 5. Make Predictions on the Validation Set
        print("Making predictions on the validation set...")
        # Hard labels feed accuracy/F1; probabilities feed log loss and ROC AUC.
        y_pred = pipeline.predict(X_val)
        y_pred_proba = pipeline.predict_proba(X_val)[:, 1] # Get probabilities for the positive class (toxic)
        print("Predictions complete.")

        # 6. Evaluate the Model
        print("\n--- Model Evaluation Results ---")
        # Calculate metrics
        accuracy = accuracy_score(y_val, y_pred)
        loss = log_loss(y_val, y_pred_proba)
        f1_macro = f1_score(y_val, y_pred, average='macro')
        f1_weighted = f1_score(y_val, y_pred, average='weighted')
        # The target is binary, so this is the plain ROC AUC of the positive
        # class; no one-vs-rest averaging is involved despite the variable name.
        roc_auc_ovr = roc_auc_score(y_val, y_pred_proba)

        # Create a DataFrame for a clean display of results
        results_df = pd.DataFrame({
            'Metric': [
                'Accuracy',
                'Log Loss',
                'F1 Score (Macro)',
                'F1 Score (Weighted)',
                'ROC AUC'
            ],
            'Score': [
                accuracy,
                loss,
                f1_macro,
                f1_weighted,
                roc_auc_ovr
            ]
        })
        print(results_df.to_string(index=False))
        print("--------------------------------\n")

Loading data...
Data loading complete.
Splitting data into training and validation sets (80/20 split)...
Training set size: 178839
Validation set size: 44710
Defining the model pipeline...
Training the model...
Model training complete.
Making predictions on the validation set...
Predictions complete.

--- Model Evaluation Results ---
             Metric    Score
           Accuracy 0.949631
           Log Loss 0.133818
   F1 Score (Macro) 0.822126
F1 Score (Weighted) 0.943912
            ROC AUC 0.966764
--------------------------------

