In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
import datetime
import spacy
from nltk.corpus import wordnet
import nltk

# Download required NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')

class MentalHealthDataGenerator:
    def __init__(self):
        # Load spaCy for text processing
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except:
            # If spaCy model is not installed, download it
            import os
            os.system('python -m spacy download en_core_web_sm')
            self.nlp = spacy.load('en_core_web_sm')

        # Mental health related patterns and phrases
        self.patterns = {
            'depression': {
                'idioms': [
                    "under a dark cloud", "down in the dumps", "at rock bottom",
                    "in a black hole", "carrying the weight of the world",
                    "fighting inner demons", "lost in darkness"
                ],
                'phrases': [
                    "I feel so {}", "can't seem to {}", "everything feels {}",
                    "no energy to {}", "struggling to {}", "lost interest in {}"
                ],
                'symptoms': [
                    "insomnia", "oversleeping", "loss of appetite", "overeating",
                    "constant fatigue", "difficulty concentrating", "feeling worthless",
                    "guilt", "physical pain", "suicidal thoughts"
                ],
                'emotions': [
                    "hopeless", "empty", "worthless", "guilty", "numb",
                    "sad", "miserable", "exhausted", "lonely", "defeated"
                ]
            },
            'anxiety': {
                'idioms': [
                    "on edge", "bundle of nerves", "butterflies in stomach",
                    "jumping out of skin", "mind racing", "heart in mouth",
                    "walking on eggshells"
                ],
                'phrases': [
                    "can't stop {}", "worried about {}", "what if {}",
                    "feeling overwhelmed by {}", "scared that {}", "nervous about {}"
                ],
                'symptoms': [
                    "racing heart", "sweating", "trembling", "chest pain",
                    "shortness of breath", "dizziness", "nausea", "panic attacks",
                    "restlessness", "muscle tension"
                ],
                'emotions': [
                    "anxious", "worried", "panicked", "stressed", "overwhelmed",
                    "fearful", "nervous", "uneasy", "tense", "restless"
                ]
            },
            'normal': {
                'idioms': [
                    "on cloud nine", "in high spirits", "full of beans",
                    "bright eyed and bushy tailed", "right as rain",
                    "feeling on top of the world"
                ],
                'phrases': [
                    "enjoying {}", "grateful for {}", "happy about {}",
                    "looking forward to {}", "excited about {}", "blessed with {}"
                ],
                'emotions': [
                    "happy", "content", "peaceful", "energetic", "motivated",
                    "grateful", "optimistic", "relaxed", "balanced", "satisfied"
                ]
            }
        }

        # Social media elements
        self.social_media_elements = {
            'hashtags': {
                'depression': ['#depression', '#mentalhealth', '#depressed', '#anxiety',
                             '#mentalhealthawareness', '#sad', '#depressing'],
                'anxiety': ['#anxiety', '#panic', '#stress', '#mentalhealth',
                           '#anxious', '#worried', '#overthinking'],
                'normal': ['#blessed', '#grateful', '#happy', '#positive',
                          '#goodvibes', '#motivation', '#peace']
            },
            'emojis': {
                'depression': ['😢', '😔', '😪', '💔', '😕', '😿'],
                'anxiety': ['😰', '😨', '😱', '😖', '😣', '😩'],
                'normal': ['😊', '😄', '🙏', '💪', '✨', '🌟']
            }
        }

    def create_post(self, condition):
        """Create a synthetic social media post"""
        patterns = self.patterns[condition]

        # Base structure
        components = []

        # Add idiom
        if random.random() < 0.3:
            components.append(random.choice(patterns['idioms']))

        # Add main phrase
        if 'phrases' in patterns:
            phrase = random.choice(patterns['phrases'])
            emotion = random.choice(patterns['emotions'])
            components.append(phrase.format(emotion))

        # Add symptom for mental health conditions
        if condition != 'normal' and random.random() < 0.4:
            components.append(f"Experiencing {random.choice(patterns['symptoms'])}")

        base_text = " ".join(components)

        # Add social media elements
        if random.random() < 0.4:
            hashtags = random.sample(
                self.social_media_elements['hashtags'][condition],
                k=random.randint(1, 3)
            )
            base_text += " " + " ".join(hashtags)

        if random.random() < 0.3:
            emojis = random.sample(
                self.social_media_elements['emojis'][condition],
                k=random.randint(1, 2)
            )
            base_text += " " + "".join(emojis)

        return base_text

    def generate_dataset(self, n_samples=10000):
        """Generate complete dataset"""
        data = []
        current_time = datetime.datetime.now()

        for _ in range(n_samples):
            condition = random.choice(['depression', 'anxiety', 'normal'])
            text = self.create_post(condition)

            # Generate timestamp
            random_days = random.randint(0, 365)
            timestamp = current_time - datetime.timedelta(days=random_days)

            # Generate engagement metrics
            base_engagement = np.random.normal(50, 20)
            engagement_multiplier = random.uniform(0.8, 1.2)
            likes = max(0, int(base_engagement * engagement_multiplier))
            shares = max(0, int(likes * random.uniform(0.1, 0.3)))

            # Create entry
            entry = {
                'text': text,
                'condition': condition,
                'timestamp': timestamp,
                'likes': likes,
                'shares': shares,
                'word_count': len(text.split()),
                'char_count': len(text),
                'has_hashtags': '#' in text,
                'hashtag_count': text.count('#'),
                'has_emojis': any(emoji in text for emoji in ''.join(
                    sum(self.social_media_elements['emojis'].values(), [])
                ))
            }

            data.append(entry)

        return pd.DataFrame(data)

def prepare_dataset():
    """Prepare and split dataset"""
    generator = MentalHealthDataGenerator()
    df = generator.generate_dataset()

    # Convert condition to numeric labels
    condition_map = {'normal': 0, 'anxiety': 1, 'depression': 2}
    df['label'] = df['condition'].map(condition_map)

    # Add time-based features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek

    # Calculate engagement ratio
    df['engagement_ratio'] = (df['likes'] + df['shares']) / (df['word_count'] + 1)

    # Print dataset statistics
    print("\nDataset Statistics:")
    print(f"Total samples: {len(df)}")
    print("\nClass distribution:")
    print(df['condition'].value_counts())

    # Split dataset
    train_df, temp_df = train_test_split(
        df,
        test_size=0.3,
        random_state=42,
        stratify=df['label']
    )

    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        random_state=42,
        stratify=temp_df['label']
    )

    print("\nDataset splits:")
    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(val_df)}")
    print(f"Test samples: {len(test_df)}")

    return train_df, val_df, test_df, list(condition_map.keys())

if __name__ == "__main__":
    train_df, val_df, test_df, classes = prepare_dataset()

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...

Dataset Statistics:
Total samples: 10000

Class distribution:
condition
anxiety       3366
depression    3348
normal        3286
Name: count, dtype: int64

Dataset splits:
Training samples: 7000
Validation samples: 1500
Test samples: 1500


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from tqdm import tqdm
import warnings
import math
warnings.filterwarnings('ignore')

class MentalHealthDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.texts = df['text'].values
        self.labels = df['label'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Prepare numerical features
        numerical_features = [
            'likes', 'shares', 'word_count', 'char_count',
            'hashtag_count', 'hour', 'day_of_week', 'engagement_ratio',
            'has_hashtags', 'has_emojis'
        ]

        # Normalize numerical features safely
        features = df[numerical_features].values
        features = features.astype(np.float32)  # Convert to float32
        mean = np.nanmean(features, axis=0)
        std = np.nanstd(features, axis=0)
        std[std == 0] = 1  # Prevent division by zero
        self.features = (features - mean) / std

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'features': torch.tensor(self.features[idx], dtype=torch.float32),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class MentalHealthClassifier(nn.Module):
    def __init__(self, n_classes=3, dropout_rate=0.3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Freeze BERT layers except last 2
        for param in self.bert.parameters():
            param.requires_grad = False

        for layer in self.bert.transformer.layer[-2:]:
            for param in layer.parameters():
                param.requires_grad = True

        self.text_features = nn.Sequential(
            nn.Linear(768, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.numerical_features = nn.Sequential(
            nn.Linear(10, 32),
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.Dropout(dropout_rate/2),
            nn.Linear(32, 16),
            nn.LayerNorm(16),
            nn.ReLU()
        )

        self.classifier = nn.Sequential(
            nn.Linear(144, 64),  # 128 + 16 = 144
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, n_classes)
        )

    def forward(self, input_ids, attention_mask, features):
        # Process text through BERT
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_features(bert_output.last_hidden_state[:, 0])

        # Process numerical features
        numerical_output = self.numerical_features(features)

        # Combine features
        combined = torch.cat([text_features, numerical_output], dim=1)

        return self.classifier(combined)

def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    progress_bar = tqdm(train_loader, desc='Training')

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        features = batch['features'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, features)
        loss = criterion(outputs, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        progress_bar.set_postfix({
            'loss': total_loss / (progress_bar.n + 1),
            'acc': 100. * correct / total
        })

    return total_loss / len(train_loader), 100. * correct / total

def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            features = batch['features'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, features)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = 100. * correct / total
    avg_loss = total_loss / len(data_loader)

    return {
        'loss': avg_loss,
        'accuracy': accuracy,
        'predictions': predictions,
        'true_labels': true_labels
    }

def train_model(train_df, val_df, test_df, classes, num_epochs=10):
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Initialize tokenizer and model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = MentalHealthClassifier(len(classes)).to(device)

    # Create datasets and dataloaders
    train_dataset = MentalHealthDataset(train_df, tokenizer)
    val_dataset = MentalHealthDataset(val_df, tokenizer)
    test_dataset = MentalHealthDataset(test_df, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Initialize loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW([
        {'params': model.bert.parameters(), 'lr': 1e-5},
        {'params': model.text_features.parameters(), 'lr': 2e-4},
        {'params': model.numerical_features.parameters(), 'lr': 2e-4},
        {'params': model.classifier.parameters(), 'lr': 2e-4}
    ], weight_decay=0.01)

    # Training loop
    best_val_acc = 0
    patience = 3
    patience_counter = 0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    try:
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")

            # Train
            train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)

            # Validate
            val_metrics = evaluate(model, val_loader, criterion, device)

            # Store metrics
            history['train_loss'].append(train_loss)
            history['train_acc'].append(train_acc)
            history['val_loss'].append(val_metrics['loss'])
            history['val_acc'].append(val_metrics['accuracy'])

            print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
            print(f"Val Loss: {val_metrics['loss']:.4f} | Val Acc: {val_metrics['accuracy']:.2f}%")

            # Save best model
            if val_metrics['accuracy'] > best_val_acc:
                best_val_acc = val_metrics['accuracy']
                torch.save(model.state_dict(), 'best_mental_health_model.pth')
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    except Exception as e:
        print(f"Error during training: {str(e)}")
        raise

    # Load best model and evaluate on test set
    model.load_state_dict(torch.load('best_mental_health_model.pth'))
    test_metrics = evaluate(model, test_loader, criterion, device)

    print("\nTest Results:")
    print(f"Test Loss: {test_metrics['loss']:.4f}")
    print(f"Test Accuracy: {test_metrics['accuracy']:.2f}%")

    return model, history, test_metrics

if __name__ == "__main__":
    # Get data from Part 1
    train_df, val_df, test_df, classes = prepare_dataset()

    # Train model
    model, history, test_metrics = train_model(train_df, val_df, test_df, classes)



Dataset Statistics:
Total samples: 10000

Class distribution:
condition
anxiety       3404
normal        3317
depression    3279
Name: count, dtype: int64

Dataset splits:
Training samples: 7000
Validation samples: 1500
Test samples: 1500
Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


Epoch 1/10


Training: 100%|██████████| 438/438 [00:26<00:00, 16.58it/s, loss=0.173, acc=94.6]


Train Loss: 0.1730 | Train Acc: 94.63%
Val Loss: 0.0060 | Val Acc: 100.00%

Epoch 2/10


Training: 100%|██████████| 438/438 [00:25<00:00, 17.17it/s, loss=0.00789, acc=100]


Train Loss: 0.0079 | Train Acc: 100.00%
Val Loss: 0.0015 | Val Acc: 100.00%

Epoch 3/10


Training: 100%|██████████| 438/438 [00:25<00:00, 17.24it/s, loss=0.00302, acc=100]


Train Loss: 0.0030 | Train Acc: 100.00%
Val Loss: 0.0006 | Val Acc: 100.00%

Epoch 4/10


Training: 100%|██████████| 438/438 [00:25<00:00, 17.22it/s, loss=0.00164, acc=100]


Train Loss: 0.0016 | Train Acc: 100.00%
Val Loss: 0.0003 | Val Acc: 100.00%
Early stopping triggered

Test Results:
Test Loss: 0.0060
Test Accuracy: 100.00%


In [4]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import pandas as pd
from typing import Dict, List, Tuple, Union
import plotly.express as px
import plotly.graph_objects as go
import json
import logging
from datetime import datetime

class MentalHealthPredictor:
    def __init__(self, model_path: str = 'best_mental_health_model.pth', device=None):
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = MentalHealthClassifier().to(self.device)
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.eval()
        self.classes = ['normal', 'anxiety', 'depression']

        logging.basicConfig(
            filename=f'mental_health_predictions_{datetime.now().strftime("%Y%m%d")}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def preprocess_text(self, text: str) -> Dict[str, torch.Tensor]:
        """Preprocess input text for prediction"""
        try:
            # Tokenize text
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=128,
                return_tensors='pt'
            )

            # Create dummy numerical features with correct dimensions
            features = torch.zeros((1, 10), dtype=torch.float32)  # Changed to 2D tensor

            return {
                'input_ids': encoding['input_ids'].to(self.device),
                'attention_mask': encoding['attention_mask'].to(self.device),
                'features': features.to(self.device)
            }
        except Exception as e:
            self.logger.error(f"Error in preprocessing text: {str(e)}")
            raise

    def predict(self, text: str) -> Dict:
        """Make prediction for input text"""
        try:
            self.logger.info(f"Processing text: {text}")

            inputs = self.preprocess_text(text)

            with torch.no_grad():
                outputs = self.model(**inputs)
                probabilities = torch.softmax(outputs, dim=1)
                prediction = torch.argmax(probabilities, dim=1)

            predicted_class = self.classes[prediction.item()]
            probabilities = probabilities.squeeze().cpu().numpy()

            result = {
                'text': text,
                'prediction': predicted_class,
                'confidence': float(probabilities[prediction.item()]),
                'probabilities': {
                    class_name: float(prob)
                    for class_name, prob in zip(self.classes, probabilities)
                }
            }

            self.logger.info(f"Prediction result: {json.dumps(result, indent=2)}")
            return result

        except Exception as e:
            self.logger.error(f"Error in prediction: {str(e)}")
            raise

class ModelEvaluator:
    def __init__(self, model, test_loader, device, classes):
        self.model = model
        self.test_loader = test_loader
        self.device = device
        self.classes = classes

    def evaluate_model(self) -> Dict:
        """Perform comprehensive model evaluation"""
        self.model.eval()
        predictions = []
        true_labels = []
        probabilities = []

        with torch.no_grad():
            for batch in self.test_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                features = batch['features'].to(self.device)
                labels = batch['labels']

                outputs = self.model(input_ids, attention_mask, features)
                probs = torch.softmax(outputs, dim=1)
                _, preds = outputs.max(1)

                predictions.extend(preds.cpu().numpy())
                true_labels.extend(labels.numpy())
                probabilities.extend(probs.cpu().numpy())

        return self.calculate_metrics(predictions, true_labels, probabilities)

    def calculate_metrics(self, predictions: List, true_labels: List,
                         probabilities: List) -> Dict:
        """Calculate various evaluation metrics"""
        report = classification_report(true_labels, predictions,
                                    target_names=self.classes, output_dict=True)
        cm = confusion_matrix(true_labels, predictions)

        roc_curves = {}
        for i, class_name in enumerate(self.classes):
            fpr, tpr, _ = roc_curve(
                [1 if label == i else 0 for label in true_labels],
                [prob[i] for prob in probabilities]
            )
            roc_curves[class_name] = {
                'fpr': fpr.tolist(),
                'tpr': tpr.tolist(),
                'auc': auc(fpr, tpr)
            }

        return {
            'classification_report': report,
            'confusion_matrix': cm,
            'roc_curves': roc_curves
        }

class ResultVisualizer:
    @staticmethod
    def plot_confusion_matrix(cm: np.ndarray, classes: List[str]):
        """Plot confusion matrix using plotly"""
        fig = go.Figure(data=go.Heatmap(
            z=cm,
            x=classes,
            y=classes,
            colorscale='Viridis',
            text=cm,
            texttemplate="%{text}",
            textfont={"size": 16},
            hoverongaps=False))

        fig.update_layout(
            title='Confusion Matrix',
            xaxis_title='Predicted Label',
            yaxis_title='True Label',
            width=600,
            height=500
        )

        return fig

    @staticmethod
    def plot_roc_curves(roc_curves: Dict):
        """Plot ROC curves for all classes"""
        fig = go.Figure()

        for class_name, curve in roc_curves.items():
            fig.add_trace(go.Scatter(
                x=curve['fpr'],
                y=curve['tpr'],
                name=f'{class_name} (AUC = {curve["auc"]:.3f})',
                mode='lines'
            ))

        fig.add_trace(go.Scatter(
            x=[0, 1],
            y=[0, 1],
            name='Random',
            mode='lines',
            line=dict(dash='dash', color='gray')
        ))

        fig.update_layout(
            title='ROC Curves',
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            width=700,
            height=500,
            showlegend=True
        )

        return fig

    @staticmethod
    def plot_prediction_distribution(predictions: List[Dict]):
        """Plot distribution of predictions"""
        pred_classes = [p['prediction'] for p in predictions]
        confidences = [p['confidence'] for p in predictions]

        fig = go.Figure()

        # Add histogram for predictions
        fig.add_trace(go.Histogram(
            x=pred_classes,
            name='Predictions'
        ))

        # Add box plot for confidences
        fig.add_trace(go.Box(
            y=confidences,
            x=pred_classes,
            name='Confidence'
        ))

        fig.update_layout(
            title='Prediction Distribution and Confidence',
            xaxis_title='Predicted Class',
            yaxis_title='Count / Confidence',
            width=800,
            height=500
        )

        return fig

def batch_predict(texts: List[str], predictor: MentalHealthPredictor) -> List[Dict]:
    """Perform batch prediction on multiple texts"""
    results = []
    for text in texts:
        try:
            result = predictor.predict(text)
            results.append(result)
            print(f"\nText: {text}")
            print(f"Predicted Condition: {result['prediction']}")
            print(f"Confidence: {result['confidence']:.2%}")
            print("\nProbabilities:")
            for condition, prob in result['probabilities'].items():
                print(f"{condition}: {prob:.2%}")
            print("-" * 80)
        except Exception as e:
            print(f"Error processing text: {text}")
            print(f"Error message: {str(e)}")
    return results

def main():
    try:
        # Initialize predictor
        predictor = MentalHealthPredictor()

        # Example texts for testing
        test_texts = [
            "Feeling really overwhelmed with work and can't stop worrying about deadlines #stress",
            "i am going to temple ✨",
            "Nothing matters anymore. Can't remember the last time I felt happy...",
            "Anxiety is through the roof today. Heart won't stop racing 😰",
            "Making progress on my goals and feeling positive about the future!"
        ]

        # Make predictions
        results = batch_predict(test_texts, predictor)

        # Create visualizations
        if results:
            visualizer = ResultVisualizer()
            dist_fig = visualizer.plot_prediction_distribution(results)
            dist_fig.show()

    except Exception as e:
        print(f"An error occurred in main: {str(e)}")

if __name__ == "__main__":
    main()



Text: Feeling really overwhelmed with work and can't stop worrying about deadlines #stress
Predicted Condition: anxiety
Confidence: 99.21%

Probabilities:
normal: 0.28%
anxiety: 99.21%
depression: 0.51%
--------------------------------------------------------------------------------

Text: i am going to temple ✨
Predicted Condition: normal
Confidence: 99.22%

Probabilities:
normal: 99.22%
anxiety: 0.32%
depression: 0.46%
--------------------------------------------------------------------------------

Text: Nothing matters anymore. Can't remember the last time I felt happy...
Predicted Condition: depression
Confidence: 99.34%

Probabilities:
normal: 0.36%
anxiety: 0.31%
depression: 99.34%
--------------------------------------------------------------------------------

Text: Anxiety is through the roof today. Heart won't stop racing 😰
Predicted Condition: anxiety
Confidence: 99.05%

Probabilities:
normal: 0.29%
anxiety: 99.05%
depression: 0.66%
----------------------------------------

In [5]:
!pip install gradio transformers torch


Collecting gradio
  Downloading gradio-5.14.0-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.0 (from gradio)
  Downloading gradio_client-1.7.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.1

In [6]:
import torch
from transformers import DistilBertTokenizer
import gradio as gr
import numpy as np
from datetime import datetime
import logging
import json

class MentalHealthPredictorInteractive:
    def __init__(self, model_path: str = 'best_mental_health_model.pth'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = MentalHealthClassifier().to(self.device)
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.eval()
        self.classes = ['normal', 'anxiety', 'depression']

        # Setup logging
        logging.basicConfig(
            filename=f'predictions_{datetime.now().strftime("%Y%m%d")}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def predict_text(self, text: str) -> dict:
        """Predict mental health condition from input text"""
        try:
            # Tokenize text
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=128,
                return_tensors='pt'
            )

            # Prepare features
            features = torch.zeros((1, 10), dtype=torch.float32)

            # Move to device
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)
            features = features.to(self.device)

            # Make prediction
            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask, features)
                probabilities = torch.softmax(outputs, dim=1)
                prediction = torch.argmax(probabilities, dim=1)

            # Get results
            predicted_class = self.classes[prediction.item()]
            probabilities = probabilities.squeeze().cpu().numpy()

            # Format results
            result = {
                'prediction': predicted_class,
                'confidence': float(probabilities[prediction.item()]),
                'probabilities': {
                    class_name: float(prob)
                    for class_name, prob in zip(self.classes, probabilities)
                }
            }

            return result

        except Exception as e:
            self.logger.error(f"Error in prediction: {str(e)}")
            return {'error': str(e)}

def create_gradio_interface():
    """Create Gradio interface for interactive predictions"""
    predictor = MentalHealthPredictorInteractive()

    def predict_and_format(text):
        result = predictor.predict_text(text)

        if 'error' in result:
            return f"Error: {result['error']}"

        # Format output
        output = f"Predicted Condition: {result['prediction']}\n"
        output += f"Confidence: {result['confidence']:.2%}\n\n"
        output += "Probabilities:\n"
        for condition, prob in result['probabilities'].items():
            output += f"{condition}: {prob:.2%}\n"

        return output

    # Create Gradio interface
    iface = gr.Interface(
        fn=predict_and_format,
        inputs=gr.Textbox(
            lines=3,
            placeholder="Enter your text here..."
        ),
        outputs=gr.Textbox(),
        title="Mental Health Text Analysis",
        description="Enter text to analyze the emotional content and predict mental health indicators.",
        examples=[
            ["Feeling really overwhelmed with work and can't stop worrying about deadlines"],
            ["Had a great day at the beach with friends! So grateful for these moments"],
            ["Nothing matters anymore. Can't remember the last time I felt happy"],
            ["Making progress on my goals and feeling positive about the future!"]
        ]
    )

    return iface

def manual_prediction():
    """Function for manual text input and prediction"""
    predictor = MentalHealthPredictorInteractive()

    print("\nMental Health Text Analysis")
    print("=" * 50)
    print("Enter 'quit' to exit")
    print("-" * 50)

    while True:
        # Get input
        text = input("\nEnter text to analyze: ").strip()

        if text.lower() == 'quit':
            break

        if not text:
            print("Please enter some text to analyze.")
            continue

        # Make prediction
        result = predictor.predict_text(text)

        if 'error' in result:
            print(f"\nError: {result['error']}")
            continue

        # Print results
        print("\nResults:")
        print("-" * 20)
        print(f"Predicted Condition: {result['prediction']}")
        print(f"Confidence: {result['confidence']:.2%}")
        print("\nProbabilities:")
        for condition, prob in result['probabilities'].items():
            print(f"{condition}: {prob:.2%}")

        print("\n" + "=" * 50)

if __name__ == "__main__":
    print("Choose interface mode:")
    print("1. Command Line Interface")
    print("2. Web Interface (Gradio)")

    choice = input("Enter your choice (1 or 2): ").strip()

    if choice == "1":
        manual_prediction()
    elif choice == "2":
        iface = create_gradio_interface()
        iface.launch()
    else:
        print("Invalid choice. Please run again and select 1 or 2.")


Choose interface mode:
1. Command Line Interface
2. Web Interface (Gradio)


Enter your choice (1 or 2):  2


* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://412a79c59e6d7c67d8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Created dataset file at: .gradio/flagged/dataset1.csv
