In [None]:
pip install pandas numpy faker

In [None]:
import pandas as pd
import numpy as np
import random
import uuid
from faker import Faker
from datetime import datetime, timedelta
import json
import os

class SchedulingBotDatasetGenerator:
    """Build synthetic conversation and calendar datasets for a scheduling bot.

    Every field of a record is drawn independently at random (``random``,
    ``uuid`` and ``Faker``), so columns are not correlated with each other;
    for example ``Intent_Label`` is not derived from ``Raw_Text``.

    Attributes:
        num_records: Number of rows produced for each dataset.
        fake: ``Faker`` instance used for timestamps, dates, times and
            free-text sentences.
        used_ids: Every identifier handed out so far by
            ``generate_unique_id``.
    """

    _INTENTS = (
        'offer_availability',
        'schedule_meeting',
        'reschedule',
        'cancel_meeting',
        'request_time_slot',
    )
    _SPEAKERS = ('Candidate', 'Recruiter', 'Bot')
    _CONVERSATION_TIMEZONES = ('UTC-5', 'UTC-8', 'UTC+1', 'UTC+5', 'UTC+8')
    _CALENDAR_TIMEZONES = ('UTC-5', 'UTC-8', 'UTC+1', 'UTC+5')

    _MEETING_TYPES = (
        'Technical Interview',
        'HR Screening',
        'Final Round',
        'Initial Discussion',
        'Follow-up Meeting',
        'Project Presentation',
    )
    _AVAILABILITY_STATUSES = (
        'Available', 'Booked', 'Tentative',
        'Blocked', 'Pending Confirmation',
    )
    _LOCATIONS = (
        'Zoom', 'Google Meet', 'Microsoft Teams',
        'In-Person', 'Hybrid', 'Phone Call',
    )

    # Raw message templates; ``{placeholders}`` are filled by
    # ``_render_message``.
    _MESSAGE_TEMPLATES = (
        # Availability offers
        "I'm available next {day} between {start_time} and {end_time}",
        "My schedule is open on {day} from {start_time} to {end_time}",
        "I have free slots on {day} around {time_range}",
        # Interview requests
        "Can we schedule a {duration} interview for {day}?",
        "I'm looking to book a {duration} meeting next week",
        "Would you have time for a {duration} discussion?",
        # Specific scheduling requests
        "I prefer morning/afternoon meetings on {day}",
        "Are there any open slots for a {meeting_type} next {day_period}?",
        "Looking for a {meeting_type} interview this week",
        # Rescheduling and modifications
        "I need to reschedule our previous meeting",
        "Can we move our discussion to a different time?",
        "My availability has changed. Let's find a new slot.",
        # Time zone and flexibility considerations
        "I'm in {timezone}. What times work best for you?",
        "Can we accommodate my {timezone} schedule?",
    )

    # Candidate values for each template placeholder. The key order fixes
    # the order in which random draws are made when rendering.
    _PLACEHOLDER_VALUES = {
        'day': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'),
        'start_time': ('9:00 AM', '10:30 AM', '2:00 PM', '3:30 PM'),
        'end_time': ('11:00 AM', '12:30 PM', '4:00 PM', '5:30 PM'),
        'time_range': ('morning', 'afternoon', 'evening'),
        'duration': ('30-minute', '45-minute', '60-minute'),
        'meeting_type': ('Technical', 'HR', 'Initial Screening', 'Final Round'),
        'day_period': ('week', 'weekend'),
        'timezone': ('EST', 'PST', 'CST', 'UTC'),
    }

    def __init__(self, num_records=10000):
        """Create a generator that emits ``num_records`` rows per dataset."""
        self.num_records = num_records
        self.fake = Faker()
        self.used_ids = set()

    def generate_unique_id(self):
        """Return a UUID4 string not previously returned by this instance."""
        while True:
            new_id = str(uuid.uuid4())
            if new_id not in self.used_ids:
                self.used_ids.add(new_id)
                return new_id

    def generate_conversation_dataset(self):
        """Return a DataFrame of ``num_records`` synthetic chat messages.

        Columns: Message_ID, Session_ID, Speaker, Timestamp, Raw_Text,
        Parsed_Date, Parsed_Time, Intent_Label, Entities (a JSON string),
        Context, Language, Sentiment.
        """
        conversation_data = []

        for _ in range(self.num_records):
            session_id = self.generate_unique_id()
            message_id = self.generate_unique_id()

            conversation_data.append({
                'Message_ID': message_id,
                'Session_ID': session_id,
                'Speaker': random.choice(self._SPEAKERS),
                'Timestamp': self._generate_timestamp(),
                # Render the placeholders per record; rendering the
                # templates once up front would limit the whole dataset to
                # one fixed sentence per template.
                'Raw_Text': self._render_message(
                    random.choice(self._MESSAGE_TEMPLATES)
                ),
                'Parsed_Date': self._generate_date(),
                'Parsed_Time': self.fake.time(),
                'Intent_Label': random.choice(self._INTENTS),
                'Entities': self._generate_entities(
                    self._CONVERSATION_TIMEZONES
                ),
                'Context': self.fake.sentence(),
                'Language': random.choice(['en', 'es', 'fr', 'de']),
                'Sentiment': random.choice(['positive', 'neutral', 'negative']),
            })

        return pd.DataFrame(conversation_data)

    def generate_calendar_dataset(self):
        """Return a DataFrame of ``num_records`` synthetic calendar events.

        Each event starts at a random datetime and lasts 30-120 minutes.
        ``End_Time`` stores only the time of day, so an event that crosses
        midnight has an ``End_Time`` earlier than its ``Start_Time``.
        """
        calendar_data = []

        for _ in range(self.num_records):
            event_id = self.generate_unique_id()
            recruiter_id = self.generate_unique_id()
            candidate_id = self.generate_unique_id()

            start_time = self._generate_datetime()
            end_time = start_time + timedelta(minutes=random.randint(30, 120))

            calendar_data.append({
                'Event_ID': event_id,
                'Recruiter_ID': recruiter_id,
                'Candidate_ID': candidate_id,
                'Date': start_time.date(),
                'Start_Time': start_time.time(),
                'End_Time': end_time.time(),
                'Availability_Status': random.choice(
                    self._AVAILABILITY_STATUSES
                ),
                'Meeting_Duration': (end_time - start_time).total_seconds() / 60,
                'TimeZone': random.choice(self._CALENDAR_TIMEZONES),
                'Meeting_Type': random.choice(self._MEETING_TYPES),
                'Invite_Status': random.choice([
                    'Sent', 'Pending', 'Accepted', 'Declined'
                ]),
                'Location': random.choice(self._LOCATIONS),
                'Department': random.choice([
                    'Engineering', 'Sales', 'Marketing',
                    'Product', 'Customer Success'
                ]),
                'Priority': random.choice(['High', 'Medium', 'Low']),
            })

        return pd.DataFrame(calendar_data)

    def _render_message(self, template):
        """Fill every ``{placeholder}`` in ``template`` with a random value.

        A value is drawn for every known placeholder, whether or not the
        template uses it; unused values are ignored by ``str.format``.
        """
        values = {
            name: random.choice(options)
            for name, options in self._PLACEHOLDER_VALUES.items()
        }
        return template.format(**values)

    def _generate_message_templates(self):
        """Return one randomly rendered message for each template."""
        return [
            self._render_message(template)
            for template in self._MESSAGE_TEMPLATES
        ]

    def _generate_entities(self, timezones):
        """Return a JSON string of random entities.

        Args:
            timezones: Sequence from which the ``TimeZone`` value is drawn.

        Returns:
            JSON object text with keys Duration, TimeZone, Day and TimeSlot.
        """
        return json.dumps({
            'Duration': random.choice(['30 min', '45 min', '60 min']),
            'TimeZone': random.choice(timezones),
            'Day': random.choice(
                ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
            ),
            'TimeSlot': random.choice(['Morning', 'Afternoon', 'Evening']),
        })

    def _generate_timestamp(self):
        """Return a random datetime within one year either side of now.

        Values are random, not guaranteed unique.
        """
        return self.fake.date_time_between(start_date='-1y', end_date='+1y')

    def _generate_date(self):
        """Return a random date within one year either side of today.

        Values are random, not guaranteed unique.
        """
        return self.fake.date_between(start_date='-1y', end_date='+1y')

    def _generate_datetime(self):
        """Return a random datetime within one year either side of now.

        Values are random, not guaranteed unique.
        """
        return self.fake.date_time_between(start_date='-1y', end_date='+1y')

    def save_datasets(self, output_dir='scheduling_bot_datasets'):
        """Generate both datasets and write them to CSV files.

        Args:
            output_dir: Directory for ``conversation_dataset.csv`` and
                ``calendar_dataset.csv``; created if it does not exist.

        Returns:
            Tuple ``(conversation_df, calendar_df)`` of the generated frames.
        """
        os.makedirs(output_dir, exist_ok=True)

        conversation_df = self.generate_conversation_dataset()
        calendar_df = self.generate_calendar_dataset()

        conversation_path = os.path.join(output_dir, 'conversation_dataset.csv')
        calendar_path = os.path.join(output_dir, 'calendar_dataset.csv')

        conversation_df.to_csv(conversation_path, index=False)
        calendar_df.to_csv(calendar_path, index=False)

        print(f"Conversation Dataset saved to: {conversation_path}")
        print(f"Calendar Dataset saved to: {calendar_path}")

        return conversation_df, calendar_df

def validate_dataset(df, dataset_type):
    """Print a validation report for a generated dataset.

    The report lists missing values per column, whether each ID column is
    unique across rows, the ``DataFrame.info()`` summary, and the normalised
    value distribution of the known categorical columns.

    Args:
        df: The DataFrame to inspect.
        dataset_type: ``'Conversation'`` or ``'Calendar'``. Any other value
            skips the ID and categorical checks, because no columns are
            registered for it.
    """
    id_columns = {
        'Conversation': ['Message_ID', 'Session_ID'],
        'Calendar': ['Event_ID', 'Recruiter_ID', 'Candidate_ID']
    }
    categorical_columns = {
        'Conversation': ['Speaker', 'Intent_Label', 'Language', 'Sentiment'],
        'Calendar': ['Meeting_Type', 'Availability_Status', 'Invite_Status', 'Location']
    }

    print(f"\n{dataset_type} Dataset Validation:")

    print("Missing Values:")
    print(df.isnull().sum())

    # An ID column is unique when it has as many distinct values as rows.
    total_count = len(df)
    for col in id_columns.get(dataset_type, []):
        print(f"{col} Unique Check: {df[col].nunique() == total_count}")

    print("\nDataset Overview:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line.
    df.info()

    for col in categorical_columns.get(dataset_type, []):
        print(f"\n{col} Distribution:")
        print(df[col].value_counts(normalize=True))

def main():
    """Generate, save and validate both synthetic datasets."""
    # Seed every seedable random source for reproducibility. Faker keeps its
    # own random generator, so seeding ``random`` and NumPy alone leaves the
    # Faker-produced columns (timestamps, dates, times, sentences) changing
    # from run to run.
    # NOTE: the ID columns come from uuid.uuid4(), which cannot be seeded, so
    # identifiers still differ between runs.
    np.random.seed(42)
    random.seed(42)
    Faker.seed(42)

    # Initialize dataset generator
    generator = SchedulingBotDatasetGenerator(num_records=10000)

    # Generate and save datasets
    conversation_df, calendar_df = generator.save_datasets()

    # Validate datasets
    validate_dataset(conversation_df, 'Conversation')
    validate_dataset(calendar_df, 'Calendar')


if __name__ == "__main__":
    main()

In [None]:
# Step 1: Initial Dataset Examination and Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Load the datasets written by the generator cell
print("Step 1: Loading Datasets")
conversation_df = pd.read_csv('scheduling_bot_datasets/conversation_dataset.csv')
calendar_df = pd.read_csv('scheduling_bot_datasets/calendar_dataset.csv')

# Display basic information about the datasets.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
print("\nConversation Dataset Information:")
conversation_df.info()

print("\nCalendar Dataset Information:")
calendar_df.info()

# Step 2: Data Quality Check
print("\nStep 2: Data Quality Check")

# Count missing values per column
print("\nMissing Values in Conversation Dataset:")
print(conversation_df.isnull().sum())

print("\nMissing Values in Calendar Dataset:")
print(calendar_df.isnull().sum())

# Count rows whose primary ID repeats an earlier row's ID
print("\nDuplicate Message IDs in Conversation Dataset:")
print(conversation_df['Message_ID'].duplicated().sum())

print("\nDuplicate Event IDs in Calendar Dataset:")
print(calendar_df['Event_ID'].duplicated().sum())


In [None]:
# Preview the first rows of the conversation dataset; as the last expression
# in the cell, the result is rendered by the notebook.
conversation_df.head()

In [None]:
# Preview the first rows of the calendar dataset; as the last expression in
# the cell, the result is rendered by the notebook.
calendar_df.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import traceback

# Enhanced Logging Function
def log_error(e):
    """Print an error message followed by the traceback of ``e``.

    The message goes to stdout; the traceback goes to stderr.

    Args:
        e: The exception to report.
    """
    print("An error occurred:")
    print(str(e))
    print("\nDetailed Traceback:")
    if isinstance(e, BaseException):
        # Report the traceback attached to ``e`` itself. print_exc() reports
        # whichever exception is currently being handled, which is nothing
        # (printed as "NoneType: None") when this is called outside an
        # ``except`` block.
        traceback.print_exception(type(e), e, e.__traceback__)
    else:
        # Not an exception object: fall back to the active exception, if any.
        traceback.print_exc()

# Step 1: Initial Dataset Examination and Loading
def load_and_examine_datasets(
    conversation_path='scheduling_bot_datasets/conversation_dataset.csv',
    calendar_path='scheduling_bot_datasets/calendar_dataset.csv',
):
    """Load both CSV datasets and print a summary of each.

    Args:
        conversation_path: Path of the conversation CSV.
        calendar_path: Path of the calendar CSV.

    Returns:
        ``(conversation_df, calendar_df)`` on success, or ``(None, None)`` if
        a file is missing, a file is empty, or any other error occurs (the
        error is reported before returning).
    """
    try:
        print("Step 1: Loading Datasets")

        conversation_df = pd.read_csv(conversation_path)
        calendar_df = pd.read_csv(calendar_path)

        # DataFrame.info() prints its own report and returns None, so it is
        # called directly rather than wrapped in print().
        print("\nConversation Dataset Information:")
        conversation_df.info()

        print("\nCalendar Dataset Information:")
        calendar_df.info()

        return conversation_df, calendar_df

    except FileNotFoundError as e:
        print(f"Error: Dataset file not found. {e}")
        return None, None
    except pd.errors.EmptyDataError:
        print("Error: One or both datasets are empty.")
        return None, None
    except Exception as e:
        # Boundary handler: report anything unexpected instead of raising.
        log_error(e)
        return None, None

# Step 2: Data Quality Check
def perform_data_quality_check(conversation_df, calendar_df):
    """Print missing-value counts and duplicate-ID counts for both datasets.

    Any exception raised while producing the report is passed to
    ``log_error`` instead of propagating.
    """
    # Each entry pairs a heading with a deferred computation, so a failing
    # check still leaves the headings and results printed before it intact.
    report_sections = (
        ("\nMissing Values in Conversation Dataset:",
         lambda: conversation_df.isnull().sum()),
        ("\nMissing Values in Calendar Dataset:",
         lambda: calendar_df.isnull().sum()),
        ("\nDuplicate Message IDs in Conversation Dataset:",
         lambda: conversation_df['Message_ID'].duplicated().sum()),
        ("\nDuplicate Event IDs in Calendar Dataset:",
         lambda: calendar_df['Event_ID'].duplicated().sum()),
    )

    try:
        print("\nStep 2: Data Quality Check")
        for heading, compute in report_sections:
            print(heading)
            print(compute())
    except Exception as e:
        log_error(e)

# Step 3: Exploratory Data Analysis
def perform_exploratory_data_analysis(conversation_df, calendar_df):
    """Plot categorical and temporal summaries of both datasets to PNG files.

    Writes ``categorical_distribution.png`` (intent, speaker, meeting type and
    availability status distributions) and ``temporal_analysis.png``
    (conversations per hour, meetings per weekday, text length and meeting
    duration histograms) in the current working directory.

    The input DataFrames are not modified: datetime conversion is done on
    local copies of the relevant columns. Any exception is passed to
    ``log_error`` instead of propagating, and every figure created is closed
    even when plotting fails.
    """
    open_figures = []
    try:
        print("\nStep 3: Exploratory Data Analysis")

        # Convert on local Series so the caller's frames keep their dtypes.
        timestamps = pd.to_datetime(conversation_df['Timestamp'])
        meeting_dates = pd.to_datetime(calendar_df['Date'])

        # ---- Figure 1: categorical distributions ----
        fig, axs = plt.subplots(2, 2, figsize=(20, 15))
        open_figures.append(fig)

        conversation_df['Intent_Label'].value_counts().plot(
            kind='bar',
            ax=axs[0, 0],
            title='Intent Label Distribution'
        )
        axs[0, 0].set_xticklabels(axs[0, 0].get_xticklabels(), rotation=45, ha='right')

        conversation_df['Speaker'].value_counts().plot(
            kind='pie',
            ax=axs[0, 1],
            autopct='%1.1f%%',
            title='Speaker Distribution'
        )

        calendar_df['Meeting_Type'].value_counts().plot(
            kind='bar',
            ax=axs[1, 0],
            title='Meeting Type Distribution'
        )
        axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')

        calendar_df['Availability_Status'].value_counts().plot(
            kind='pie',
            ax=axs[1, 1],
            autopct='%1.1f%%',
            title='Availability Status Distribution'
        )

        fig.tight_layout()
        fig.savefig('categorical_distribution.png')

        # ---- Figure 2: temporal and numeric distributions ----
        fig, axs = plt.subplots(2, 2, figsize=(20, 15))
        open_figures.append(fig)

        # Conversations by hour of day, in hour order
        timestamps.dt.hour.value_counts().sort_index().plot(
            kind='bar',
            ax=axs[0, 0],
            title='Conversations by Hour of Day'
        )
        axs[0, 0].set_xlabel('Hour')
        axs[0, 0].set_ylabel('Number of Conversations')

        # Meetings by day of week (bars ordered by frequency)
        meeting_dates.dt.day_name().value_counts().plot(
            kind='bar',
            ax=axs[0, 1],
            title='Meetings by Day of Week'
        )
        axs[0, 1].set_xticklabels(axs[0, 1].get_xticklabels(), rotation=45, ha='right')

        # Message length in characters
        conversation_df['Raw_Text'].str.len().plot(
            kind='hist',
            bins=30,
            ax=axs[1, 0],
            title='Text Length Distribution'
        )
        axs[1, 0].set_xlabel('Text Length')
        axs[1, 0].set_ylabel('Frequency')

        calendar_df['Meeting_Duration'].plot(
            kind='hist',
            bins=30,
            ax=axs[1, 1],
            title='Meeting Duration Distribution'
        )
        axs[1, 1].set_xlabel('Meeting Duration')
        axs[1, 1].set_ylabel('Frequency')

        fig.tight_layout()
        fig.savefig('temporal_analysis.png')

    except Exception as e:
        log_error(e)
    finally:
        # Release figure memory on both the success and the failure path.
        for figure in open_figures:
            plt.close(figure)

# Step 4: Data Preprocessing
def preprocess_data(conversation_df, calendar_df):
    """Derive model-ready features from the raw datasets.

    Works on copies, so the input DataFrames are left untouched.

    Conversation features added: ``Hour_of_Day``, ``Day_of_Week``, ``Month``,
    ``Text_Length``, plus ``Duration`` and ``TimeZone`` extracted from the
    JSON in ``Entities`` (empty string when the JSON cannot be read).

    Calendar features added: ``Start_Time``/``End_Time`` parsed to
    ``datetime.time`` (``None`` when unparsable), ``Meeting_Duration_Minutes``
    (an end before the start is treated as crossing midnight; rows whose
    duration cannot be computed get the mean duration), ``Day_of_Week`` and
    ``Month``.

    Returns:
        ``(preprocessed_conversation_df, preprocessed_calendar_df)``, or
        ``(None, None)`` if preprocessing fails (the error is reported via
        ``log_error``).
    """
    try:
        print("\nStep 4: Data Preprocessing")

        def preprocess_conversation_data(df):
            df['Timestamp'] = pd.to_datetime(df['Timestamp'])

            # Time-based features
            df['Hour_of_Day'] = df['Timestamp'].dt.hour
            df['Day_of_Week'] = df['Timestamp'].dt.day_name()
            df['Month'] = df['Timestamp'].dt.month_name()

            # Text-based features
            df['Text_Length'] = df['Raw_Text'].str.len()

            entity_columns = ['Duration', 'TimeZone']

            def parse_entities(entities_str):
                try:
                    entities = json.loads(entities_str)
                    return {
                        name: entities.get(name, '')
                        for name in entity_columns
                    }
                except (json.JSONDecodeError, TypeError, AttributeError):
                    # Invalid JSON, a non-string cell (e.g. NaN), or JSON
                    # that is not an object: fall back to empty values.
                    return {name: '' for name in entity_columns}

            # Build the frame once from plain dicts; returning a Series per
            # row from apply() is much slower, and an explicit column list
            # keeps the shape stable for an empty input.
            entities_df = pd.DataFrame(
                [parse_entities(value) for value in df['Entities']],
                index=df.index,
                columns=entity_columns,
            )
            return pd.concat([df, entities_df], axis=1)

        def preprocess_calendar_data(df):
            df['Date'] = pd.to_datetime(df['Date'])

            time_formats = ('%H:%M:%S', '%H:%M:%S.%f', '%I:%M %p', '%H:%M')

            def parse_time(time_str):
                try:
                    # Try each supported format; the first that parses wins.
                    for fmt in time_formats:
                        try:
                            return pd.to_datetime(time_str, format=fmt).time()
                        except (ValueError, TypeError, AttributeError):
                            # Wrong format or missing value: try the next.
                            continue
                    return None
                except Exception as e:
                    print(f"Time parsing error: {e}")
                    return None

            df['Start_Time'] = df['Start_Time'].apply(parse_time)
            df['End_Time'] = df['End_Time'].apply(parse_time)

            seconds_per_day = 24 * 60 * 60

            def seconds_since_midnight(moment):
                # Whole seconds only; sub-second precision is ignored.
                return moment.hour * 3600 + moment.minute * 60 + moment.second

            def calculate_duration(start, end):
                try:
                    elapsed = (
                        seconds_since_midnight(end)
                        - seconds_since_midnight(start)
                    )
                    # The modulo wraps a negative difference, i.e. a meeting
                    # that crosses midnight, into the next day.
                    return (elapsed % seconds_per_day) / 60
                except Exception as e:
                    print(f"Duration calculation error: {e}")
                    return None

            durations = pd.Series(
                [
                    calculate_duration(start, end)
                    for start, end in zip(df['Start_Time'], df['End_Time'])
                ],
                index=df.index,
                dtype='float64',
            )

            # Assign the filled result; fillna(inplace=True) on a column
            # selection is deprecated and has no effect under copy-on-write.
            df['Meeting_Duration_Minutes'] = durations.fillna(durations.mean())

            # Time-based features
            df['Day_of_Week'] = df['Date'].dt.day_name()
            df['Month'] = df['Date'].dt.month_name()

            return df

        preprocessed_conversation_df = preprocess_conversation_data(conversation_df.copy())
        preprocessed_calendar_df = preprocess_calendar_data(calendar_df.copy())

        return preprocessed_conversation_df, preprocessed_calendar_df

    except Exception as e:
        log_error(e)
        return None, None

# Main Execution Function
def main():
    """Run loading, quality checks, EDA, preprocessing and label encoding.

    Outputs written to the current working directory:
    ``preprocessed_conversation_dataset.csv``,
    ``preprocessed_calendar_dataset.csv``,
    ``conversation_label_encoders.pkl`` and ``calendar_label_encoders.pkl``.
    Any exception is reported through ``log_error`` rather than propagated.
    """
    try:
        # Step 1: Load Datasets
        conversation_df, calendar_df = load_and_examine_datasets()

        if conversation_df is None or calendar_df is None:
            print("Failed to load datasets. Exiting.")
            return

        # Step 2: Data Quality Check
        perform_data_quality_check(conversation_df, calendar_df)

        # Step 3: Exploratory Data Analysis
        perform_exploratory_data_analysis(conversation_df, calendar_df)

        # Step 4: Preprocess Data
        preprocessed_conversation_df, preprocessed_calendar_df = preprocess_data(
            conversation_df,
            calendar_df
        )

        if preprocessed_conversation_df is None or preprocessed_calendar_df is None:
            print("Data preprocessing failed. Exiting.")
            return

        # Step 5: Encoding Categorical Variables
        from sklearn.preprocessing import LabelEncoder

        def encode_columns(df, columns):
            """Add a ``<col>_Encoded`` column per entry; return the encoders."""
            encoders = {}
            for col in columns:
                encoder = LabelEncoder()
                df[f'{col}_Encoded'] = encoder.fit_transform(df[col])
                encoders[col] = encoder
            return encoders

        conversation_label_encoders = encode_columns(
            preprocessed_conversation_df,
            ['Speaker', 'Intent_Label', 'Language', 'Sentiment'],
        )
        calendar_label_encoders = encode_columns(
            preprocessed_calendar_df,
            [
                'Availability_Status', 'Meeting_Type', 'Invite_Status',
                'Location', 'Department', 'Priority'
            ],
        )

        # Step 6: Save Preprocessed Datasets
        # Feature/target selection is left to the training step, which reads
        # these files; building X/y here only produced unused locals.
        preprocessed_conversation_df.to_csv(
            'preprocessed_conversation_dataset.csv',
            index=False
        )
        preprocessed_calendar_df.to_csv(
            'preprocessed_calendar_dataset.csv',
            index=False
        )

        # Save the fitted encoders so encoded labels can be mapped back later
        import joblib
        joblib.dump(conversation_label_encoders, 'conversation_label_encoders.pkl')
        joblib.dump(calendar_label_encoders, 'calendar_label_encoders.pkl')

        print("Data Preprocessing Completed Successfully!")

    except Exception as e:
        log_error(e)


# Run the main function
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import logging
import traceback

# Sklearn Imports
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    StratifiedKFold, 
    GridSearchCV
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    precision_recall_fscore_support,
    roc_auc_score
)

# Machine Learning Algorithms
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Configure Logging
# Root-logger setup: records at INFO level and above are written to the file
# 'ml_training.log' as "<timestamp> - <LEVEL>: <message>".
# NOTE(review): basicConfig() normally does nothing when the root logger
# already has handlers (as some notebook environments arrange) — confirm the
# log file is actually produced where this runs.
logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='ml_training.log'
)

class SchedulingBotModelTrainer:
    """Train, evaluate, plot and persist classifiers for two tasks.

    * Intent classification: predicts ``Intent_Label_Encoded`` from the
      preprocessed conversation dataset.
    * Time slot prediction: predicts ``Meeting_Type_Encoded`` from the
      preprocessed calendar dataset.

    Attributes:
        conversation_df: Preprocessed conversation DataFrame.
        calendar_df: Preprocessed calendar DataFrame.
        logger: Logger used for progress and error messages.
    """

    # Maps each task name to its key in the prepared/split dataset dicts.
    _TASK_DATA_KEYS = {
        'Intent Classification': 'intent',
        'Time Slot Prediction': 'time_slot',
    }

    def __init__(self, conversation_df, calendar_df):
        """Store the preprocessed datasets and set up the logger."""
        self.conversation_df = conversation_df
        self.calendar_df = calendar_df
        self.logger = logging.getLogger(__name__)

    def prepare_ml_datasets(self):
        """Select feature and target columns for both tasks.

        Returns:
            ``{'intent': {'features', 'target'}, 'time_slot': {...}}``, or
            ``None`` if a required column is missing (the error is logged).
        """
        try:
            intent_features = [
                'Hour_of_Day',
                'Text_Length',
                'Speaker_Encoded',
                'Language_Encoded',
                'Sentiment_Encoded'
            ]
            time_slot_features = [
                'Meeting_Duration_Minutes',
                'Availability_Status_Encoded',
                'Location_Encoded',
                'Department_Encoded'
            ]

            return {
                'intent': {
                    'features': self.conversation_df[intent_features],
                    'target': self.conversation_df['Intent_Label_Encoded']
                },
                'time_slot': {
                    'features': self.calendar_df[time_slot_features],
                    'target': self.calendar_df['Meeting_Type_Encoded']
                }
            }
        except Exception as e:
            self.logger.error(f"Error in preparing ML datasets: {e}")
            traceback.print_exc()
            return None

    def split_datasets(self, datasets):
        """Split both datasets 80/20 into stratified train and test sets.

        Args:
            datasets: The dict returned by ``prepare_ml_datasets``.

        Returns:
            ``{'intent': {'X_train', 'X_test', 'y_train', 'y_test'},
            'time_slot': {...}}``, or ``None`` on failure (error logged).
        """
        try:
            split_data = {}
            for key in ('intent', 'time_slot'):
                target = datasets[key]['target']
                X_train, X_test, y_train, y_test = train_test_split(
                    datasets[key]['features'],
                    target,
                    test_size=0.2,
                    random_state=42,
                    stratify=target
                )
                split_data[key] = {
                    'X_train': X_train,
                    'X_test': X_test,
                    'y_train': y_train,
                    'y_test': y_test
                }
            return split_data
        except Exception as e:
            self.logger.error(f"Error in splitting datasets: {e}")
            traceback.print_exc()
            return None

    def create_model_pipeline(self, model, param_grid=None):
        """Wrap ``model`` in a scaling pipeline, optionally grid-searched.

        Args:
            model: Unfitted classifier.
            param_grid: Optional grid keyed with the ``classifier__`` prefix.
                When given, the pipeline is wrapped in a 5-fold stratified
                ``GridSearchCV`` scored on accuracy.

        Returns:
            A ``Pipeline`` or ``GridSearchCV``, or ``None`` on failure.
        """
        try:
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', model)
            ])

            if param_grid:
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                return GridSearchCV(
                    pipeline,
                    param_grid,
                    cv=cv,
                    scoring='accuracy',
                    n_jobs=-1
                )

            return pipeline
        except Exception as e:
            self.logger.error(f"Error in creating model pipeline: {e}")
            traceback.print_exc()
            return None

    def _candidate_models(self):
        """Return ``{task: {model_name: (estimator, param_grid)}}``."""
        forest_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20]
        }
        boosting_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.3]
        }
        return {
            'Intent Classification': {
                'Random Forest': (
                    RandomForestClassifier(random_state=42),
                    dict(forest_grid)
                ),
                'XGBoost': (
                    XGBClassifier(random_state=42),
                    dict(boosting_grid)
                ),
                'SVM': (
                    SVC(probability=True),
                    {
                        'classifier__C': [0.1, 1, 10],
                        'classifier__kernel': ['linear', 'rbf']
                    }
                )
            },
            'Time Slot Prediction': {
                'Random Forest': (
                    RandomForestClassifier(random_state=42),
                    dict(forest_grid)
                ),
                'Gradient Boosting': (
                    GradientBoostingClassifier(random_state=42),
                    dict(boosting_grid)
                )
            }
        }

    def _evaluate_model(self, model_name, fitted, X_train, X_test, y_train, y_test):
        """Score a fitted pipeline or grid search and return its metrics."""
        if hasattr(fitted, 'best_params_'):
            self.logger.info(f"Best Parameters for {model_name}: {fitted.best_params_}")
            best_model = fitted.best_estimator_
        else:
            best_model = fitted

        y_pred = best_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average='weighted'
        )

        # ROC AUC is best-effort: it needs predict_proba and every class to
        # be present in y_test. Record None rather than lose the other
        # metrics when it cannot be computed.
        try:
            roc_auc = roc_auc_score(
                y_test, best_model.predict_proba(X_test), multi_class='ovr'
            )
        except Exception as e:
            self.logger.warning(f"ROC AUC unavailable for {model_name}: {e}")
            roc_auc = None

        cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)

        self.logger.info(f"\n{model_name} Results:")
        self.logger.info(f"Accuracy: {accuracy:.4f}")
        self.logger.info(f"Precision: {precision:.4f}")
        self.logger.info(f"Recall: {recall:.4f}")
        self.logger.info(f"F1 Score: {f1:.4f}")
        # Compare with None explicitly: a score of 0.0 is falsy yet valid.
        if roc_auc is not None:
            self.logger.info(f"ROC AUC: {roc_auc:.4f}")
        self.logger.info(f"Cross-Validation Scores: {cv_scores}")

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc,
            'cv_scores': cv_scores,
            'model': best_model
        }

    def train_and_evaluate_models(self, split_data):
        """Grid-search, fit and evaluate every candidate model per task.

        A model that fails to train is logged and skipped; the remaining
        models are still evaluated.

        Args:
            split_data: The dict returned by ``split_datasets``.

        Returns:
            ``{task: {model_name: metrics}}`` where ``metrics`` holds
            accuracy, precision, recall, f1_score, roc_auc (or ``None``),
            cv_scores and the fitted ``model``.
        """
        results = {}

        for task, task_models in self._candidate_models().items():
            self.logger.info(f"\n{task} Model Evaluation:")

            task_split = split_data[self._TASK_DATA_KEYS.get(task, 'time_slot')]
            X_train = task_split['X_train']
            X_test = task_split['X_test']
            y_train = task_split['y_train']
            y_test = task_split['y_test']

            task_results = {}

            for model_name, (model, param_grid) in task_models.items():
                try:
                    pipeline = self.create_model_pipeline(model, param_grid)
                    if pipeline is None:
                        # create_model_pipeline already logged the cause.
                        self.logger.error(
                            f"Error training {model_name}: pipeline could not be created"
                        )
                        continue

                    pipeline.fit(X_train, y_train)

                    task_results[model_name] = self._evaluate_model(
                        model_name, pipeline, X_train, X_test, y_train, y_test
                    )
                except Exception as e:
                    self.logger.error(f"Error training {model_name}: {e}")
                    traceback.print_exc()

            results[task] = task_results

        return results

    def visualize_model_performance(self, results):
        """Save one accuracy bar chart per task.

        Files are named ``<task>_model_performance.png``, with the task name
        lower-cased and spaces replaced by underscores.
        """
        for task, task_results in results.items():
            model_names = list(task_results.keys())
            accuracies = [result['accuracy'] for result in task_results.values()]

            plt.figure(figsize=(10, 6))
            plt.bar(model_names, accuracies)
            plt.title(f'{task} Model Performance Comparison')
            plt.xlabel('Models')
            plt.ylabel('Accuracy')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(f'{task.lower().replace(" ", "_")}_model_performance.png')
            plt.close()

    def save_best_models(self, results):
        """Persist the most accurate model of each task with joblib.

        Files are named ``best_<task>_model.pkl``. A task with no successfully
        trained model is skipped with a warning instead of raising.
        """
        for task, task_results in results.items():
            if not task_results:
                # max() on an empty mapping would raise ValueError.
                self.logger.warning(f"No trained models for {task}; nothing saved.")
                continue

            best_model_name = max(task_results, key=lambda x: task_results[x]['accuracy'])
            best_model = task_results[best_model_name]['model']

            joblib.dump(best_model, f'best_{task.lower().replace(" ", "_")}_model.pkl')
            self.logger.info(f"Best {task} model saved.")

def main(
    conversation_path='/kaggle/working/preprocessed_conversation_dataset.csv',
    calendar_path='/kaggle/working/preprocessed_calendar_dataset.csv',
):
    """Train, evaluate, plot and save models from the preprocessed CSVs.

    Args:
        conversation_path: Preprocessed conversation CSV.
        calendar_path: Preprocessed calendar CSV.

    When a given path does not exist, a file of the same name in the current
    working directory is used instead if present. The preprocessing step
    writes its CSVs to the working directory, which is ``/kaggle/working``
    only on Kaggle. Errors are logged rather than raised.
    """
    import os

    def resolve(path):
        """Return ``path``, or its base name in the CWD if only that exists."""
        if os.path.exists(path):
            return path
        local_copy = os.path.basename(path)
        # Keep the configured path when neither exists so the resulting
        # error names the location the caller asked for.
        return local_copy if os.path.exists(local_copy) else path

    try:
        # Load Preprocessed Data
        logging.info("Loading Preprocessed Datasets")
        conversation_df = pd.read_csv(resolve(conversation_path))
        calendar_df = pd.read_csv(resolve(calendar_path))

        # Initialize Model Trainer
        trainer = SchedulingBotModelTrainer(conversation_df, calendar_df)

        # Prepare Datasets (returns None on failure, already logged)
        ml_datasets = trainer.prepare_ml_datasets()
        if ml_datasets is None:
            logging.error("Dataset preparation failed. Aborting training.")
            return

        # Split Datasets (returns None on failure, already logged)
        split_data = trainer.split_datasets(ml_datasets)
        if split_data is None:
            logging.error("Dataset split failed. Aborting training.")
            return

        # Train and Evaluate Models
        model_results = trainer.train_and_evaluate_models(split_data)

        # Visualize Model Performance
        trainer.visualize_model_performance(model_results)

        # Save Best Models
        trainer.save_best_models(model_results)

        logging.info("Model Training and Evaluation Completed Successfully!")

    except Exception as e:
        logging.error(f"Error in main execution: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()