<a href="https://colab.research.google.com/github/k-ganda/database_design_pld5/blob/main/database_design.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import requests
import joblib
from typing import Dict, Optional, Any
import logging
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure the root logger: emit INFO and above, prefixing each record
# with its timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class UserBehaviorPredictor:
    """Train, persist and serve a random-forest classifier of user behavior.

    Typical flow: ``load_dataset`` -> ``train_model`` (which also writes the
    model, scaler and feature-name artifacts to the working directory) ->
    ``predict`` on a single record, optionally obtained through
    ``fetch_latest_entry``.
    """

    # Column groups shared by validation, training and inference so that all
    # three paths agree on how a record is encoded.
    _ID_AND_TARGET_COLUMNS = ['User ID', 'User Behavior Class']
    _CATEGORICAL_COLUMNS = ['Device Model', 'Operating System', 'Gender']
    _NUMERICAL_COLUMNS = [
        'App Usage Time (min/day)', 'Screen On Time (hours/day)',
        'Battery Drain (mAh/day)', 'Number of Apps Installed',
        'Data Usage (MB/day)', 'Age'
    ]

    def __init__(self, random_state: int = 42):
        """Create an untrained predictor.

        Args:
            random_state: Seed used for the train/test split and the forest.
        """
        self.logger = logging.getLogger(__name__)
        self.random_state = random_state
        self.model = None
        self.scaler = StandardScaler()
        self.feature_names = None

        # Configure retry strategy for API requests
        self.session = requests.Session()
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        # Mount for both schemes; with only 'http://' mounted, https:// URLs
        # would silently bypass the retry policy.
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)

    def load_dataset(self, filepath: str) -> None:
        """Read the CSV at ``filepath`` into ``self.df`` and validate it.

        Raises:
            Exception: Any error from reading or validating the file is
                logged and re-raised unchanged.
        """
        try:
            self.logger.info("Loading dataset...")
            self.df = pd.read_csv(filepath)
            self._validate_dataset()
            self.logger.info(f"Dataset loaded with shape: {self.df.shape}")
        except Exception as e:
            self.logger.error(f"Error loading dataset: {str(e)}")
            raise

    def _validate_dataset(self) -> None:
        """Check that ``self.df`` has every required column.

        Null values are reported with a warning but do not fail validation.

        Raises:
            ValueError: If one or more required columns are absent.
        """
        required_columns = set(
            self._ID_AND_TARGET_COLUMNS
            + self._NUMERICAL_COLUMNS
            + self._CATEGORICAL_COLUMNS
        )

        missing_columns = required_columns - set(self.df.columns)
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Check for null values
        null_counts = self.df.isnull().sum()
        if null_counts.any():
            self.logger.warning(f"Found null values:\n{null_counts[null_counts > 0]}")

    def prepare_data(self):
        """Build the feature matrix and target vector from ``self.df``.

        Drops the ID and target columns, one-hot encodes the categorical
        columns, replaces infinities with NaN, fits ``self.scaler`` on the
        numerical columns and records the resulting column order in
        ``self.feature_names``.

        Returns:
            A ``(X, y)`` pair: the encoded, scaled feature frame and the
            'User Behavior Class' series.
        """
        self.logger.info("Preparing data...")
        try:
            X = self.df.drop(self._ID_AND_TARGET_COLUMNS, axis=1)
            y = self.df['User Behavior Class']

            # Handle categorical variables
            X = pd.get_dummies(X, columns=self._CATEGORICAL_COLUMNS)

            numerical_cols = self._NUMERICAL_COLUMNS

            # Infinite values are converted to NaN before scaling; NaN values
            # themselves are not imputed here.
            if X[numerical_cols].isin([np.inf, -np.inf]).any().any():
                self.logger.warning("Infinite values found in numerical columns")
                X[numerical_cols] = X[numerical_cols].replace([np.inf, -np.inf], np.nan)

            X[numerical_cols] = self.scaler.fit_transform(X[numerical_cols])
            self.feature_names = X.columns

            return X, y

        except Exception as e:
            self.logger.error(f"Error in data preparation: {str(e)}")
            raise

    def train_model(self):
        """Fit the random forest, log evaluation metrics and save artifacts.

        Runs an 80/20 train/test split, 5-fold cross-validation on the
        training part, a final fit, and logs train/test accuracy plus a
        classification report before calling ``_save_model``.
        """
        try:
            X, y = self.prepare_data()

            # Split the data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=self.random_state
            )

            # Initialize model with better parameters to prevent overfitting
            self.model = RandomForestClassifier(
                n_estimators=100,
                max_depth=None,  # Let trees grow fully
                min_samples_split=5,  # Minimum samples required to split
                min_samples_leaf=2,   # Minimum samples required at leaf node
                max_features='sqrt',  # Use sqrt of features for each split
                random_state=self.random_state,
                class_weight='balanced'  # Handle class imbalance
            )

            # Perform cross-validation
            self.logger.info("Running cross-validation...")
            cv_scores = cross_val_score(self.model, X_train, y_train, cv=5)
            self.logger.info(f"Cross-validation scores: {cv_scores}")
            self.logger.info(f"Mean CV accuracy: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")

            # Train the model
            self.logger.info("Training model...")
            self.model.fit(X_train, y_train)

            # Evaluate model
            train_score = self.model.score(X_train, y_train)
            test_score = self.model.score(X_test, y_test)
            self.logger.info(f"Training accuracy: {train_score:.3f}")
            self.logger.info(f"Testing accuracy: {test_score:.3f}")

            # Generate detailed classification report
            y_pred = self.model.predict(X_test)
            self.logger.info("\nClassification Report:")
            self.logger.info(f"\n{classification_report(y_test, y_pred)}")

            # Save model artifacts
            self._save_model()

        except Exception as e:
            self.logger.error(f"Error in model training: {str(e)}")
            raise

    def _save_model(self) -> None:
        """Write the model, scaler and feature names to the working directory."""
        try:
            joblib.dump(self.model, 'user_behavior_rf_model.joblib')
            joblib.dump(self.scaler, 'scaler.joblib')
            joblib.dump(self.feature_names, 'feature_names.joblib')
            self.logger.info("Model artifacts saved successfully")
        except Exception as e:
            self.logger.error(f"Error saving model artifacts: {str(e)}")
            raise

    def fetch_latest_entry(self, api_url: str = "http://localhost:8000/userbehaviors",
                          timeout: int = 5) -> Optional[Dict[str, Any]]:
        """GET ``api_url`` and return the decoded JSON body.

        Args:
            api_url: Endpoint to query.
            timeout: Request timeout in seconds.

        Returns:
            The decoded JSON payload, or ``None`` if the request failed, timed
            out, returned an error status, or the body was not valid JSON.
        """
        self.logger.info("Fetching latest entry from API...")
        try:
            response = self.session.get(api_url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            self.logger.info("Latest entry fetched successfully")
            return data
        except requests.exceptions.ConnectionError:
            self.logger.error(f"Connection error: Could not connect to {api_url}")
        except requests.exceptions.Timeout:
            self.logger.error("Request timed out")
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Error fetching data: {str(e)}")
        except ValueError as e:
            # Older requests releases raise a plain ValueError for a body
            # that is not valid JSON; keep the "None on failure" contract.
            self.logger.error(f"Invalid JSON in API response: {str(e)}")
        return None

    def predict(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Classify a single record.

        Loads the saved artifacts first if no model is in memory.

        Args:
            input_data: One record keyed by feature name; see
                ``_prepare_prediction_input`` for the expected keys.

        Returns:
            A dict with 'predicted_class' (int), 'confidence' (top class
            probability as a percentage string) and 'probabilities' (one
            percentage string per class, keyed by "Class <label>").

        Raises:
            ValueError: If ``input_data`` is malformed.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            # Compare against None explicitly rather than relying on the
            # estimator object's truthiness.
            if self.model is None:
                self._load_model()

            # Prepare and validate input
            prepared_data = self._prepare_prediction_input(input_data)

            # Make prediction
            prediction = self.model.predict(prepared_data)
            probabilities = self.model.predict_proba(prepared_data)

            result = self._format_result(
                prediction[0], self.model.classes_, probabilities[0]
            )

            self.logger.info(f"Prediction made with confidence: {result['confidence']}")
            return result

        except Exception as e:
            self.logger.error(f"Error making prediction: {str(e)}")
            raise

    def _prepare_prediction_input(self, input_data: Dict[str, Any]) -> pd.DataFrame:
        """Encode one raw record the same way ``prepare_data`` encodes training rows.

        NOTE(review): assumes the record's keys are the training dataset's
        column names (e.g. 'Age', 'Device Model'); confirm against the API's
        actual response schema.

        Args:
            input_data: Mapping of feature name to raw value. 'User ID' and
                'User Behavior Class' are ignored if present.

        Returns:
            A one-row frame whose columns match ``self.feature_names``.

        Raises:
            ValueError: If ``input_data`` is not a dict, lacks a required
                field, holds a non-finite or non-numeric numerical value, or
                if no feature names are available.
        """
        if not isinstance(input_data, dict):
            raise ValueError(
                f"Expected a dict of feature values, got {type(input_data).__name__}"
            )

        required_fields = self._NUMERICAL_COLUMNS + self._CATEGORICAL_COLUMNS
        missing_fields = [name for name in required_fields if name not in input_data]
        if missing_fields:
            raise ValueError(f"Missing required input fields: {missing_fields}")

        if self.feature_names is None:
            raise ValueError("Feature names unavailable; train or load the model first")

        row = pd.DataFrame([input_data]).drop(
            columns=self._ID_AND_TARGET_COLUMNS, errors='ignore'
        )

        # Reject values the scaler/model cannot use instead of letting them
        # surface later as an opaque estimator error.
        numeric = row[self._NUMERICAL_COLUMNS].apply(pd.to_numeric, errors='coerce')
        if not np.isfinite(numeric.to_numpy(dtype=float)).all():
            raise ValueError("Numerical input fields must be finite numbers")
        row[self._NUMERICAL_COLUMNS] = numeric

        encoded = pd.get_dummies(row, columns=self._CATEGORICAL_COLUMNS)
        # Align with the training layout: dummy columns for categories absent
        # from this record become 0, and columns unseen in training are dropped.
        aligned = encoded.reindex(columns=list(self.feature_names), fill_value=0)
        aligned[self._NUMERICAL_COLUMNS] = self.scaler.transform(
            aligned[self._NUMERICAL_COLUMNS]
        )
        return aligned

    @staticmethod
    def _format_result(predicted_class, class_labels, class_probabilities) -> Dict[str, Any]:
        """Turn raw model output for one record into the public result dict.

        Args:
            predicted_class: Label returned by the model for the record.
            class_labels: Class labels in the model's column order.
            class_probabilities: Probabilities (0..1) aligned with
                ``class_labels``.
        """
        confidence = float(np.max(class_probabilities)) * 100
        return {
            'predicted_class': int(predicted_class),
            'confidence': f"{confidence:.2f}%",
            # Key by the real class label and scale to percent so the values
            # are consistent with 'confidence'.
            'probabilities': {
                f"Class {label}": f"{float(prob) * 100:.2f}%"
                for label, prob in zip(class_labels, class_probabilities)
            }
        }

    def _load_model(self) -> None:
        """Load the model, scaler and feature names saved by ``_save_model``.

        Raises:
            FileNotFoundError: If any artifact file is missing.
            Exception: Any other load failure is logged and re-raised.
        """
        try:
            self.model = joblib.load('user_behavior_rf_model.joblib')
            self.scaler = joblib.load('scaler.joblib')
            self.feature_names = joblib.load('feature_names.joblib')
            self.logger.info("Model loaded successfully")
        except FileNotFoundError:
            self.logger.error("Model files not found. Please train the model first.")
            raise
        except Exception as e:
            self.logger.error(f"Error loading model: {str(e)}")
            raise

if __name__ == "__main__":
    # End-to-end run: train on the local CSV, then classify the record
    # returned by the API if one can be fetched.
    predictor = UserBehaviorPredictor()

    try:
        # Load and train model
        predictor.load_dataset('user_behavior_dataset.csv')
        predictor.train_model()

        # Attempt to fetch and process latest entry
        latest_entry = predictor.fetch_latest_entry()
        if latest_entry:
            result = predictor.predict(latest_entry)
            # The message needs a %s placeholder: without one, the result
            # passed as a logging argument never appears in the output.
            predictor.logger.info("Prediction result: %s", result)
        else:
            predictor.logger.warning("No data available for prediction")

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the cell.
        predictor.logger.error(f"Application error: {str(e)}")

ERROR:__main__:Connection error: Could not connect to http://localhost:8000/userbehaviors
