<a href="https://colab.research.google.com/github/k-ganda/database_design_pld5/blob/main/database_design.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
import requests
import joblib

class UserBehaviorPredictor:
    """Train a random forest on the user-behavior CSV and predict on new entries.

    The target column is ``'User Behavior Class'``. Categorical columns are
    one-hot encoded and numerical columns are standardised with
    ``self.scaler``; the same preprocessing is applied at prediction time.
    """

    # Columns standardised with ``self.scaler`` (shared by training and inference
    # so both sides always scale exactly the same features, in the same order).
    NUMERICAL_COLS = (
        'App Usage Time (min/day)', 'Screen On Time (hours/day)',
        'Battery Drain (mAh/day)', 'Number of Apps Installed',
        'Data Usage (MB/day)', 'Age',
    )
    # Columns expanded into indicator columns with ``pd.get_dummies``.
    CATEGORICAL_COLS = ('Device Model', 'Operating System', 'Gender')

    def __init__(self, data_path='user_behavior_dataset.csv'):
        """Load the dataset and create an unfitted scaler.

        Args:
            data_path: Path (or anything ``pd.read_csv`` accepts) of the
                training CSV. Defaults to the original hard-coded file name.
        """
        self.df = pd.read_csv(data_path)
        self.model = None
        self.scaler = StandardScaler()

    def prepare_data(self):
        """Build the training matrix and target from ``self.df``.

        Returns:
            tuple: ``(X, y)`` where ``X`` has the categorical columns one-hot
            encoded and the numerical columns standardised, and ``y`` is the
            ``'User Behavior Class'`` column.
        """
        # Separate features and target
        X = self.df.drop(['User ID', 'User Behavior Class'], axis=1)
        y = self.df['User Behavior Class']

        # Convert categorical variables
        X = pd.get_dummies(X, columns=list(self.CATEGORICAL_COLS))

        # Scale numerical features.
        # NOTE(review): the scaler is fitted on the full dataset, i.e. before
        # train_model() splits it, so test-set statistics leak into the scaling.
        numerical_cols = list(self.NUMERICAL_COLS)
        X[numerical_cols] = self.scaler.fit_transform(X[numerical_cols])

        return X, y

    def train_model(self):
        """Grid-search a random forest, print its metrics and persist it.

        Side effects: sets ``self.model`` to the best estimator, prints a
        performance summary, and writes ``user_behavior_rf_model.joblib`` and
        ``scaler.joblib`` to the working directory.
        """
        # Prepare data
        X, y = self.prepare_data()

        # Split the data (stratified so each class keeps its proportion)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Define parameter grid for GridSearchCV
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 8, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2']
        }

        # Initialize base model
        base_model = RandomForestClassifier(
            random_state=42,
            class_weight='balanced',  # Handle class imbalance
            oob_score=True  # Enable out-of-bag score
        )

        # Perform grid search with cross-validation
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            cv=5,
            scoring='balanced_accuracy',
            n_jobs=-1,
            verbose=1
        )

        # Fit grid search
        grid_search.fit(X_train, y_train)

        # Get best model
        self.model = grid_search.best_estimator_

        # Print model performance
        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)
        # NOTE: this cross-validation runs over the full X/y (train + test rows).
        cv_scores = cross_val_score(self.model, X, y, cv=5)

        print("Model Performance:")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Training accuracy: {train_score:.3f}")
        print(f"Testing accuracy: {test_score:.3f}")
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
        print(f"OOB score: {self.model.oob_score_:.3f}")

        # Print detailed classification report
        y_pred = self.model.predict(X_test)
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        # Feature importance analysis
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\nTop 10 Most Important Features:")
        print(feature_importance.head(10))

        # Save the model and scaler
        joblib.dump(self.model, 'user_behavior_rf_model.joblib')
        joblib.dump(self.scaler, 'scaler.joblib')

    def fetch_latest_entry(self, api_url="https://database-design-pld5.onrender.com/docs#/",
                           timeout=10):
        """Fetch the latest entry from the API.

        Args:
            api_url: URL to GET.
                NOTE(review): the default looks like the service's interactive
                docs page rather than a JSON endpoint -- confirm the real
                endpoint; a non-JSON body makes this method return ``None``.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the caller indefinitely.

        Returns:
            The decoded JSON payload, or ``None`` on a connection error, a
            non-200 status, or a body that is not valid JSON.
        """
        try:
            response = requests.get(api_url, timeout=timeout)
        except requests.RequestException as e:
            print(f"Error connecting to API: {str(e)}")
            return None

        if response.status_code != 200:
            print(f"Error fetching data: {response.status_code}")
            return None

        try:
            data = response.json()
        except ValueError as e:
            # The server answered, but not with JSON: report it as such
            # instead of as a connection failure.
            print(f"Error decoding API response: {str(e)}")
            return None

        print("Latest entry fetched:", data)
        return data

    def prepare_input_data(self, input_data):
        """Prepare a single input entry for prediction.

        Applies the same preprocessing as ``prepare_data``: numerical columns
        are transformed with the already-fitted ``self.scaler`` and categorical
        columns are one-hot encoded, then the frame is aligned to the columns
        the model was fitted on.

        Args:
            input_data: Mapping of raw column name to value for one entry.

        Returns:
            pandas.DataFrame: One row with exactly the model's training
            columns, in training order.

        Raises:
            ValueError: If the model has not been trained, or a numerical
                feature is missing from ``input_data``.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet. Please train the model first.")

        # Convert input to DataFrame
        input_df = pd.DataFrame([input_data])

        # Drop User ID if present
        if 'User ID' in input_df.columns:
            input_df = input_df.drop('User ID', axis=1)

        # A missing measurement cannot be meaningfully defaulted, so fail loudly
        # rather than silently predicting from a made-up value.
        numerical_cols = list(self.NUMERICAL_COLS)
        missing = [col for col in numerical_cols if col not in input_df.columns]
        if missing:
            raise ValueError(f"Missing numerical features: {missing}")

        # The model was fitted on standardised values, so inference inputs must
        # go through the same fitted scaler.
        input_df[numerical_cols] = self.scaler.transform(input_df[numerical_cols])

        # Create dummy variables
        input_df = pd.get_dummies(input_df, columns=list(self.CATEGORICAL_COLS))

        # Align to the training layout: indicator columns for categories not
        # present in this entry are added as 0, unknown columns are dropped,
        # and the order matches what the model was fitted on.
        input_df = input_df.reindex(columns=self.model.feature_names_in_, fill_value=0)

        return input_df

    def predict(self, input_data):
        """Make a prediction for one new entry.

        Args:
            input_data: Mapping of raw column name to value for one entry.

        Returns:
            dict: ``'predicted_class'`` (int), ``'confidence'`` (highest class
            probability as a percentage string) and ``'probabilities'``
            (``"Class <label>"`` -> percentage string, keyed by the model's
            actual class labels).

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet. Please train the model first.")

        # Prepare input data
        prepared_data = self.prepare_input_data(input_data)

        # Make prediction
        prediction = self.model.predict(prepared_data)
        probabilities = self.model.predict_proba(prepared_data)

        # Get prediction confidence
        confidence = np.max(probabilities) * 100

        return {
            'predicted_class': int(prediction[0]),
            'confidence': f"{confidence:.2f}%",
            # predict_proba columns follow model.classes_, so label with the
            # real class values and report percentages like 'confidence' does.
            'probabilities': {f"Class {label}": f"{prob * 100:.2f}%"
                              for label, prob in zip(self.model.classes_, probabilities[0])}
        }

# Example usage
if __name__ == "__main__":
    # Hard-coded stand-in record used for the demo prediction below; it has
    # one value for every raw feature column the predictor consumes.
    sample_entry = dict([
        ('Device Model', 'iPhone 12'),
        ('Operating System', 'iOS'),
        ('App Usage Time (min/day)', 187),
        ('Screen On Time (hours/day)', 4.3),
        ('Battery Drain (mAh/day)', 1367),
        ('Number of Apps Installed', 58),
        ('Data Usage (MB/day)', 988),
        ('Age', 31),
        ('Gender', 'Female'),
    ])

    # Build the predictor (this reads the CSV) and fit the model.
    predictor = UserBehaviorPredictor()
    print("Training model...")
    predictor.train_model()

    # NOTE: only the banner is printed here; the demo predicts on the
    # hard-coded sample above rather than calling fetch_latest_entry().
    print("\nFetching latest entry...")

    # Run the demo prediction and report it.
    print("\nMaking prediction for sample entry:")
    prediction_result = predictor.predict(sample_entry)

    print("\nPrediction Results:")
    print(f"Predicted User Behavior Class: {prediction_result['predicted_class']}")
    print(f"Prediction Confidence: {prediction_result['confidence']}")

    print("\nClass Probabilities:")
    for class_name, probability in prediction_result['probabilities'].items():
        print(f"{class_name}: {probability}")

Training model...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Model Performance:
Best parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Training accuracy: 1.000
Testing accuracy: 1.000
Cross-validation scores: [1. 1. 1. 1. 1.]
Mean CV score: 1.000 (+/- 0.000)
OOB score: 1.000

Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        27
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        29
           4       1.00      1.00      1.00        28
           5       1.00      1.00      1.00        27

    accuracy                           1.00       140
   macro avg       1.00      1.00      1.00       140
weighted avg       1.00      1.00      1.00       140


Top 10 Most Important Features:
                            feature  importance
3          Number of Apps Installed    0.257632
4               Data Usage (MB/day)    0.236124
2           Battery Drain