<a href="https://colab.research.google.com/github/k-ganda/database_design_pld5/blob/main/database_design.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
import requests
import joblib

class UserBehaviorPredictor:
    """Train a random-forest classifier on the user-behavior dataset and
    use it to classify new entries.

    Training one-hot encodes the categorical columns, standardizes the
    numerical columns, fits the forest and persists both the model and the
    scaler with joblib. Prediction applies the same preprocessing to a
    single input record before calling the model.
    """

    # Raw columns that are one-hot encoded for both training and prediction.
    CATEGORICAL_COLS = ('Device Model', 'Operating System', 'Gender')

    # Raw columns that are standardized with ``self.scaler``.
    NUMERICAL_COLS = (
        'App Usage Time (min/day)', 'Screen On Time (hours/day)',
        'Battery Drain (mAh/day)', 'Number of Apps Installed',
        'Data Usage (MB/day)', 'Age',
    )

    # Files written by ``train_model`` and read back on demand.
    MODEL_PATH = 'user_behavior_rf_model.joblib'
    SCALER_PATH = 'scaler.joblib'

    def __init__(self, dataset_path='user_behavior_dataset.csv'):
        """Load the training dataset and create an unfitted scaler.

        Args:
            dataset_path: CSV file to read; defaults to the file name the
                original code hard-coded.
        """
        # Load the dataset
        print("Loading dataset...")
        self.df = pd.read_csv(dataset_path)
        print("Dataset loaded with shape:", self.df.shape)

        # Initialize model and scaler
        self.model = None
        self.scaler = StandardScaler()

    def _ensure_model_loaded(self):
        """Load the persisted model and scaler if no model is in memory."""
        # ``is None`` rather than truthiness: an estimator's truth value
        # goes through ``__len__`` and is not a reliable "is loaded" check.
        if self.model is None:
            print("Loading pre-trained model and scaler...")
            # NOTE: joblib files are pickles; only load files you trust.
            self.model = joblib.load(self.MODEL_PATH)
            self.scaler = joblib.load(self.SCALER_PATH)

    def prepare_data(self):
        """Build the training feature matrix and target vector.

        Returns:
            (X, y): ``X`` has the categorical columns one-hot encoded and
            the numerical columns standardized; ``y`` is the
            'User Behavior Class' column.

        Side effect: fits ``self.scaler`` on the numerical columns.
        """
        # Separate features and target
        print("Preparing data...")
        X = self.df.drop(['User ID', 'User Behavior Class'], axis=1)
        y = self.df['User Behavior Class']

        # Convert categorical variables
        X = pd.get_dummies(X, columns=list(self.CATEGORICAL_COLS))
        print("Categorical columns converted.")

        # Scale numerical features
        # NOTE(review): the scaler is fitted on the full dataset, i.e.
        # before the train/test split performed in train_model.
        numerical_cols = list(self.NUMERICAL_COLS)
        X[numerical_cols] = self.scaler.fit_transform(X[numerical_cols])
        print("Numerical columns scaled.")

        return X, y

    def train_model(self):
        """Fit the random forest, report accuracy and persist the artifacts.

        Prints 5-fold cross-validation scores on the training split, then
        training and testing accuracy, and finally writes the model and the
        scaler to ``MODEL_PATH`` and ``SCALER_PATH``.
        """
        # Prepare data
        X, y = self.prepare_data()

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        print("Data split into training and testing sets.")

        # Initialize Random Forest model
        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )

        # Cross-validation to assess overfitting
        print("Running cross-validation for model accuracy...")
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=5)
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean cross-validation accuracy: {np.mean(cv_scores):.3f}")

        # Train the model
        print("Training model...")
        self.model.fit(X_train, y_train)

        # Print model performance
        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)
        print(f"Training accuracy: {train_score:.3f}")
        print(f"Testing accuracy: {test_score:.3f}")

        # Save the model and scaler
        joblib.dump(self.model, self.MODEL_PATH)
        joblib.dump(self.scaler, self.SCALER_PATH)
        print("Model and scaler saved.")

    def fetch_latest_entry(self, api_url="https://database-design-pld5.onrender.com/docs#/",
                           timeout=10):
        """
        Fetch the latest entry from the API

        Args:
            api_url: URL to GET. NOTE(review): the default looks like an
                interactive docs page rather than a JSON endpoint — confirm
                the real endpoint and pass it explicitly.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the caller indefinitely.

        Returns:
            The decoded JSON body on HTTP 200, otherwise ``None`` (the
            reason is printed).
        """
        print("Fetching the latest entry from API...")
        try:
            response = requests.get(api_url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"Error connecting to API: {str(e)}")
            return None

        if response.status_code != 200:
            print(f"Error fetching data: {response.status_code}")
            return None

        # A non-JSON body is a decoding problem, not a connection problem;
        # report it as such instead of lumping it with network errors.
        try:
            data = response.json()
        except ValueError as e:
            print(f"Error decoding API response as JSON: {str(e)}")
            return None

        print("Latest entry fetched:", data)
        return data

    def prepare_input_data(self, input_data):
        """
        Prepare a single input entry for prediction

        Args:
            input_data: Mapping of raw column name to value for one record.
                It must contain every column in ``NUMERICAL_COLS`` and
                ``CATEGORICAL_COLS``; a missing one raises ``KeyError``.

        Returns:
            A one-row DataFrame whose columns match, in order, the feature
            names the model was fitted with.
        """
        # The column layout comes from the fitted model, so it must be
        # available before anything else happens.
        self._ensure_model_loaded()

        # Convert input to DataFrame
        input_df = pd.DataFrame([input_data])
        print("Input data converted to DataFrame.")

        # Drop User ID if present
        input_df = input_df.drop(columns=['User ID'], errors='ignore')

        # Apply the same standardization the training features received;
        # without it the model sees values on a different scale.
        numerical_cols = list(self.NUMERICAL_COLS)
        input_df[numerical_cols] = self.scaler.transform(input_df[numerical_cols])

        # Create dummy variables
        input_df = pd.get_dummies(input_df, columns=list(self.CATEGORICAL_COLS))

        # Align with the training layout: dummy columns this record did not
        # produce are filled with 0, unknown columns are dropped, and the
        # order follows the model's feature names.
        input_df = input_df.reindex(columns=self.model.feature_names_in_, fill_value=0)
        print("Input data prepared for prediction.")

        return input_df

    def predict(self, input_data):
        """
        Make prediction for new input data

        Args:
            input_data: Mapping of raw column name to value for one record.

        Returns:
            dict with 'predicted_class' (int), 'confidence' (highest class
            probability as a percentage string) and 'probabilities' (one
            percentage string per class, keyed by the model's class label).
        """
        # Load the model and scaler if not already loaded. This has to
        # happen before the input is prepared, because preparation reads
        # the model's feature names and uses the scaler.
        self._ensure_model_loaded()

        # Prepare input data
        prepared_data = self.prepare_input_data(input_data)

        # Make prediction
        prediction = self.model.predict(prepared_data)
        probabilities = self.model.predict_proba(prepared_data)

        # Get prediction confidence
        confidence = np.max(probabilities) * 100
        print(f"Prediction made with confidence: {confidence:.2f}%")

        # predict_proba columns follow model.classes_, so label each value
        # with its class and express it on the same 0-100 scale as
        # 'confidence'.
        return {
            'predicted_class': int(prediction[0]),
            'confidence': f"{confidence:.2f}%",
            'probabilities': {f"Class {label}": f"{prob * 100:.2f}%"
                              for label, prob in zip(self.model.classes_, probabilities[0])}
        }

# Example usage
if __name__ == "__main__":
    # Train on the local dataset, then classify whatever the API returns.
    behavior_predictor = UserBehaviorPredictor()
    behavior_predictor.train_model()

    api_entry = behavior_predictor.fetch_latest_entry()

    # fetch_latest_entry yields None on failure; skip prediction then.
    if api_entry:
        outcome = behavior_predictor.predict(api_entry)
        print("Prediction result:", outcome)


Loading dataset...
Dataset loaded with shape: (700, 11)
Preparing data...
Categorical columns converted.
Numerical columns scaled.
Data split into training and testing sets.
Running cross-validation for model accuracy...
Cross-validation scores: [1. 1. 1. 1. 1.]
Mean cross-validation accuracy: 1.000
Training model...
Training accuracy: 1.000
Testing accuracy: 1.000
Model and scaler saved.
Fetching the latest entry from API...
Error connecting to API: Expecting value: line 2 column 5 (char 5)
