In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
from joblib import dump

# Load data from CSV file
def load_data(csv_file_path):
    """Read the dataset CSV, coerce every column to numeric, and print a summary.

    Parameters
    ----------
    csv_file_path : str or path-like
        Location handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every column passed through
        ``pd.to_numeric(..., errors='coerce')``. Values that cannot be parsed
        as numbers are NaN in the result; no rows are removed here.

    Raises
    ------
    KeyError
        If the file has no 'Human Status' column (raised while printing the
        class distribution).
    """
    # Read the CSV file
    df = pd.read_csv(csv_file_path)

    # Convert columns to numeric (in case they're loaded as strings).
    # errors='coerce' replaces unparsable values with NaN without complaint,
    # so count the values that were present before and missing after, per
    # column, to make that data loss visible.
    coerced_counts = {}
    for col in df.columns:
        numeric_col = pd.to_numeric(df[col], errors='coerce')
        lost = int((numeric_col.isna() & df[col].notna()).sum())
        if lost:
            coerced_counts[col] = lost
        df[col] = numeric_col

    if coerced_counts:
        print("Warning: non-numeric values were converted to NaN:")
        for col, lost in coerced_counts.items():
            print(f"  {col}: {lost}")

    # Display data information
    print("Data overview:")
    print(df.head())
    print("\nClass distribution:")
    print(df['Human Status'].value_counts())

    return df

# Prepare data for modeling
def prepare_data(df):
    """Split the dataset into scaled train/test feature matrices and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Receiver 1', 'Receiver 2', 'Receiver 3'
        (features) and 'Human Status' (target).

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)`` where
        ``scaler`` is the ``StandardScaler`` fitted on the training features.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    feature_cols = ['Receiver 1', 'Receiver 2', 'Receiver 3']
    target_col = 'Human Status'

    # load_data() coerces unparsable cells to NaN. A NaN target is not a
    # usable class label and NaN features would be carried through the
    # scaler, so keep only rows where every feature and the label are present.
    complete = df.dropna(subset=feature_cols + [target_col])
    n_dropped = len(df) - len(complete)
    if n_dropped:
        print(f"Dropping {n_dropped} row(s) with missing feature or target values")

    # Extract features and target
    X = complete[feature_cols].values
    y = complete[target_col].values

    # Split data (80/20, stratified on the label, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features: fit on the training split only, then apply the same
    # transform to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

# Function to train the Random Forest model with hyperparameter tuning
def train_random_forest(X_train, y_train):
    """Grid-search a RandomForestClassifier and return the best estimator.

    The search covers 3 x 4 x 3 x 3 = 108 parameter combinations, scored by
    accuracy with 5-fold cross-validation, using all available cores
    (``n_jobs=-1``). The winning parameters and their cross-validation score
    are printed.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    search_space = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )

    print("Performing grid search for best hyperparameters...")
    searcher = GridSearchCV(
        RandomForestClassifier(random_state=42),  # fixed seed for the forest
        search_space,
        scoring='accuracy',
        cv=5,
        verbose=1,
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    # Report the winning configuration
    print(f"Best parameters: {searcher.best_params_}")
    print(f"Best cross-validation score: {searcher.best_score_:.4f}")

    return searcher.best_estimator_

# Evaluate model and visualize results
def evaluate_model(model, X_test, y_test):
    """Score the model on the test split and save two diagnostic plots.

    Prints the test accuracy and a per-class classification report, then
    writes 'rf_confusion_matrix.png' and 'rf_feature_importance.png' to the
    current working directory.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``feature_importances_``.
    X_test, y_test
        Held-out features and their true labels.

    Returns
    -------
    The predictions ``model.predict(X_test)``.
    """
    # The display names below are fixed for classes 0/1/2, so pin the label
    # order explicitly. Without ``labels=`` the metrics infer the classes from
    # the data, and a class missing from y_test/y_pred would make the report
    # fail on the target_names length and misalign the heatmap ticks.
    class_ids = [0, 1, 2]
    report_names = ['Empty (0)', 'Stationary (1)', 'Moving (2)']
    tick_names = ['Empty', 'Stationary', 'Moving']

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy: {accuracy:.4f}")

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred,
                                labels=class_ids,
                                target_names=report_names))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_names,
                yticklabels=tick_names,
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix - Random Forest')
    fig.savefig('rf_confusion_matrix.png')
    plt.close(fig)  # release the figure; nothing is shown inline

    # Feature importance (order matches the feature columns used for training)
    feature_importance = model.feature_importances_
    features = ['Receiver 1', 'Receiver 2', 'Receiver 3']

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=feature_importance, y=features, ax=ax)
    ax.set_title('Feature Importance - Random Forest')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    fig.tight_layout()
    fig.savefig('rf_feature_importance.png')
    plt.close(fig)

    return y_pred

# Function to make predictions on new data
def predict_human_status(model, new_data, scaler):
    """Classify new RSSI readings with a fitted model and scaler.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    new_data : array-like
        Either a 2-D collection of samples (one row per sample) or a single
        1-D sample, which is treated as one row.
    scaler
        Fitted transformer exposing ``transform``; applied before prediction.

    Returns
    -------
    tuple
        ``(predicted_classes, predicted_labels, predicted_proba)`` where
        ``predicted_labels`` is a list of human-readable names for the
        predicted classes.

    Raises
    ------
    KeyError
        If the model predicts a class other than 0, 1 or 2.
    """
    # Accept a single sample such as [-37, -42, -34] by promoting it to a
    # one-row matrix; 2-D input passes through unchanged.
    samples = np.asarray(new_data)
    if samples.ndim == 1:
        samples = samples.reshape(1, -1)

    # Scale new data
    new_data_scaled = scaler.transform(samples)

    # Make predictions
    predicted_classes = model.predict(new_data_scaled)
    predicted_proba = model.predict_proba(new_data_scaled)

    # Map predictions to labels
    status_map = {0: 'Empty Room', 1: 'Stationary Person', 2: 'Moving Person'}
    predicted_labels = [status_map[cls] for cls in predicted_classes]

    return predicted_classes, predicted_labels, predicted_proba

# Main execution function
def main(csv_file_path):
    """Run the full pipeline: load, split, train, evaluate, save, and demo.

    Writes 'rssi_rf_model.joblib' and 'rssi_rf_scaler.joblib' to the current
    working directory and prints predictions for three hard-coded samples.

    Parameters
    ----------
    csv_file_path
        Dataset location, forwarded to ``load_data``.

    Returns
    -------
    tuple
        ``(rf_model, scaler)`` - the tuned classifier and the fitted scaler.
    """
    # Data -> train/test matrices
    dataset = load_data(csv_file_path)
    X_train, X_test, y_train, y_test, scaler = prepare_data(dataset)

    # Hyperparameter search + fit
    print("Training Random Forest model...")
    rf_model = train_random_forest(X_train, y_train)

    # Metrics and plots; the returned predictions are not needed here
    print("Evaluating model...")
    evaluate_model(rf_model, X_test, y_test)

    # Persist both artefacts: the scaler is required to preprocess new data
    dump(rf_model, 'rssi_rf_model.joblib')
    dump(scaler, 'rssi_rf_scaler.joblib')
    print("Model saved as 'rssi_rf_model.joblib'")
    print("Scaler saved as 'rssi_rf_scaler.joblib'")

    # Demonstrate inference on a few hand-picked readings
    print("\nExample of making predictions with sample data:")
    demo_samples = np.array([
        [-37, -42, -34],  # Example of empty room
        [-41, -43, -35],  # Example of stationary person
        [-38, -35, -37]   # Example of moving person
    ])
    demo_classes, demo_labels, _ = predict_human_status(rf_model, demo_samples, scaler)

    rows = zip(demo_samples, demo_classes, demo_labels)
    for number, (rssi, cls, label) in enumerate(rows, start=1):
        print(f"Sample {number}: RSSI values {rssi} -> Predicted class: {cls} ({label})")

    return rf_model, scaler

if __name__ == "__main__":
    # Dataset location. It is passed to main(), which forwards it to
    # load_data() and from there to pd.read_csv; replace it with the path to
    # your own CSV file if needed.
    csv_file_path = "https://raw.githubusercontent.com/ishancoderr/WiSee/refs/heads/main/data/finalDataset.csv"

    # Run the whole pipeline and keep the fitted model and scaler as
    # module-level names so they remain available after this block runs.
    rf_model, scaler = main(csv_file_path)

Data overview:
   Tile No  Receiver 1  Receiver 2  Receiver 3  Human Status
0        1         -37         -42         -34             0
1        1         -37         -42         -32             0
2        1         -37         -42         -32             0
3        1         -36         -41         -34             0
4        1         -39         -42         -35             0

Class distribution:
Human Status
2    818
1    734
0    629
Name: count, dtype: int64
Training Random Forest model...
Performing grid search for best hyperparameters...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation score: 0.9730
Evaluating model...
Test accuracy: 0.9748

Classification Report:
                precision    recall  f1-score   support

     Empty (0)       0.98      0.98      0.98       126
Stationary (1)       0.96      0.98      0.97       147
    Moving