In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, r2_score
)

In [None]:
# step 2: read the cleaned, aggregated dataset from disk
data = pd.read_csv(filepath_or_buffer='../data/processed/aggregated_dataset.csv')
print("Cleaned dataset shape:", data.shape)
# preview the first five rows (rendered as the cell's output)
data.head(n=5)

In [None]:
# step 3: separate the feature matrix from the two prediction targets
# features: every column except the target columns NLOS and RANGE
X = data.drop(['NLOS', 'RANGE'], axis='columns')

# targets: NLOS is the class label (0 = LOS, 1 = NLOS); RANGE is the regression target
y_class, y_reg = data['NLOS'], data['RANGE']

In [None]:
# step 4: split data into training and testing sets
# One stratified split is applied to the features and BOTH targets at once, so
# y_reg_train / y_reg_test are row-aligned with X_train / X_test.
# Splitting the regression target in a separate, non-stratified call selects
# different rows even with the same random_state, which would pair each feature
# row with the RANGE value of some other sample.
(
    X_train, X_test,
    y_class_train, y_class_test,
    y_reg_train, y_reg_test,
) = train_test_split(
    X, y_class, y_reg,
    test_size=0.3,
    stratify=y_class,  # keep the LOS/NLOS ratio the same in both partitions
    random_state=42
)

# sanity check: every target must index exactly the same rows as its feature matrix
assert X_train.index.equals(y_class_train.index) and X_train.index.equals(y_reg_train.index)
assert X_test.index.equals(y_class_test.index) and X_test.index.equals(y_reg_test.index)

# verify shapes
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

In [None]:
# step 5: build and fit the random forest models
# Both forests use 100 trees, a fixed seed, and n_jobs=-1 for parallel training.
# fit() returns the estimator itself, so construction and training can be chained.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train, y_class_train
)

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# kept as the final expression so the fitted regressor remains the cell's output
rf_regressor.fit(X_train, y_reg_train)

In [None]:
# step 6: evaluate the classifier on the held-out test set
y_class_pred = rf_classifier.predict(X_test)

# confusion matrix, per-class report, and overall accuracy
cm = confusion_matrix(y_class_test, y_class_pred)
report = classification_report(y_class_test, y_class_pred)
accuracy = accuracy_score(y_class_test, y_class_pred)

print(f"Classification Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

# draw the confusion matrix as an annotated heatmap; heatmap() returns the Axes,
# which is used to set both axis labels and the title in one call
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['LOS', 'NLOS'],
    yticklabels=['LOS', 'NLOS'],
).set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

In [None]:
# regression metrics
# predict ranges for the held-out test set
y_reg_pred = rf_regressor.predict(X_test)

# R² and root-mean-squared error of the predictions
r2 = r2_score(y_reg_test, y_reg_pred)
rmse = np.sqrt(mean_squared_error(y_reg_test, y_reg_pred))

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# scatter predictions against the true values; the dashed diagonal spans the
# observed min..max of the true range and marks where predicted == actual
plt.figure(figsize=(8, 6))
plt.scatter(y_reg_test, y_reg_pred, alpha=0.3)
plt.plot(
    [y_reg_test.min(), y_reg_test.max()],
    [y_reg_test.min(), y_reg_test.max()],
    'k--',
    lw=2,
)
plt.gca().set(xlabel='Actual Range', ylabel='Predicted Range', title='Actual vs Predicted Range')
plt.show()

In [None]:
# step 7: feature importance analysis
# one table per model: feature names paired with the forest's importances, largest first
feature_importances_class = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_classifier.feature_importances_}
).sort_values(by='Importance', ascending=False)

feature_importances_reg = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_regressor.feature_importances_}
).sort_values(by='Importance', ascending=False)

# ten highest-ranked features for the classifier (barplot() returns the Axes)
plt.figure(figsize=(10, 6))
sns.barplot(
    data=feature_importances_class.iloc[:10], x='Importance', y='Feature'
).set_title('Top 10 Features for LOS/NLOS Classification')
plt.show()

# ten highest-ranked features for the regressor
plt.figure(figsize=(10, 6))
sns.barplot(
    data=feature_importances_reg.iloc[:10], x='Importance', y='Feature'
).set_title('Top 10 Features for Range Prediction')
plt.show()

In [None]:
# Re-check classifier accuracy on the test set.
# accuracy_score is already imported in the notebook's import cell, so it is not
# re-imported here; keeping imports in one place makes dependencies visible.
y_pred = rf_classifier.predict(X_test)

# fraction of test samples whose predicted label equals the true label
accuracy = accuracy_score(y_class_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")