# Data Exploration and Analysis
## Machine Learning Exercise 2 - Winter Semester 2025

This notebook explores the dataset, trains a regression tree and a random forest on it, and visualizes and compares their results.

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Add src to path (must run before the project-local imports below)
sys.path.append('../src')

from regression_tree import RegressionTree
from random_forest import RandomForest
from utils import load_dataset, preprocess_data, calculate_metrics

# Plot defaults: seaborn grid theme first, then a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Dataset

In [None]:
# To work on a real dataset, replace the synthetic data below with:
#   X, y = load_dataset('../data/dataset1.csv')
#   X, y = preprocess_data(X, y)

# Placeholder: synthetic regression data with 200 samples and 5 features.
# Only the first three features enter the target; the noise term is a
# standard-normal draw scaled by 0.5.
np.random.seed(42)  # fixed seed so every run produces the same samples
X = np.random.randn(200, 5)
linear_signal = 2 * X[:, 0] + 3 * X[:, 1] - X[:, 2]
y = linear_signal + np.random.randn(200) * 0.5

print(f"Dataset shape: {X.shape}")
print(f"Target shape: {y.shape}")

## 2. Exploratory Data Analysis

In [None]:
# Wrap the arrays in a labelled DataFrame: one column per feature plus the target
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Summary statistics for every column
print("Dataset Statistics:")
print(df.describe())

In [None]:
# Target distribution: histogram on the left, boxplot on the right
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 4))

ax_hist.hist(y, bins=30, edgecolor='black')
ax_hist.set_xlabel('Target Value')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Target Variable')

ax_box.boxplot(y)
ax_box.set_ylabel('Target Value')
ax_box.set_title('Boxplot of Target Variable')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between all features and the target, as an annotated heatmap
corr_matrix = df.corr()

fig, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax_corr)
ax_corr.set_title('Correlation Matrix')
plt.show()

## 3. Train Models

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible. `train_test_split` is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity checks: the partition sizes add up to the full dataset, and features
# and targets stay aligned within each partition.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Fit a single regression tree on the training split and score it on the test split
tree_params = {'max_depth': 5, 'min_samples_split': 5}
tree = RegressionTree(**tree_params)
tree.fit(X_train, y_train)

y_pred_tree = tree.predict(X_test)
metrics_tree = calculate_metrics(y_test, y_pred_tree)

# Report every metric returned by calculate_metrics, four decimals each
print("Regression Tree Performance:")
for metric_name, score in metrics_tree.items():
    print(f"{metric_name}: {score:.4f}")

In [None]:
# Fit a random forest on the training split and score it on the test split
rf_params = {'n_trees': 100, 'max_depth': 5, 'random_state': 42}
rf = RandomForest(**rf_params)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
metrics_rf = calculate_metrics(y_test, y_pred_rf)

# Report every metric returned by calculate_metrics, four decimals each
print("Random Forest Performance:")
for metric_name, score in metrics_rf.items():
    print(f"{metric_name}: {score:.4f}")

## 4. Visualize Results

In [None]:
# Predicted vs actual test-set values for both models, side by side
figures_dir = Path('../results/figures')
# savefig does not create missing folders, so make sure the target exists
figures_dir.mkdir(parents=True, exist_ok=True)

# Endpoints of the y = x reference line; a perfect prediction lies on it
diag_limits = [y_test.min(), y_test.max()]

# One panel per model: (title prefix, predictions, metrics dict)
panels = [
    ('Regression Tree', y_pred_tree, metrics_tree),
    ('Random Forest', y_pred_rf, metrics_rf),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for panel_ax, (model_name, model_pred, model_metrics) in zip(axes, panels):
    panel_ax.scatter(y_test, model_pred, alpha=0.5)
    panel_ax.plot(diag_limits, diag_limits, 'r--', lw=2)
    panel_ax.set_xlabel('Actual Values')
    panel_ax.set_ylabel('Predicted Values')
    panel_ax.set_title(f'{model_name} (R²={model_metrics["R2"]:.4f})')

fig.tight_layout()
fig.savefig(figures_dir / 'predictions_comparison.png', dpi=300)
plt.show()

In [None]:
# Residuals (actual minus predicted) plotted against the predictions of each model
residuals_tree = y_test - y_pred_tree
residuals_rf = y_test - y_pred_rf

figures_dir = Path('../results/figures')
# savefig does not create missing folders, so make sure the target exists
figures_dir.mkdir(parents=True, exist_ok=True)

# One panel per model: (title prefix, predictions, residuals)
panels = [
    ('Regression Tree', y_pred_tree, residuals_tree),
    ('Random Forest', y_pred_rf, residuals_rf),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for panel_ax, (model_name, model_pred, model_residuals) in zip(axes, panels):
    panel_ax.scatter(model_pred, model_residuals, alpha=0.5)
    panel_ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
    panel_ax.set_xlabel('Predicted Values')
    panel_ax.set_ylabel('Residuals')
    panel_ax.set_title(f'{model_name} - Residual Plot')

fig.tight_layout()
fig.savefig(figures_dir / 'residuals_comparison.png', dpi=300)
plt.show()

## 5. Model Comparison

In [None]:
# Collect the test-set metrics of both models in one table (one row per model)
comparison_data = {
    'Model': ['Regression Tree', 'Random Forest'],
    'MSE': [metrics_tree['MSE'], metrics_rf['MSE']],
    'RMSE': [metrics_tree['RMSE'], metrics_rf['RMSE']],
    'MAE': [metrics_tree['MAE'], metrics_rf['MAE']],
    'R²': [metrics_tree['R2'], metrics_rf['R2']]
}

comparison_df = pd.DataFrame(comparison_data)
print("\nModel Comparison:")
print(comparison_df.to_string(index=False))

# Grouped bar chart of the error metrics. R² appears only in the table above;
# the chart's y-axis is labelled as an error value.
metrics_to_plot = ['MSE', 'RMSE', 'MAE']
x = np.arange(len(metrics_to_plot))
width = 0.35

figures_dir = Path('../results/figures')
# savefig does not create missing folders, so make sure the target exists
figures_dir.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
# Row 0 is drawn left of each tick, row 1 right of it
for row_idx, offset in ((0, -width / 2), (1, width / 2)):
    # Selecting a row from a frame that mixes string and float columns may
    # yield an object-dtype Series, so convert the heights to float explicitly.
    heights = comparison_df.loc[row_idx, metrics_to_plot].to_numpy(dtype=float)
    ax.bar(x + offset, heights, width, label=comparison_df.loc[row_idx, 'Model'])

ax.set_ylabel('Error Value')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics_to_plot)
ax.legend()

fig.tight_layout()
fig.savefig(figures_dir / 'model_comparison.png', dpi=300)
plt.show()