In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Load the Diamonds dataset from seaborn
df = sns.load_dataset("diamonds.csv")

# Display first few rows before preprocessing
display(df.head())

# Drop irrelevant columns and handle missing values
df.dropna(inplace=True)  # Removing missing values

# Manually define categorical features (ensure they are correctly detected)
categorical_features = ["cut", "color", "clarity"]  # Explicitly specifying categorical features
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns.drop("price")

# Convert categorical features to string (if they were mistakenly numerical)
for col in categorical_features:
    df[col] = df[col].astype(str)

# Print detected categorical features
print("Categorical features detected:", categorical_features)

# Verify unique values in categorical features
print("Unique categories per feature before encoding:")
for col in categorical_features:
    print(f"{col}: {df[col].unique()}")

# Print the number of samples and features before encoding and scaling
print(f"Number of samples: {df.shape[0]}")
print(f"Number of features before encoding and scaling: {df.shape[1] - 1}")  # Excluding target variable

"""
# TODO: Task 1 - Uncomment the following section to apply One-Hot Encoding and Feature Scaling
# Instructions:
# - Uncomment the code below.
# - Run the program and compare results with and without encoding/scaling.

from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Encode categorical variables using One-Hot Encoding
encoder = OneHotEncoder(drop="first", sparse_output=False)  # Drop first category to avoid redundancy.
categorical_encoded = encoder.fit_transform(df[categorical_features])

# Convert encoded features into a DataFrame with proper column names
categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out(input_features=categorical_features))

# Print the number of encoded features after dropping the first category
print(f"Number of encoded features after dropping first category: {categorical_encoded_df.shape[1]}")

# Standardize numerical features to bring them to the same scale
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(df[numerical_features])
numerical_scaled_df = pd.DataFrame(numerical_scaled, columns=numerical_features)

# Combine processed features into a single dataset
X = pd.concat([numerical_scaled_df, categorical_encoded_df], axis=1)
"""

# ---- TEMPORARY: Using raw numerical data ----
# TODO: Task 1: Comment the following line to use the encoded and scaled features
# Since encoding and scaling are commented out, we will use only numerical features without transformation.
X = df[numerical_features]  # Directly using numerical features without transformation

# Target variable
y = df["price"]

# Print the number of features after skipping encoding and scaling
print(f"Number of features used in model: {X.shape[1]}")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Multiple Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

"""
# TODO: Task 2 - Uncomment the following section to apply Ridge Regularization (L2)
# Instructions:
# - Uncomment the code below.
# - Compare results with and without regularization.
# - Adjust the alpha parameter to see how regularization strength affects performance.

alpha_value = 1  # Regularization strength, can be adjusted
ridge_model = Ridge(alpha=alpha_value)
ridge_model.fit(X_train, y_train)

# Predict on test set using Ridge Regression
y_pred_ridge = ridge_model.predict(X_test)
"""

# Predict on test set using Linear Regression
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (Linear Regression): {mse:.2f}")
print(f"R-squared Score (Linear Regression): {r2:.4f}")

"""
# TODO: Task 2 - Evaluate Ridge Regression
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Mean Squared Error (Ridge Regression, alpha={alpha_value}): {mse_ridge:.2f}")
print(f"R-squared Score (Ridge Regression, alpha={alpha_value}): {r2_ridge:.4f}")
"""

# Plot actual vs predicted values
plt.figure(figsize=(8,6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.5, label="Linear Regression", color="blue")

"""
# TODO: Task 2 - Add Ridge Regression to the plot
sns.scatterplot(x=y_test, y=y_pred_ridge, alpha=0.5, label="Ridge Regression", color="red")
"""

plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted Prices")
plt.legend()
plt.show()

# ---- LAB EXERCISE INSTRUCTIONS ----
# Step 1: Run the code as it is and note the model performance.
# Step 2: Uncomment Task 1 (Encoding and Scaling), run again, and compare results.
# Step 3: Uncomment Task 2 (Ridge Regularization), run again, and compare results.
# Step 4: Adjust the alpha value in Ridge Regression to observe its effect on performance.


ValueError: 'diamonds.csv' is not one of the example datasets.