In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression

import warnings
warnings.filterwarnings("ignore")

# Load the training and test datasets. The Id columns are kept aside
# (test_ID keys the submission file) and removed from the feature frames.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_ID = train['Id']
test_ID = test['Id']
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

# Remove features with excessive missing values (judged on train only).
# .copy() makes both frames independent objects, so the in-place fills and
# the column assignments below never operate on a slice of the raw data.
na_threshold = 0.9
train = train.loc[:, train.isnull().mean() < na_threshold].copy()
test = test[train.columns.drop('SalePrice')].copy()

# Fill missing values with the per-column mode learned on train.
# The same fill values are applied to test: every column kept above has at
# least one non-null value in train, so a mode always exists, whereas a column
# that is entirely missing in test has no mode of its own and would keep its
# NaNs (which the feature selector below cannot transform).
train_modes = train.mode().iloc[0]
train.fillna(train_modes, inplace=True)
test.fillna(train_modes.drop('SalePrice'), inplace=True)

# Feature Engineering: Combine square footage features
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']
test['TotalSF'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

# Prepare target variable (log1p of the price; predictions are mapped back
# with expm1 when the submission is written).
y = np.log1p(train["SalePrice"])
X = train.drop("SalePrice", axis=1)

# Encode categorical variables using one-hot encoding. Aligning with
# join='left' keeps exactly the training columns; dummy columns that never
# occur in test are added there filled with 0.
X = pd.get_dummies(X)
test = pd.get_dummies(test)
X, test = X.align(test, join='left', axis=1, fill_value=0)

# Choose the train/validation rows BEFORE fitting any transformer.
# SelectKBest scores features against y, and PCA learns its axes from the
# data; fitting either on rows that later serve as validation data leaks
# that data into the features and makes the validation scores optimistic.
# The row partition depends only on the number of rows and random_state.
train_rows, val_rows = train_test_split(
    np.arange(X.shape[0]), test_size=0.2, random_state=42
)

# Select top 100 features using univariate regression (fitted on the
# training rows only, then applied to every row and to the test set).
selector = SelectKBest(score_func=f_regression, k=100)
selector.fit(X.iloc[train_rows], y.iloc[train_rows])
X = selector.transform(X)
test = selector.transform(test)

# Apply PCA for dimensionality reduction (again fitted on training rows only).
pca = PCA(n_components=50)
pca.fit(X[train_rows])
X_pca = pca.transform(X)
test_pca = pca.transform(test)

# Split data into training and validation sets using the rows chosen above.
X_train, X_val = X_pca[train_rows], X_pca[val_rows]
y_train, y_val = y.iloc[train_rows], y.iloc[val_rows]

# Helper shared by every model comparison below.
def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and report validation scores.

    Prints a single line containing ``name`` (left-aligned, padded to 25
    characters) followed by the validation RMSE and R², each with four
    decimals.

    Args:
        name: Label shown at the start of the printed line.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        X_val: Validation features passed to ``model.predict``.
        y_val: Validation targets the predictions are scored against.

    Returns:
        tuple: ``(model, rmse)`` - the estimator that was passed in, now
        fitted, and its root mean squared error on the validation split.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    # RMSE is the square root of the mean squared validation error.
    mse = mean_squared_error(y_val, val_predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val, val_predictions)

    print(f"{name:<25} | RMSE: {rmse:.4f} | R²: {r2:.4f}")
    return model, rmse

# Evaluate multiple models on the same train/validation split
print("\nEvaluating Models:")

# Inputs for the single-feature baseline: only the first column, kept 2-D.
simple_X = X_train[:, [0]]
simple_val = X_val[:, [0]]

# Degree-2 polynomial expansion, fitted on the training split and reused
# unchanged for the validation split.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)

# Standardised inputs for the support vector regressor; the scaler is fitted
# on the training split and reused unchanged for the validation split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# One entry per model, in report order:
# (label, estimator, training inputs, validation inputs).
model_runs = [
    # 1. Simple Linear Regression (single feature)
    ("Simple Linear Regression", LinearRegression(), simple_X, simple_val),
    # 2. Multiple Linear Regression
    ("Multiple Linear Regression", LinearRegression(), X_train, X_val),
    # 3. Polynomial Regression (degree 2)
    ("Polynomial Regression", LinearRegression(), X_poly, X_val_poly),
    # 4. Ridge Regression
    ("Ridge Regression", Ridge(alpha=10), X_train, X_val),
    # 5. Lasso Regression
    ("Lasso Regression", Lasso(alpha=0.001), X_train, X_val),
    # 6. ElasticNet Regression
    ("ElasticNet Regression", ElasticNet(alpha=0.001, l1_ratio=0.5), X_train, X_val),
    # 7. Decision Tree Regressor
    ("Decision Tree Regressor", DecisionTreeRegressor(max_depth=5), X_train, X_val),
    # 8. Random Forest Regressor
    (
        "Random Forest Regressor",
        RandomForestRegressor(n_estimators=100, random_state=42),
        X_train,
        X_val,
    ),
    # 9. Support Vector Regressor (scaled)
    (
        "Support Vector Regressor",
        SVR(kernel='rbf', C=20, epsilon=0.2),
        X_scaled,
        X_val_scaled,
    ),
    # 10. K-Nearest Neighbors Regressor
    ("KNN Regressor", KNeighborsRegressor(n_neighbors=5), X_train, X_val),
]

# Every model is scored against the same targets; only the inputs vary.
for run_label, run_model, run_X_train, run_X_val in model_runs:
    evaluate_model(run_label, run_model, run_X_train, y_train, run_X_val, y_val)

# Grid Search for Random Forest Hyperparameters
print("\nPerforming Grid Search on Random Forest...")
# Six candidates (2 forest sizes x 3 depth limits), each scored by 3-fold
# cross-validation on the training split using negated RMSE.
params = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10, None],
)
grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    params,
    cv=3,
    scoring='neg_root_mean_squared_error',
)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
print(f"Best Parameters for Random Forest: {grid.best_params_}")

# Final Evaluation using Best Model
# The MSE is computed once on the validation split and reused for the RMSE.
y_pred = best_model.predict(X_val)
final_mse = mean_squared_error(y_val, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_val, y_pred)
print("\nFinal Evaluation Metrics:")
print(f"Mean Squared Error (MSE) : {final_mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {final_rmse:.4f}")
print(f"R-squared (R²) Score      : {final_r2:.4f}")

# t-SNE Visualization for dimensionality reduction (2D)
# Embeds the PCA-reduced training features in two dimensions and colours
# every point by its target value y (the log-transformed sale price).
print("\nGenerating t-SNE visualization...")
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_pca)

fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='coolwarm', s=5)
ax.set_title("t-SNE Projection of House Prices")
fig.colorbar(points, ax=ax, label='Log SalePrice')
plt.show()

# Generate final submission using best model
# The model predicts log1p(SalePrice); expm1 maps the predictions back to
# the original price scale before they are written out.
test_log_preds = best_model.predict(test_pca)
final_preds = np.expm1(test_log_preds)

submission_columns = {
    "Id": test_ID,
    "SalePrice": final_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' has been created.")



Evaluating Models:
Simple Linear Regression  | RMSE: 0.4188 | R²: 0.0600
Multiple Linear Regression | RMSE: 0.1487 | R²: 0.8815
Polynomial Regression     | RMSE: 10.0391 | R²: -539.0767
Ridge Regression          | RMSE: 0.1487 | R²: 0.8815
Lasso Regression          | RMSE: 0.1492 | R²: 0.8807
ElasticNet Regression     | RMSE: 0.1487 | R²: 0.8816
Decision Tree Regressor   | RMSE: 0.2693 | R²: 0.6115
Random Forest Regressor   | RMSE: 0.1920 | R²: 0.8024
Support Vector Regressor  | RMSE: 0.2139 | R²: 0.7547
KNN Regressor             | RMSE: 0.2198 | R²: 0.7410

Performing Grid Search on Random Forest...
