# House Price Prediction - Regression Analysis
This notebook walks through exploratory data analysis (EDA), preprocessing, feature selection, model training, and hyperparameter tuning.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score, GridSearchCV, GroupKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import resample


In [None]:
# Read the training and test CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Pairwise correlations between the numeric columns of the training frame.
corr = train_df.corr(numeric_only=True)

# Keep only the SalePrice column, ordered from the strongest positive
# correlation down to the strongest negative one.
price_corr = corr[['SalePrice']].sort_values(by='SalePrice', ascending=False)

plt.figure(figsize=(10, 8))
sns.heatmap(price_corr, annot=True, cmap='coolwarm')
plt.title('Correlation with SalePrice')
plt.show()

In [None]:
# SalePrice distribution for each OverallQual value.
quality_ax = sns.boxplot(data=train_df, x='OverallQual', y='SalePrice')
quality_ax.set_title('SalePrice by OverallQual')
plt.show()

# SalePrice distribution per Neighborhood; a wider figure and rotated tick
# labels keep the neighborhood names legible.
plt.figure(figsize=(14, 6))
hood_ax = sns.boxplot(data=train_df, x='Neighborhood', y='SalePrice')
plt.xticks(rotation=45)
hood_ax.set_title('SalePrice by Neighborhood')
plt.tight_layout()
plt.show()

In [None]:
# Drop high-missing columns
# A column is dropped when more than 40% of the *training* rows are missing;
# the same column list is then removed from the test frame.
missing = train_df.isnull().sum()
drop_cols = missing[missing > 0.4 * len(train_df)].index.tolist()

# Reassign instead of mutating in place so the result does not depend on
# whatever object the names pointed to before this cell ran.
train_df = train_df.drop(columns=drop_cols)
test_df = test_df.drop(columns=drop_cols)

# Impute remaining
# Column groups are taken from the training frame; the target and the row
# identifier are excluded from the numeric feature list.
cat_cols = train_df.select_dtypes(include='object').columns
num_cols = train_df.select_dtypes(include=['int64', 'float64']).drop(columns=['SalePrice', 'Id']).columns

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Imputers are fitted on the training data only and re-used for the test data.
train_df[num_cols] = num_imputer.fit_transform(train_df[num_cols])
train_df[cat_cols] = cat_imputer.fit_transform(train_df[cat_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

# One-hot encoding
# Both frames are cast to the categorical dtypes learned from the training
# data before encoding. Encoding each frame on its own with drop_first=True is
# unsafe: if the test set lacks the first (sorted) level of a column, a
# different level gets dropped there, and the later reindex would silently
# encode those rows as the training baseline category. With a shared dtype the
# same level is dropped in both frames; levels that appear only in the test
# set become missing and therefore encode as all zeros.
cat_dtypes = {col: train_df[col].astype('category').dtype for col in cat_cols}
train_encoded = pd.get_dummies(train_df.astype(cat_dtypes), drop_first=True)
test_encoded = pd.get_dummies(test_df.astype(cat_dtypes), drop_first=True)

X = train_encoded.drop(columns=['SalePrice', 'Id'])
y = train_encoded['SalePrice']
# Align the test matrix to the training feature columns (same set, same order).
X_test = test_encoded.reindex(columns=X.columns, fill_value=0)

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both the training and the test matrices.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Balance the training set across the SalePrice range: split the target into
# 10 quantile bins and draw the same number of rows from each bin.
train_encoded['SalePriceBin'] = pd.qcut(train_encoded['SalePrice'], q=10, labels=False)

# Bootstrap 150 rows (with replacement) from every bin. Iterating over
# unique() visits the bins in order of first appearance in the frame.
parts = [
    resample(
        train_encoded[train_encoded['SalePriceBin'] == label],
        replace=True,
        n_samples=150,
        random_state=1,
    )
    for label in train_encoded['SalePriceBin'].unique()
]

# The original row labels are kept (no ignore_index), so repeated draws of the
# same row share an index value and stay traceable to their source row.
train_bal = pd.concat(parts)
X_bal = train_bal.drop(columns=['SalePrice', 'Id', 'SalePriceBin'])
y_bal = train_bal['SalePrice']

# Use the scaler that was already fitted on X: transform only, do not re-fit.
# Re-fitting here would overwrite the shared scaler's statistics and leave
# X_bal_scaled on a different scale from the X_test_scaled computed earlier.
X_bal_scaled = scaler.transform(X_bal)

In [None]:
# Random Forest
param_rf = {'n_estimators': [50, 100], 'max_depth': [5, 10]}
grid_rf = GridSearchCV(RandomForestRegressor(random_state=0), param_rf, scoring='neg_root_mean_squared_error', cv=3)
grid_rf.fit(X_bal_scaled, y_bal)

# Gradient Boosting
param_gb = {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}
grid_gb = GridSearchCV(GradientBoostingRegressor(random_state=0), param_gb, scoring='neg_root_mean_squared_error', cv=3)
grid_gb.fit(X_bal_scaled, y_bal)

print("Best RF:", grid_rf.best_params_)
print("Best GB:", grid_gb.best_params_)
print("RF CV RMSE:", -grid_rf.best_score_)
print("GB CV RMSE:", -grid_gb.best_score_)