# House Prices Regression Analysis
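This notebook walks through a complete regression pipeline on the House Prices data: loading, missing-value handling, EDA, encoding and scaling, random-forest feature importance, and training-set evaluation of linear (and polynomial) regression models.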

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
# Redundant in effect with the blanket filter above: FutureWarning is already ignored by it.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Plot aesthetics
# seaborn "paper" context, with font sizes overridden through the rc mapping
sns.set_context("paper", rc={"font.size":15, "axes.titlesize":15, "axes.labelsize":15})  
# Assigned after set_context, so axes.labelsize ends up at 14 rather than the 15 passed above
plt.rcParams['axes.labelsize']  = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Scikit-learn utilities
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Regression models
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor

# Statsmodels (optional for OLS summary)
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, f1_score

# Dimensionality reduction (optional)
from sklearn.decomposition import PCA

# Inline plotting for Jupyter
%matplotlib inline


## Step 2: Load and Inspect the Data

In [None]:
# Read both splits from the working directory
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Treat any literal 'NA' string left in an object column as a missing value
for df in (train, test):
    object_cols = df.select_dtypes(include='object').columns
    for col in object_cols:
        df[col] = df[col].replace('NA', np.nan)

train.head()

## Step 3: Handle Missing Values

In [None]:
# Inspect dtypes and missing values for both splits.
# Only columns that actually contain nulls are printed, most affected first, so the
# output is not dominated by zero-count columns. (A bare `frame.isnull().any()` in the
# middle of a cell would display nothing, so it is not used here.)
for label, frame in (('train', train), ('test', test)):
    frame.info()
    missing_counts = frame.isnull().sum()
    print(f"\nColumns with missing values in {label}:")
    print(missing_counts[missing_counts > 0].sort_values(ascending=False))
    print()

# Drop columns with >40% missing values (share measured on train, applied to both frames)
missing_share = train.isnull().mean()
drop_cols = train.columns[missing_share > 0.4]
for frame in (train, test):
    frame.drop(columns=drop_cols, inplace=True)

# Column groups to impute: numeric predictors (without Id and the target) and object columns
numeric_frame = train.select_dtypes(include=['float64', 'int64'])
num_cols = numeric_frame.drop(columns=['Id', 'SalePrice']).columns
cat_cols = train.select_dtypes(include='object').columns

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Numeric columns: statistics are fitted on train and reused on test
train[num_cols] = num_imputer.fit_transform(train[num_cols])
test[num_cols] = num_imputer.transform(test[num_cols])

# Object columns: most frequent train value is fitted on train and reused on test
train[cat_cols] = cat_imputer.fit_transform(train[cat_cols])
test[cat_cols] = cat_imputer.transform(test[cat_cols])

## Step 4: Exploratory Data Analysis (EDA)

In [None]:
# Correlation heatmap: every int64/float64 column against SalePrice, strongest first
corr = train.select_dtypes(include=['float64', 'int64']).corr()
saleprice_corr = corr[['SalePrice']].sort_values(by='SalePrice', ascending=False)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(saleprice_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation with SalePrice')
plt.show()

## Step 5: Encoding and Scaling

In [None]:
# One-hot encoding
# Train drops the first level of each categorical as its baseline. Test keeps every
# level: the reindex below then selects exactly the dummy columns train produced.
# Applying drop_first to test on its own would pick the baseline from the test data,
# so if test lacked train's first level a different level would be dropped and those
# rows would be encoded as all zeros, i.e. silently treated as the train baseline.
train_encoded = pd.get_dummies(train, drop_first=True)
test_encoded = pd.get_dummies(test)

# Align columns: test gets train's feature columns, with 0 for dummies it never produced
X = train_encoded.drop(columns=['SalePrice', 'Id'])
y = train_encoded['SalePrice']
X_test = test_encoded.reindex(columns=X.columns, fill_value=0)

# Scaling: statistics are fitted on train only and reused for test
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X_test_scaled_df = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

## Step 6: Feature Importance

In [None]:
# Fit a random forest on the scaled features and rank the features by its importances
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_scaled_df, y)

importances = pd.Series(rf.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(20)

# Horizontal bar chart of the 20 highest-ranked features
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_features.values, y=top_features.index, ax=ax)
ax.set_title('Top 20 Feature Importances (Random Forest)')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

## Step 7: Modeling and Evaluation

In [None]:
# Define models (Pipeline is already imported in the imports cell at the top)
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.01),
    'Polynomial Regression (deg 2)': Pipeline([
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('linreg', LinearRegression())])
}

# Use top features for poly regression
X_poly = X_scaled_df[top_features.index]

# Quartile labels of the true target do not depend on the model, so compute them once
y_bin = pd.qcut(y, q=4, labels=False)

# NOTE: every metric below is computed on the same data the model was fitted on
# (in-sample), which is why the columns are labelled "Train".
results = []
for name, model in models.items():
    # The polynomial pipeline only sees the top-ranked features; the others see all of them
    features = X_poly if 'Polynomial' in name else X_scaled_df
    model.fit(features, y)
    y_pred = model.predict(features)

    # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so it is not used here.
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)

    # Bin predictions into quartiles by rank (rank breaks ties so the bin edges are unique)
    # and compare against the target quartiles with a macro-averaged F1.
    y_pred_bin = pd.qcut(pd.Series(y_pred).rank(method='first'), q=4, labels=False)
    f1 = f1_score(y_bin, y_pred_bin, average='macro')
    results.append({'Model': name, 'Train RMSE': round(rmse, 2), 'Train R²': round(r2, 3), 'F1 Score': round(f1, 3)})

pd.DataFrame(results).sort_values(by='Train RMSE')