## Regression Analysis

### Step 1: Load the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')

### Step 2: Load the Data
We will read the training and test data.

In [None]:
# Read both splits from CSV files located relative to the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Report the (rows, columns) shape of each frame.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Last expression of the cell: the first rows of the training frame.
train.head()

### Step 3: Data Exploration
Let's look at the summary of the dataset.

In [None]:
# Print a per-column summary of the training frame.
train.info()
# Last expression of the cell: descriptive statistics of the training frame.
train.describe()

### Step 4: Data Cleaning
We will drop every column in which more than 40% of the training values are missing.

In [None]:
# Count missing values per training column, then flag every column whose
# count exceeds 40% of the number of training rows.
missing = train.isnull().sum()
max_missing = 0.4 * len(train)
drop_cols = [col for col, n_missing in missing.items() if n_missing > max_missing]

# Remove the flagged columns from both frames in place so that train and
# test keep the same set of columns.
for frame in (train, test):
    frame.drop(columns=drop_cols, inplace=True)

### Step 5: Handle Missing Values
We will impute missing values: numerical columns with the median and categorical columns with the most frequent value.

In [None]:
# Split the columns by dtype. The identifier and the target are excluded
# from the numeric list so they are never imputed.
cat_cols = train.select_dtypes(include='object').columns
numeric_frame = train.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_frame.columns.drop(['Id', 'SalePrice'])

# Fit both imputers on the training frame only: median for numeric columns,
# most frequent value for categorical columns.
num_imputer = SimpleImputer(strategy='median').fit(train[num_cols])
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train[cat_cols])

# Apply the fitted imputers to train and test alike.
for frame in (train, test):
    frame[num_cols] = num_imputer.transform(frame[num_cols])
    frame[cat_cols] = cat_imputer.transform(frame[cat_cols])

### Step 6: Encoding Categorical Features
We will use one-hot encoding.

In [None]:
# One-hot encode the training frame; drop_first removes the first level of
# every categorical column.
train_encoded = pd.get_dummies(train, drop_first=True)

# Encode the test frame against the TRAINING category levels.
# Encoding test independently with drop_first=True drops *test's* first
# level, which differs from train's whenever test lacks that level; the
# reindex below would then silently zero an indicator that should be 1.
# Pinning the categories makes get_dummies drop the same level in both
# frames. Values unseen in train become NaN and encode as all zeros.
test_aligned = test.copy()
for col in train.select_dtypes(include='object').columns:
    if col in test_aligned.columns:
        train_levels = pd.Categorical(train[col]).categories
        test_aligned[col] = pd.Categorical(test_aligned[col], categories=train_levels)
test_encoded = pd.get_dummies(test_aligned, drop_first=True)

# Align columns: the feature matrix excludes the target and the identifier,
# and the test matrix is forced onto exactly the training feature columns.
X = train_encoded.drop(columns=['SalePrice', 'Id'])
y = train_encoded['SalePrice']
X_test = test_encoded.reindex(columns=X.columns, fill_value=0)

### Step 7: Feature Scaling
We will standardize the features.

In [None]:
# Fit the scaler on the training features only, then apply the same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X)
X_scaled, X_test_scaled = scaler.transform(X), scaler.transform(X_test)

### Step 8: Train Models
We will fit several models and compare their RMSE and R² on the training data. These in-sample scores are optimistic and do not estimate generalization error.

In [None]:
# Candidate regressors. Both tree ensembles get a fixed random_state so the
# results table is reproducible from run to run.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.01),
    'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    ),
}

# Fit every model on the scaled training features and score it on the SAME
# data, so RMSE and R2 below are in-sample metrics.
results = []
for name, model in models.items():
    model.fit(X_scaled, y)
    preds = model.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)
    results.append({'Model': name, 'RMSE': round(rmse, 2), 'R2': round(r2, 3)})

# Last expression of the cell: one row per model.
pd.DataFrame(results)

### Step 9: Extended Regression Models with Evaluation
Now we'll evaluate several regression techniques including:
- Linear Regression
- Polynomial Regression (degree 2)
- Lasso Regression
- Ridge Regression

We'll evaluate using:
- RMSE on training data
- R² on training data
- 5-fold Cross-Validated RMSE

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Degree-2 polynomial expansion (without a bias column) feeding a linear
# regression, wrapped in one estimator.
poly_regression = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('linreg', LinearRegression()),
])

models = {
    'Linear Regression': LinearRegression(),
    'Polynomial Regression (deg 2)': poly_regression,
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.01),
}

results = []
for name, model in models.items():
    # In-sample metrics: fit and predict on the same scaled training data.
    model.fit(X_scaled, y)
    preds = model.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)

    # 5-fold cross-validation. The scorer is the negated RMSE, so the sign
    # is flipped to report a positive error.
    fold_scores = cross_val_score(
        model, X_scaled, y, cv=5, scoring='neg_root_mean_squared_error'
    )
    cv_rmse = -fold_scores.mean()

    row = {
        'Model': name,
        'Train RMSE': round(rmse, 2),
        'Train R²': round(r2, 3),
        'CV RMSE': round(cv_rmse, 2),
    }
    results.append(row)

# Last expression of the cell: models ordered by cross-validated RMSE.
pd.DataFrame(results).sort_values(by='CV RMSE')

### Step 10: Feature Correlation & Sanity Checks
We will perform sanity checks and analyze correlations between features.
- High correlation between features may introduce multicollinearity
- We'll use a heatmap to identify such cases and drop redundant features

In [None]:
# Pairwise correlations between the scaled features, labelled with the
# original feature names.
scaled_frame = pd.DataFrame(X_scaled, columns=X.columns)
corr_matrix = scaled_frame.corr()

# Heatmap of the full feature-by-feature correlation matrix, with the
# colour scale centred on zero.
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'shrink': .5},
)
plt.title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Flag every feature whose absolute correlation with an earlier feature
# exceeds the threshold, then drop the flagged features.
threshold = 0.95

# Keep only the strict upper triangle so each feature pair is inspected once
# and the diagonal is ignored.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is flagged when any entry above the diagonal exceeds the threshold.
exceeds_threshold = (upper.abs() > threshold).any(axis=0)
high_corr_features = upper.columns[exceeds_threshold].tolist()
print("Dropping correlated features:", high_corr_features)

scaled_frame = pd.DataFrame(X_scaled, columns=X.columns)
X_reduced = scaled_frame.drop(columns=high_corr_features)

### Step 11: Dimensionality Reduction (Optional PCA)
PCA helps to reduce dimensionality while preserving variance.
We will check how many components explain 95% of the variance.

In [None]:
# PCA is not imported anywhere else in the notebook, so import it here.
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 requests enough components to reach
# that fraction of explained variance (here 95%).
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_reduced)
print(f"Original shape: {X_reduced.shape}, PCA shape: {X_pca.shape}")

# Plot explained variance. The x axis starts at 1 so that each point reads
# as "variance explained by the first k components".
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(8, 4))
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA - Explained Variance')
plt.grid(True)
plt.show()

### Step 12: Feature Importance (from Random Forest)
Let's visualize which features are most important using a trained Random Forest model.

In [None]:
# Fit a random forest on the reduced feature set (fit returns the estimator).
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42).fit(X_reduced, y)

# Pair each importance score with its feature name and keep the 15 largest.
importances = pd.Series(rf.feature_importances_, index=X_reduced.columns)
ranked_importances = importances.sort_values(ascending=False)
top_features = ranked_importances.head(15)

# Horizontal bar chart: importance on the x axis, feature name on the y axis.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_features.values, y=top_features.index)
plt.title('Top 15 Feature Importances (Random Forest)')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()