In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import r2_score
from scipy import stats
import numpy as np

# Load the dataset
house_price = pd.read_csv('train.csv')
house_price.to_csv('house_price.csv', index=False)

# 1. Number of columns
num_columns = house_price.shape[1]
print(f"Number of columns: {num_columns}")

# 2. Missing values
missing_values = house_price.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Separate numerical and categorical columns
numerical_cols = house_price.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = house_price.select_dtypes(include=['object']).columns

# Fill missing values for numerical columns with the mean
house_price[numerical_cols] = house_price[numerical_cols].fillna(house_price[numerical_cols].mean())

# Fill missing values for categorical columns with the mode (modified to avoid the warning)
for col in categorical_cols:
    house_price[col] = house_price[col].fillna(house_price[col].mode()[0])

# Verify if all missing values have been handled
missing_values_after = house_price.isnull().sum()
print("Missing values after handling:\n", missing_values_after)

# 3. Encoding Categorical Variables
for col in categorical_cols:
    if house_price[col].nunique() <= 10:  # Example threshold for choosing One-Hot encoding
        house_price = pd.get_dummies(house_price, columns=[col], drop_first=True)
    else:
        le = LabelEncoder()
        house_price[col] = le.fit_transform(house_price[col])

# 4. Scaling Numerical Features
scaler = MinMaxScaler()
house_price[numerical_cols] = scaler.fit_transform(house_price[numerical_cols])

# 5. Splitting the Dataset
X = house_price.drop('SalePrice', axis=1)
y = house_price['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Ridge Regression Model
ridge_model = Ridge(alpha=10)
ridge_model.fit(X_train, y_train)

# 7. Evaluating Ridge Regression
y_pred_ridge = ridge_model.predict(X_test)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f"R² score for Ridge Regression: {r2_ridge}")

# 8. Lasso Regression Model
lasso_model = Lasso(alpha=10)
lasso_model.fit(X_train, y_train)

# 9. Evaluating Lasso Regression
y_pred_lasso = lasso_model.predict(X_test)
r2_lasso = r2_score(y_test, y_pred_lasso)
print(f"R² score for Lasso Regression: {r2_lasso}")

# 10. R² Score Comparison (assuming MLR score is available)
r2_mlr = 0.75  # From previous analysis
print(f"R² score comparison:")
print(f"Ridge Regression: {r2_ridge}")
print(f"Lasso Regression: {r2_lasso}")
print(f"Multiple Linear Regression: {r2_mlr}")

# 11. Ridge and Lasso Regression with Cross-Validation for Optimal Alpha
alphas = [0.1, 1, 10, 100]

# RidgeCV for optimal alpha
ridge_cv = RidgeCV(alphas=alphas)
ridge_cv.fit(X_train, y_train)
optimal_alpha_ridge = ridge_cv.alpha_

# LassoCV for optimal alpha
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train)
optimal_alpha_lasso = lasso_cv.alpha_

# Print optimal alphas
print(f"Optimal alpha for Ridge: {optimal_alpha_ridge}")
print(f"Optimal alpha for Lasso: {optimal_alpha_lasso}")

# 12. Outlier Removal using Z-Scores
numerical_cols_train = X_train.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numerical_cols_train))
outliers = (z_scores > 3).any(axis=1)

X_train_cleaned = X_train[~outliers]
y_train_cleaned = y_train[~outliers]

# Retrain Ridge and Lasso with optimized data
ridge_optimized = Ridge(alpha=optimal_alpha_ridge)
ridge_optimized.fit(X_train_cleaned, y_train_cleaned)

lasso_optimized = Lasso(alpha=optimal_alpha_lasso)
lasso_optimized.fit(X_train_cleaned, y_train_cleaned)

# 13. Optimized Model Evaluation
r2_ridge_optimized = r2_score(y_test, ridge_optimized.predict(X_test))
r2_lasso_optimized = r2_score(y_test, lasso_optimized.predict(X_test))

# Print the optimized R² scores
print(f"Optimized R² score for Ridge Regression: {r2_ridge_optimized}")
print(f"Optimized R² score for Lasso Regression: {r2_lasso_optimized}")


Number of columns: 81
Missing values in each column:
 Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64
Missing values after handling:
 Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64
R² score for Ridge Regression: 0.8365697082847783
R² score for Lasso Regression: -0.0008824918802494697
R² score comparison:
Ridge Regression: 0.8365697082847783
Lasso Regression: -0.0008824918802494697
Multiple Linear Regression: 0.75
Optimal alpha for Ridge: 10.0
Optimal alpha for Lasso: 0.1
Optimized R² score for Ridge Regression: 0.8142332448284711
Optimized R² score for Lasso Regression: 0.011807146042154382
