In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Read the train and test CSV files from the local ./data directory.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

# Report the (rows, columns) shape of each freshly loaded frame.
for label, frame in (("Train Shape:", train), ("Test Shape:", test)):
    print(label, frame.shape)

# Count NaNs per training column, keep only the columns that have any, and
# order them from most to fewest missing entries.
null_counts = train.isnull().sum()
missing_values = null_counts[null_counts > 0].sort_values(ascending=False)
print("Missing values:\n", missing_values)

# Columns missing in more than 80% of the training rows are removed from both
# frames; the cutoff is computed from train only.
sparse_cutoff = 0.8 * len(train)
drop_cols = missing_values[missing_values > sparse_cutoff].index
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

# Split the training columns by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_cols = train.select_dtypes(include='object').columns
num_cols = train.select_dtypes(exclude='object').columns


def _fill_from_train(columns, statistic):
    """Fill NaNs column by column with a value computed from the train column.

    Args:
        columns: Column labels to process; each must exist in ``train``.
        statistic: Callable taking the train column (a Series) and returning
            the value used to replace NaNs.

    The same fill value is applied to ``test`` whenever the column exists
    there, so test never contributes to the statistic.
    """
    for name in columns:
        fill_value = statistic(train[name])
        train[name] = train[name].fillna(fill_value)
        if name in test.columns:
            test[name] = test[name].fillna(fill_value)


# Categorical columns: first entry returned by mode() on the train column.
_fill_from_train(cat_cols, lambda series: series.mode()[0])

# Numeric columns: median of the train column.
_fill_from_train(num_cols, lambda series: series.median())

# Derive the same two features on both frames:
#   TotalSF  - sum of the TotalBsmtSF, 1stFlrSF and 2ndFlrSF columns
#   HouseAge - YrSold minus YearBuilt
for frame in (train, test):
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
    frame['HouseAge'] = frame['YrSold'] - frame['YearBuilt']

# Label-encode every categorical column: fit on train, then reuse the resulting
# label -> integer mapping for test so both frames share the same codes.
encoder = LabelEncoder()
# The shared encoder is re-fit on each column, so only the last column's
# classes_ survive on it; keep every column's mapping here for later decoding.
label_mappings = {}
for col in cat_cols:
    train[col] = encoder.fit_transform(train[col])
    # classes_ is sorted and the integer code is the position within it, so
    # enumerate() reproduces exactly what encoder.transform() would return.
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    label_mappings[col] = code_by_label
    if col in test.columns:
        # One vectorised dict lookup per column instead of an
        # encoder.transform() call (plus a linear membership scan) per row.
        # Labels never seen in train map to NaN and become the -1 sentinel.
        test[col] = test[col].map(code_by_label).fillna(-1).astype('int64')

# Every numeric column except Id and SalePrice is scaled, together with the two
# engineered features (which were created after num_cols was computed).
not_scaled = ('Id', 'SalePrice')
features_to_scale = [name for name in num_cols if name not in not_scaled] + ['TotalSF', 'HouseAge']

# Fit the scaler on train only, then apply the same transform to both frames.
scaler = StandardScaler().fit(train[features_to_scale])
train[features_to_scale] = scaler.transform(train[features_to_scale])
test[features_to_scale] = scaler.transform(test[features_to_scale])

# Summarise the processed frames.
for caption, frame in (("Final Train Shape:", train), ("Final Test Shape:", test)):
    print(caption, frame.shape)
print("Train Columns:\n", train.columns)

# Modelling inputs: SalePrice is the target; the feature matrix is train
# without the Id and SalePrice columns.
y = train['SalePrice']
X = train.drop(columns=['Id', 'SalePrice'])


Train Shape: (1460, 81)
Test Shape: (1459, 80)
Missing values:
 PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64
Final Train Shape: (1460, 79)
Final Test Shape: (1459, 78)
Train Columns:
 Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond

In [10]:
# Import model and evaluation libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Step 1: Hold out 20% of the training rows for validation (fixed seed so the
# split is reproducible).
# NOTE: X was imputed and scaled with statistics computed over all training
# rows before this split, so the validation score below is slightly optimistic.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Train a Linear Regression model on the training portion
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Step 3: Predict on the validation set and report RMSE
val_preds = lr_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"Validation RMSE: {rmse:.2f}")

# Step 4: Refit on the full training data and predict on the test set.
# Select the test features by name, in the exact order used for fitting,
# rather than relying on "everything except Id" happening to line up with X.
# A column missing from test raises a KeyError here instead of silently
# producing a misaligned feature matrix.
lr_model.fit(X, y)
X_test = test[X.columns]
test_preds = lr_model.predict(X_test)

# Step 5: Create submission DataFrame (one predicted SalePrice per test Id)
submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": test_preds
})




Validation RMSE: 34286.97
