In [None]:

import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
# NOTE(review): the "ignore" action is installed without a category, so it
# applies to every warning for the rest of the session — this includes any
# deprecation / FutureWarning messages later cells may trigger.
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read the Kaggle training split into a DataFrame
# (adjust the path if the file lives somewhere else)
df = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)

# Report the dimensions, then preview the first rows as the cell output
print("Shape of the dataset:", df.shape)
df.head()


In [None]:
# Count NaNs per column, keep only the columns that have any, largest first
missing_values = (
    df.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print("Columns with missing values:\n", missing_values)

# Summarise how many columns there are per dtype, then show each column's dtype
print("\nData types of columns:\n")
print(df.dtypes.value_counts())
df.dtypes


In [None]:
# Drop 'Id' since it doesn't carry predictive information.
# Reassigning with errors='ignore' (rather than dropping in place) keeps this
# cell idempotent: running it a second time is a no-op instead of a KeyError.
df = df.drop(columns=['Id'], errors='ignore')


In [None]:
# Target column: name of the column that is removed from the feature lists
# below and used as the regression label
target = 'SalePrice'

In [None]:
# Partition the column names by dtype: 64-bit int/float columns on one side,
# object (string) columns on the other
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)


In [None]:
# Exclude the target from the numeric feature list. Rebuilding the list
# (instead of list.remove) keeps the cell re-runnable: remove() raises
# ValueError once the target is already gone.
numerical_cols = [col for col in numerical_cols if col != target]

In [None]:
# Report how many features fall into each dtype group
for label, cols in (
    ("Numerical features:", numerical_cols),
    ("Categorical features:", categorical_cols),
):
    print(label, len(cols))

In [None]:
# Fill missing numerical values with the column median.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form modifies a column selection
# in place, which pandas deprecates and which leaves `df` unchanged when
# Copy-on-Write is active.
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Fill missing categorical values with an explicit 'Missing' level so that
# "no value" becomes its own category when the columns are one-hot encoded
for col in categorical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna('Missing')


In [None]:
# Confirm the imputation left no NaNs behind (prints False when the frame is complete)
remaining_nans = df.isnull().sum().sum()
print("Any missing values left?", remaining_nans > 0)

In [None]:
# Expand every categorical column into indicator columns; drop_first omits the
# first level of each column so the indicators are not perfectly collinear
df_encoded = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
)

print("New shape after encoding:", df_encoded.shape)


In [None]:
from sklearn.preprocessing import StandardScaler

# Split the encoded frame into the target vector and the feature matrix
y = df_encoded['SalePrice']
X = df_encoded.drop('SalePrice', axis=1)

# Bring every feature to zero mean / unit variance.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split done further down, so held-out rows contribute to the
# scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Shape after scaling:", X_scaled.shape)


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Apply PCA without limiting components
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Cumulative share of variance explained by the first k components.
# The curve is plotted against k = 1..n: plotting the array alone would use
# its 0-based index, shifting every point one step left of the component
# count that the x-axis label promises.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs Number of Components')
plt.grid(True)
plt.show()


In [None]:
# Keep 95% of variance: a fractional n_components asks PCA to retain as many
# components as are needed for the cumulative explained variance to reach
# that fraction. This rebinds `pca`, replacing the full decomposition above.
# NOTE(review): the PCA is fitted on all rows of X_scaled, before the
# train/test split in the next cell.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_scaled)

print("Reduced shape after PCA:", X_reduced.shape)


# **Linear Regression with PCA**


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the PCA-reduced rows for evaluation (fixed seed for repeatability).
# The resulting X_train / X_test / y_train / y_test are reused by the next two model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training portion
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score the held-out rows
y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")


# **Random Forest with PCA**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree forest with a fixed seed, fitted on the PCA-reduced training split
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_rf = rf.predict(X_test)

# Error metrics on the original price scale
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print(f"Random Forest RMSE: {rmse_rf:.2f}")
print(f"Random Forest R² Score: {r2_rf:.4f}")


# **XGBoost with PCA**

In [None]:
pip install xgboost


In [None]:
import xgboost as xgb

# Gradient-boosted trees: 100 rounds, depth 6, shrinkage 0.1, fixed seed
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)

# Fit on the PCA-reduced training split
xgb_model.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

# Error metrics on the original price scale
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)

print(f"XGBoost RMSE: {rmse_xgb:.2f}")
print(f"XGBoost R² Score: {r2_xgb:.4f}")


# **Random Forest Regressor without PCA**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Features/target straight from the one-hot encoded frame (no scaling, no PCA)
y_raw = df_encoded['SalePrice']
X_raw = df_encoded.drop('SalePrice', axis=1)

# Same 80/20 split parameters as the PCA experiments
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw,
    y_raw,
    test_size=0.2,
    random_state=42,
)

# 100-tree forest with a fixed seed, fitted on the unreduced features
rf_raw = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_raw.fit(X_train_raw, y_train_raw)

# Predictions for the held-out rows
y_pred_raw = rf_raw.predict(X_test_raw)

# Error metrics on the original price scale
r2_raw = r2_score(y_test_raw, y_pred_raw)
rmse_raw = np.sqrt(mean_squared_error(y_test_raw, y_pred_raw))

print(f"Random Forest (No PCA) RMSE: {rmse_raw:.2f}")
print(f"Random Forest (No PCA) R² Score: {r2_raw:.4f}")


# **Log-Transform the Target Variable**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with KDE overlay) of the untransformed sale prices
ax_raw = sns.histplot(df_encoded['SalePrice'], kde=True)
ax_raw.set_title("Original SalePrice Distribution")
plt.show()

# Store log(1 + price) next to the original column; df_encoded gains a
# 'SalePrice_log' column from here on
df_encoded['SalePrice_log'] = np.log1p(df_encoded['SalePrice'])

# Same histogram for the transformed target
ax_log = sns.histplot(df_encoded['SalePrice_log'], kde=True, color='orange')
ax_log.set_title("Log-Transformed SalePrice Distribution")
plt.show()


# **Feature Engineering**

In [None]:
# House age at sale time, in years
df_encoded['Age'] = df_encoded['YrSold'] - df_encoded['YearBuilt']

# Bin the age into four labelled ranges.
# pd.cut builds right-closed intervals and, by default, leaves the lowest edge
# out, so the first bin would be (0, 10] and a house sold in the year it was
# built (Age == 0) would get NaN. include_lowest=True makes the first bin
# [0, 10]. Ages outside 0..150 still map to NaN.
df_encoded['HouseAgeBin'] = pd.cut(
    df_encoded['Age'],
    bins=[0, 10, 50, 100, 150],
    labels=["New", "Mid", "Old", "Historic"],
    include_lowest=True,
)


# **Using LightGBM**

In [None]:
pip install lightgbm


In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Features: everything in df_encoded except the two target columns.
# Label: the log-transformed price.
y_lgb = df_encoded['SalePrice_log']
X_lgb = df_encoded.drop(['SalePrice', 'SalePrice_log'], axis=1)

# 80/20 split with a fixed seed
X_train_lgb, X_test_lgb, y_train_lgb, y_test_lgb = train_test_split(
    X_lgb,
    y_lgb,
    test_size=0.2,
    random_state=42,
)

# 1000 boosting rounds at learning rate 0.05; max_depth=-1 leaves tree depth unlimited
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42,
)
lgb_model.fit(X_train_lgb, y_train_lgb)

# Predictions come out on the log scale; expm1 undoes the earlier log1p for
# both the predictions and the held-out labels
y_pred_lgb_log = lgb_model.predict(X_test_lgb)
y_test_lgb_final = np.expm1(y_test_lgb)
y_pred_lgb_final = np.expm1(y_pred_lgb_log)

# Error metrics on the original price scale
r2_lgb = r2_score(y_test_lgb_final, y_pred_lgb_final)
rmse_lgb = np.sqrt(mean_squared_error(y_test_lgb_final, y_pred_lgb_final))

print(f"LightGBM RMSE: {rmse_lgb:.2f}")
print(f"LightGBM R² Score: {r2_lgb:.4f}")


# **LightGBM Tuning** 

In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

# Re-split X_lgb / y_lgb. The arguments are identical to the split in the
# previous cell, so the rows held out here are the same ones, now named *_val_*.
X_train_lgb, X_val_lgb, y_train_lgb, y_val_lgb = train_test_split(
    X_lgb,
    y_lgb,
    test_size=0.2,
    random_state=42,
)

# Hand-picked hyperparameters; the number of boosting rounds is fixed for now.
# This rebinds lgb_model, replacing the baseline model from the previous cell.
tuned_params = dict(
    learning_rate=0.03,
    max_depth=10,
    num_leaves=50,
    min_child_samples=10,
    n_estimators=500,
    random_state=42,
)
lgb_model = LGBMRegressor(**tuned_params)

lgb_model.fit(X_train_lgb, y_train_lgb)

# Predict on the log scale, then map predictions and labels back to prices
y_val_pred_log = lgb_model.predict(X_val_lgb)
y_val_true = np.expm1(y_val_lgb)
y_val_pred = np.expm1(y_val_pred_log)

# Error metrics on the original price scale
r2_tuned = r2_score(y_val_true, y_val_pred)
rmse_tuned = np.sqrt(mean_squared_error(y_val_true, y_val_pred))

print(f"\n✅ Tuned LightGBM RMSE: {rmse_tuned:.2f}")
print(f"✅ Tuned LightGBM R² Score: {r2_tuned:.4f}")


In [None]:
import joblib

# Save model: write the most recently fitted `lgb_model` to a pickle file in
# the current working directory (joblib.dump returns the list of written
# filenames, which becomes this cell's output)
joblib.dump(lgb_model, "lightgbm_house_price_model.pkl")



In [None]:

import pandas as pd

# Read the Kaggle test split
test_df = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
)

# Keep the Id column aside; the submission file is keyed on it
test_ids = test_df['Id']

# Quick look at the size and the first rows
print("Test data shape:", test_df.shape)
print("First few rows:", test_df.head(), sep="\n")


In [None]:
# Drop target from train and store original target separately
# (train_features is the one-hot encoded training frame without either target column)
train_features = df_encoded.drop(['SalePrice', 'SalePrice_log'], axis=1)

# Concatenate train and test
# NOTE(review): train_features is already one-hot encoded while test_df is
# still the raw CSV, so the two frames only share part of their columns and
# the rest is NaN-padded. `combined` is not referenced again in the visible
# part of the notebook — confirm whether this cell is still needed.
combined = pd.concat([train_features, test_df], axis=0)

print("Combined shape:", combined.shape)


In [None]:
# Feature 1: Total Square Footage (both floors plus basement)
test_df['TotalSF'] = test_df['1stFlrSF'] + test_df['2ndFlrSF'] + test_df['TotalBsmtSF']

# Feature 2: Total Porch Area (sum of the four porch-type areas)
test_df['TotalPorchSF'] = (
    test_df['OpenPorchSF'] +
    test_df['EnclosedPorch'] +
    test_df['3SsnPorch'] +
    test_df['ScreenPorch']
)

# Feature 3: Age of the house at sale time, in years
test_df['Age'] = test_df['YrSold'] - test_df['YearBuilt']

# Feature 4: Quality × Condition
test_df['QualityXCondition'] = test_df['OverallQual'] * test_df['OverallCond']

# NOTE(review): TotalSF, TotalPorchSF and QualityXCondition are not created on
# the training frame anywhere in the visible notebook, so the later alignment
# to the training columns will discard them — confirm whether they are meant
# to be model inputs.

# Feature 5: House Age Bin.
# pd.cut builds right-closed intervals and, by default, leaves the lowest edge
# out, so the first bin would be (0, 10] and Age == 0 (sold in the year of
# construction) would fall outside every bin. include_lowest=True makes the
# first bin [0, 10].
test_df['HouseAgeBin'] = pd.cut(
    test_df['Age'],
    bins=[0, 10, 50, 100, 150],
    labels=["New", "Mid", "Old", "Historic"],
    include_lowest=True,
)

# Ages that still fall outside 0..150 (e.g. negative) get an explicit "Unknown" level
test_df['HouseAgeBin'] = test_df['HouseAgeBin'].cat.add_categories("Unknown").fillna("Unknown")


In [None]:
# Impute the test set with the same rules that were applied to the training
# frame `df`, using statistics taken from the training data only.
#
# The previous fill source, df_encoded.mode(), is indexed by the one-hot
# encoded column names, so it contains none of the original categorical
# columns (their NaNs were left unfilled) and it filled numeric columns with
# the mode rather than the median used for training.

# Numeric columns: training median
train_medians = df[numerical_cols].median()
test_df[numerical_cols] = test_df[numerical_cols].fillna(train_medians)

# Categorical columns: the explicit 'Missing' level where training has it
# (i.e. the column had NaNs in training); otherwise the most frequent training
# level, because a 'Missing' indicator would not exist among the training columns.
for col in categorical_cols:
    if (df[col] == 'Missing').any():
        fill_value = 'Missing'
    else:
        fill_value = df[col].mode().iloc[0]
    test_df[col] = test_df[col].fillna(fill_value)

# Confirm no missing values remain (columns outside the two lists above,
# such as derived features, are not touched here and will show up if they
# contain NaNs)
print("Remaining missing values in test_df:")
print(test_df.isnull().sum().loc[lambda x: x > 0])


In [None]:
# One-hot encode test_df
test_df_encoded = pd.get_dummies(test_df)

# 'HouseAgeBin' is a single categorical column in X_lgb, but get_dummies has
# just expanded the test-side column into HouseAgeBin_* indicators. Without
# the step below, the reindex would recreate 'HouseAgeBin' as a constant 0 for
# every test row. Encode it instead with the integer codes of the training
# categories, using NaN for levels the training data does not have
# (e.g. "Unknown").
# NOTE(review): this assumes the prediction step feeds LightGBM a plain numeric
# matrix, where a categorical feature is given by its category code — confirm
# against the LightGBM categorical-feature documentation.
if (
    'HouseAgeBin' in test_df.columns
    and 'HouseAgeBin' in X_lgb.columns
    and isinstance(X_lgb['HouseAgeBin'].dtype, pd.CategoricalDtype)
):
    train_age_bins = X_lgb['HouseAgeBin'].cat.categories
    age_bin_codes = pd.Categorical(
        test_df['HouseAgeBin'].astype(object),
        categories=train_age_bins,
    ).codes.astype('float64')
    age_bin_codes[age_bin_codes < 0] = np.nan  # code -1 marks "not a training level"
    test_df_encoded['HouseAgeBin'] = age_bin_codes

# Align the test set's columns with the model's training features:
# extra test-only columns are dropped, indicators absent from the test set are
# added as all-zero columns, and the column order follows X_lgb.
X_test_kaggle = test_df_encoded.reindex(columns=X_lgb.columns, fill_value=0)

# Confirm shape match
print("Test input shape after encoding and alignment:", X_test_kaggle.shape)
print("Model expected input shape:", X_lgb.shape[1])


In [None]:
# Sanity check: the aligned test frame must carry exactly the training
# columns, in the same order
print("Columns match:", X_test_kaggle.columns.tolist() == X_lgb.columns.tolist())
print("Test shape:", X_test_kaggle.shape)
print("Train shape:", X_lgb.shape)


In [None]:
import numpy as np
import pandas as pd

# 1. Hand the model a bare NumPy matrix: .values drops the index and the
#    column labels of the aligned test frame
X_test_np = X_test_kaggle.values

# 2. Predict log(1 + SalePrice) with the fitted LGBMRegressor
y_test_pred_log = lgb_model.predict(X_test_np)

# 3. Undo the log1p transform to get prices
y_test_pred = np.expm1(y_test_pred_log)

# 4. Build the two-column Kaggle submission and write it without the index
submission = pd.DataFrame(
    {
        "Id": test_ids,
        "SalePrice": y_test_pred,
    }
)
submission.to_csv("submission.csv", index=False)

print("✅ Final submission.csv file created successfully!")
