In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error


# Load datasets
train = pd.read_csv('/train (1).csv')
test = pd.read_csv('/test (1).csv')
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Separate target
y = train['SalePrice']
train.drop(['SalePrice'], axis=1, inplace=True)

# Combine datasets
combined = pd.concat([train, test], keys=['train', 'test'])
print("Combined shape:", combined.shape)


# Fill missing values
# Fill categorical NAs with 'None'
cat_features = combined.select_dtypes(include=['object']).columns
combined[cat_features] = combined[cat_features].fillna('None')

# Fill numerical NAs with median
num_features = combined.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='median')
combined[num_features] = imputer.fit_transform(combined[num_features])


# Feature engineering
combined['TotalSF'] = combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF']

# Create age features
combined['HouseAge'] = combined['YrSold'] - combined['YearBuilt']
combined['RemodAge'] = combined['YrSold'] - combined['YearRemodAdd']
combined['GarageAge'] = combined['YrSold'] - combined['GarageYrBlt']
combined.drop(['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], axis=1, inplace=True)


# Ordinal mapping for quality ratings
qual_map = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'None':0}
for col in ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
    combined[col] = combined[col].map(qual_map)


# Convert MSSubClass to string
combined['MSSubClass'] = combined['MSSubClass'].astype(str)

# One-hot encode categorical variables
combined = pd.get_dummies(combined)

# Scale numeric features (recalculate num_features AFTER dropping columns)
num_features = combined.select_dtypes(include=['int64', 'float64']).columns

scaler = StandardScaler()
combined[num_features] = scaler.fit_transform(combined[num_features])


# Split combined back into train and test
train_processed = combined.xs('train')
test_processed = combined.xs('test')
train_processed['SalePrice'] = y

# Save preprocessed data
print("Preprocessing complete.")
train_processed.to_csv('/content/train_preprocessed.csv', index=False)
test_processed.to_csv('/content/test_preprocessed.csv', index=False)
print("Saved train_preprocessed.csv & test_preprocessed.csv")


# Prepare data for modeling
X = train_processed.drop('SalePrice', axis=1)
y = train_processed['SalePrice']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_val)
lr_rmse = np.sqrt(mean_squared_error(y_val, lr_preds))
print(f"Linear Regression RMSE: {lr_rmse:.2f}")


# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_val)
rf_rmse = np.sqrt(mean_squared_error(y_val, rf_preds))
print(f"Random Forest RMSE: {rf_rmse:.2f}")


# XGBoost
xgb = XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=42)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_val)
xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_preds))
print(f"XGBoost RMSE: {xgb_rmse:.2f}")

# Cross-validation
cv_scores = cross_val_score(xgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
print(f"XGBoost CV RMSE: {-cv_scores.mean():.2f}")

# Final predictions
final_preds = xgb.predict(test_processed)
submission = pd.read_csv('/sample_submission.csv')
submission['SalePrice'] = final_preds
submission.to_csv('/sample_submission.csv', index=False)
print("submission.csv saved for kaggle!")


Train shape: (1460, 81)
Test shape: (1459, 80)
Combined shape: (2919, 80)
Preprocessing complete.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_processed['SalePrice'] = y


Saved train_preprocessed.csv & test_preprocessed.csv
Linear Regression RMSE: 30147.53
Random Forest RMSE: 30130.75
XGBoost RMSE: 26834.05
XGBoost CV RMSE: 27914.25
submission.csv saved for kaggle!


In [50]:
from google.colab import files
files.download('/sample_submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>