In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

# Read the training and test CSVs from /content.
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# Features: every training column except the row identifier and the target.
X = train_df.drop(columns=['SalePrice', 'Id'])
# Target: log(1 + SalePrice). The model is trained on this scale and its
# predictions are mapped back with np.expm1 before the submission is written.
y = train_df['SalePrice'].pipe(np.log1p)
# Test features: same treatment, minus the target (the test frame has none to drop here).
X_test = test_df.drop(columns='Id')

# Split the feature columns by dtype for the two preprocessing branches.
# 'number' matches every numeric dtype rather than only int64/float64, and the
# categorical branch takes the complement. That way each column of X is routed
# to exactly one transformer; with a hard-coded ['int64', 'float64'] /
# ['object'] split, a column of any other dtype (int32, float32, category,
# bool, pandas' dedicated string dtype, ...) would land in neither list and be
# silently discarded by ColumnTransformer's default remainder='drop'.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

# Preprocessing: one sub-pipeline per column type, applied side by side.

# Numeric branch: replace missing values with the column median, then
# standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: treat a missing value as its own 'missing' level, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on a
# category that was not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch; columns are selected by name.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Gradient-boosted regressor settings, kept together so they are easy to tune.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=0,
    objective='reg:squarederror',
)

# Full model: preprocessing followed by XGBRegressor, wrapped in one Pipeline
# so the preprocessing is re-fitted together with the regressor on every fit().
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(**xgb_params)),
])

# 5-fold cross-validation of the whole pipeline. The scorer returns *negated*
# MSE (higher is better), so the sign is flipped before taking the root.
# Because y is log1p(SalePrice), these RMSE values are on the log scale.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)
rmse_scores = np.sqrt(np.negative(scores))
print(f'RMSE Scores: {rmse_scores}')
print(f'Average RMSE: {rmse_scores.mean()}')

# Train on the full training set and predict the test set in one chain
# (Pipeline.fit returns the fitted pipeline itself).
log_predictions = model.fit(X, y).predict(X_test)

# The model predicts log1p(SalePrice); expm1 maps back to the price scale.
predictions = np.expm1(log_predictions)

# Write the submission: the test-set Id column plus the predicted SalePrice,
# without the DataFrame index.
submission_filename = 'House_prices_submission_new2.csv'
submission = test_df[['Id']].assign(SalePrice=predictions)
submission.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' has been created.")


RMSE Scores: [0.11051181 0.13645081 0.12972704 0.10780001 0.11833545]
Average RMSE: 0.12056502634834339
Submission file 'House_prices_submission_new2.csv' has been created.


In [2]:
# Offer the submission CSV as a browser download when running in Google Colab.
# The google.colab package only exists inside a Colab runtime, so the import is
# guarded: elsewhere the cell reports where the file is instead of raising.
# The filename comes from `submission_filename` (set when the CSV was written)
# rather than a second copy of the literal, so the two cannot drift apart.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Colab; '{submission_filename}' was left in the working directory.")
else:
    files.download(submission_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>