In [3]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the training and test data
train_data = pd.read_csv('/workspaces/Housing-Sales/train.csv')
test_data = pd.read_csv('/workspaces/Housing-Sales/test.csv')

# Fill missing 'LotFrontage' values with the TRAINING median in both frames,
# so the test set is transformed with a statistic learned from training data
# only (the same rule the fitted pipelines follow for every other column).
lot_frontage_median = train_data['LotFrontage'].median()
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(lot_frontage_median)
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(lot_frontage_median)

# Drop columns that have too much missing data or aren't useful for the model.
# errors='ignore' keeps this a no-op for any column that is already absent.
columns_to_drop = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
train_data = train_data.drop(columns_to_drop, axis=1, errors='ignore')
test_data = test_data.drop(columns_to_drop, axis=1, errors='ignore')

# Convert categorical variables into dummy variables
# NOTE(review): the two frames are encoded independently with drop_first=True,
# so a category level missing from one frame can change which level is dropped
# as the baseline there. Confirm the level sets match, or encode both frames
# against a shared set of categories.
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Split the data into features (inputs) and target (output)
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

# Ensure that the test dataset has exactly the same FEATURE columns as the
# training features, in the same order. Aligning against X (not train_data)
# keeps the target 'SalePrice' out of the test matrix; dummy columns the test
# set lacks are added as all-zero, and columns unseen in training are dropped.
test_data = test_data.reindex(columns=X.columns, fill_value=0)

# Split the data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models to train
def _build_pipeline(estimator, scale=False):
    """Wrap *estimator* in a Pipeline that median-imputes first.

    The steps are named 'imputer', optionally 'scaler' (a StandardScaler,
    added when *scale* is true), and 'model' for the estimator itself.
    """
    steps = [('imputer', SimpleImputer(strategy='median'))]
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('model', estimator))
    return Pipeline(steps)


# Candidate regressors keyed by display name. Tree ensembles are only imputed;
# the linear and neighbour-based models are additionally standardised.
# HistGradientBoostingRegressor is used bare, without a Pipeline wrapper.
models = {
    'RandomForest': _build_pipeline(
        RandomForestRegressor(n_estimators=100, random_state=42)
    ),
    'GradientBoosting': _build_pipeline(
        GradientBoostingRegressor(n_estimators=100, random_state=42)
    ),
    'HistGradientBoosting': HistGradientBoostingRegressor(random_state=42),
    'LinearRegression': _build_pipeline(LinearRegression(), scale=True),
    'Ridge': _build_pipeline(Ridge(), scale=True),
    'Lasso': _build_pipeline(Lasso(alpha=0.01), scale=True),
    'ElasticNet': _build_pipeline(ElasticNet(alpha=0.01), scale=True),
    'KNeighbors': _build_pipeline(KNeighborsRegressor(), scale=True),
}

# Train each model and calculate performance metrics on the validation split
model_performance = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # RMSE is the square root of the MSE; `** 0.5` is used so this cell does
    # not depend on a numpy import that it never performs.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    r2 = r2_score(y_val, y_pred)
    model_performance[name] = {'rmse': rmse, 'r2': r2}

# Select the best model based on RMSE (lowest validation error wins)
best_model_name = min(model_performance, key=lambda k: model_performance[k]['rmse'])
# Keep the whole fitted object rather than only its final 'model' step: for
# the pipelines, the fitted imputer/scaler must also be applied to the test
# data, and the bare HistGradientBoostingRegressor entry cannot be indexed.
best_model = models[best_model_name]

# Predict on the test set using the best model. The test frame is restricted
# to the columns the models were fitted on, in the same order, so stray
# columns (e.g. a placeholder target column) never reach the estimator.
test_features = test_data.reindex(columns=X_train.columns, fill_value=0)
test_predictions = best_model.predict(test_features)

# Save the predictions to a CSV file. Prefer the identifiers stored in the
# test data; fall back to the positional numbering starting at 1461 only when
# no 'Id' column is available.
if 'Id' in test_data.columns:
    submission_ids = test_data['Id']
else:
    submission_ids = test_data.index + 1461
submission = pd.DataFrame({'Id': submission_ids, 'SalePrice': test_predictions})
submission.to_csv('/workspaces/Housing-Sales/submission.csv', index=False)


ModuleNotFoundError: No module named 'pandas'