# In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Load the training data
train_data = pd.read_csv(r'C:\Users\jorda\Documents\studies\DScourse\CourseMaterials\Data\home-data-for-ml-course\train.csv')

# Stage 1: Encode Categorical Variables
# Every object-dtype column is treated as categorical.
categorical_cols = [col for col in train_data.columns if train_data[col].dtype == 'object']

# Applying One-Hot Encoding
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to the
# legacy one on older installations. Either way a dense array is produced.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoded_vars = onehot_encoder.fit_transform(train_data[categorical_cols])
# Carry over the source index so the concat below aligns row-for-row even if
# train_data ever stops using a default RangeIndex.
encoded_vars_df = pd.DataFrame(
    encoded_vars,
    columns=onehot_encoder.get_feature_names_out(categorical_cols),
    index=train_data.index,
)

# Drop original categorical columns and concatenate encoded variables
train_data = train_data.drop(categorical_cols, axis=1)
train_data = pd.concat([train_data, encoded_vars_df], axis=1)

# Stage 2: Handle Missing Values
# Fill gaps one column at a time: int64/float64 columns get their median,
# any other dtype gets its most frequent value. Complete columns are skipped.
for col in train_data.columns:
    column = train_data[col]
    if not column.isnull().any():
        continue

    if column.dtype in ['int64', 'float64']:
        strategy = 'median'
    else:
        strategy = 'most_frequent'

    imputer = SimpleImputer(strategy=strategy)
    train_data[col] = imputer.fit_transform(train_data[[col]])

# Stage 3: Deal with Outliers in 'LotArea'
# 1.5 * IQR rule: rows whose LotArea lies more than 1.5 interquartile ranges
# below the first quartile or above the third quartile are discarded.
Q1, Q3 = train_data['LotArea'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Removing outliers (both bounds are inclusive)
train_data = train_data[train_data['LotArea'].between(lower_bound, upper_bound)]

# Preparing data for Random Forest model
# 'SalePrice' is used as the target; it and the 'Id' column are excluded from
# the feature matrix.
y = train_data['SalePrice']
X = train_data.drop(columns=['SalePrice', 'Id'])

# Train a Random Forest model (fixed seed for reproducible results)
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X, y)

# Load the test data
test_data = pd.read_csv(r'C:\Users\jorda\Documents\studies\DScourse\CourseMaterials\Data\home-data-for-ml-course\test.csv')
test_data_original = test_data.copy()  # Copy to preserve original IDs

# Repeat preprocessing for test data
# Encode with the encoder that was fitted on the training data so the test
# features mean exactly what the model was trained on. (pd.get_dummies with
# drop_first=True dropped the first level of every categorical, which the
# reindex below then recreated as an all-zero column.) Categories unseen during
# fitting become all-zero rows because the encoder uses handle_unknown='ignore'.
test_encoded_df = pd.DataFrame(
    onehot_encoder.transform(test_data[categorical_cols]),
    columns=onehot_encoder.get_feature_names_out(categorical_cols),
    index=test_data.index,
)
test_data = pd.concat([test_data.drop(categorical_cols, axis=1), test_encoded_df], axis=1)

# Match the training feature set and column order; this also drops 'Id'.
test_data = test_data.reindex(columns=X.columns, fill_value=0)

# Impute missing values in the test data
# Use the medians of the training feature matrix rather than statistics fitted
# on the test set, so the fill values come from the data the model learned from.
test_data = test_data.fillna(X.median())

# Make predictions on the test data
predictions = model.predict(test_data)

# Prepare submission file
# Pair each original test-set Id with its predicted SalePrice.
submission = test_data_original[['Id']].assign(SalePrice=predictions)

# Save submission file (row index is not written)
submission.to_csv('submission_stage1-3.csv', index=False)

