In [56]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

In [57]:
# Read the training and test tables from the CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [58]:
# Keep the label column ('SalePrice') as its own Series, then rebind train_data
# to a copy of the frame without it so only feature columns remain.
target = train_data.loc[:, 'SalePrice']
train_data = train_data.drop('SalePrice', axis=1)

In [59]:
# Numeric features: learn per-column medians from the training frame only, then
# use those same medians to fill missing values in both frames.
num_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
num_imputer = SimpleImputer(strategy='median').fit(train_data[num_cols])
for frame in (train_data, test_data):
    frame[num_cols] = num_imputer.transform(frame[num_cols])

In [60]:
# Categorical (object-dtype) features: the most frequent value of each column is
# learned from the training frame and used to fill gaps in both frames.
cat_cols = train_data.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(train_data[cat_cols])
train_data[cat_cols] = cat_imputer.transform(train_data[cat_cols])
test_data[cat_cols] = cat_imputer.transform(test_data[cat_cols])

In [61]:
# One-hot encode the categorical columns.
#
# Encoding each frame independently with drop_first=True is unsafe: the level
# that gets dropped is the first level *present in that frame*. If the test
# frame lacks the training frame's first level, a different level is dropped
# there, and after aligning columns to the training layout those test rows are
# silently encoded as the training baseline category.
#
# To keep both frames on the same encoding, pin every categorical column to the
# category set observed in the training frame before calling get_dummies.
# Test values never seen in training become missing and therefore encode as
# all-zero indicator columns.
train_data = train_data.astype({col: 'category' for col in cat_cols})
test_data = test_data.astype({col: train_data[col].dtype for col in cat_cols})

train_data = pd.get_dummies(train_data, columns=cat_cols, drop_first=True)
test_data = pd.get_dummies(test_data, columns=cat_cols, drop_first=True)

In [62]:
# Give the test frame exactly the training frame's columns (left join on the
# column axis): columns missing from test are created and filled with 0, and
# columns that exist only in test are discarded.
aligned = train_data.align(test_data, join='left', axis=1, fill_value=0)
train_data, test_data = aligned

In [63]:
# Remove the 'Id' column from both frames so it is not used as a model feature.
train_data, test_data = (
    frame.drop('Id', axis=1) for frame in (train_data, test_data)
)

In [64]:
# Hold out 20% of the labelled rows for validation (seeded for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the remaining 80%.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows: RMSE is the square root of the mean squared error.
y_val_pred = model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = val_mse ** 0.5

print("Validation RMSE:", val_rmse)

Validation RMSE: 28871.65828122992


In [65]:
# Make predictions on the test set
test_predictions = model.predict(test_data)

# Recover the real row identifiers from the raw test file. The 'Id' column was
# removed from test_data before modelling, and deriving Ids from the positional
# index (index + 1) is only correct when the test Ids happen to start at 1 and
# run consecutively. No rows are added or removed from test_data after loading,
# so the file order matches the prediction order.
test_ids = pd.read_csv('test.csv', usecols=['Id'])['Id']

# Create a submission dataframe (pandas raises if the two lengths disagree)
submission = pd.DataFrame({
    'Id': test_ids.to_numpy(),
    'SalePrice': test_predictions
})

# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)

In [66]:
# Offer the submission file as a browser download when running on Google Colab.
# The google.colab package is only importable inside the Colab runtime, so the
# import is guarded to keep a full re-run from failing on other kernels.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of submission.csv")
else:
    files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>