In [9]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error

# Read the labelled training set from disk.
train_data = pd.read_csv('dataset/train.csv')

# Target column the model learns to predict.
y = train_data['SalePrice']

# Columns used as model inputs; later cells reuse this list so the test
# set is sliced with exactly the same columns in the same order.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = train_data[features]

# Hold out 25% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Fit a random forest on the training portion only (seeded for repeatability).
model = RandomForestRegressor(random_state=1)
model.fit(train_X, train_y)

# Predict the held-out rows.
val_predict = model.predict(val_X)

# Score the predictions. scikit-learn metrics take (y_true, y_pred) in that
# order; plain MAE is symmetric so the value is the same either way, but the
# documented order is kept so the call stays correct if it is ever swapped
# for an asymmetric metric.
mae = mean_absolute_error(val_y, val_predict)
print(f"MAE for validation is {mae:,.0f}")


MAE for validation is 21,857


In [None]:
# Train the final model on every labelled row (no hold-out this time).
# `fit` returns the estimator itself, so construction and training can be
# chained into a single assignment.
model_full_data = RandomForestRegressor(random_state=1).fit(X, y)

# Leave the fitted estimator as the cell's last expression so its repr is shown.
model_full_data


In [None]:
# Read the unlabelled test set.
test_data = pd.read_csv('dataset/test.csv')

# Slice out the same feature columns the model was trained on.
test_X = test_data[features]

# Predict a sale price for every test row with the full-data model.
test_predictions = model_full_data.predict(test_X)

# Pair each test Id with its predicted price: start from the Id column and
# append the predictions as a second column named SalePrice.
output = test_data[['Id']].assign(SalePrice=test_predictions)

# Write the two-column table to disk without the DataFrame index.
output.to_csv('dataset/output.csv', index=False)

# Show summary statistics of what was written.
print(output.describe())
