In [None]:
import pandas as pd
import joblib
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Load data.
# train_data supplies the imputation statistics used below, test_data_processed_raw
# holds the rows to score, and sample_submission is the frame that receives the
# predictions. read_csv already returns a fresh DataFrame, so no extra copy is made.
train_data = pd.read_csv('../data/processed/processed_train_data.csv')
test_data_processed_raw = pd.read_csv('../data/processed/processed_test_data.csv')
sample_submission = pd.read_csv('../data/raw/sample_submission.csv')

# Load the trained model.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model artifacts that come from a trusted source.
stacked_extra_trees_regressor_model = joblib.load('../models/stacked_extra_trees_regressor_model.pkl')

# Define top 20 features
top_20_features = [
    'Overall_Quality_Impact', 'Overall_Quality', 'Neighborhood_avg_price', 'TotalBsmtSF',
    'BsmtQual_to_BsmtFinSF', 'TotalOutdoorArea', 'OverallQual', '2ndFlrSF', '1stFlrSF',
    'BsmtFinSF1', 'YearBuilt', 'LotArea', 'Garage_Capacity_per_Square_Meter',
    'FireplaceQu_OverallQuality_Interaction', 'GrLivArea', 'GarageCars', 'OpenPorchSF',
    'PavedDrive_LotFrontage_Interaction', 'BsmtUnfSF', 'YearRemodAdd'
]

# Select only the top 20 features from the test dataset (column order follows the list)
test_data = test_data_processed_raw[top_20_features]

# Handle missing values: fill NaNs with per-column means.
# The means are learned from the training data and only applied to the test data, so
# the imputed values do not depend on which test rows happen to be present.
# NOTE(review): assumes processed_train_data.csv contains every column in
# top_20_features; confirm against the preprocessing notebook.
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_data[top_20_features])
test_data_imputed = imputer.transform(test_data)

# Predict SalePrice using the loaded model
predictions = stacked_extra_trees_regressor_model.predict(test_data_imputed)

# Fail early with a clear message if the submission template and the scored rows
# do not line up one-to-one.
if len(predictions) != len(sample_submission):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(sample_submission)} submission rows."
    )

# Add the predictions to the sample_submission dataframe
sample_submission['SalePrice'] = predictions

# Save the results to a new CSV file (index=False keeps the row index out of the file)
sample_submission.to_csv('../submission/test_predictions.csv', index=False)

print("Predictions have been saved successfully.")

In [None]:
# Display the column labels of sample_submission as the cell output, as a quick
# check of the frame's layout.
sample_submission.columns

In [None]:
# Re-load the saved predictions file.
# The file is read without index_col so that every column stays a regular column.
# Promoting the first column to the index would make a later to_csv(index=False)
# silently drop it from the written file.
sample_submission = pd.read_csv('../submission/test_predictions.csv')

# Remove the stray 'Unnamed: 0' column that appears when a CSV was written together
# with its row index; a file without that column is left untouched.
if 'Unnamed: 0' in sample_submission.columns:
    sample_submission = sample_submission.drop(columns=['Unnamed: 0'])


In [None]:
# Display the column labels of the re-loaded sample_submission frame as the cell
# output, to verify its layout before it is written back to disk.
sample_submission.columns

In [None]:
# Write the submission frame back to disk, overwriting the earlier file.
# The row index is left out so only the frame's columns end up in the CSV.
sample_submission.to_csv(
    path_or_buf='../submission/test_predictions.csv',
    index=False,
)