In [1]:
import os

# Resolve the project folder used by the relative paths in this notebook.
# `__file__` is not defined inside a Jupyter kernel, and quoting it
# (os.path.abspath("__file__")) only resolves the literal string against the
# current working directory.  The kernel's working directory is therefore
# used explicitly: start the notebook from the project folder.
notebook_dir = os.getcwd()

# Later cells read from `data/` and `artifacts/` relative to this folder;
# warn early if the kernel was started somewhere else.
missing_dirs = [
    name for name in ("data", "artifacts")
    if not os.path.isdir(os.path.join(notebook_dir, name))
]
if missing_dirs:
    print("Warning: expected project folders not found:", missing_dirs)

# Verify
print("Current working directory:", os.getcwd())

Current working directory: /home/j/jl1416/sta521/521PredictionProject-1


In [2]:
import pandas as pd
from sklearn.preprocessing import TargetEncoder  
from xgboost import XGBClassifier

# -----------------------------
# Load cleaned data
# -----------------------------
data = pd.read_csv('data/cleaned/evaluation_cleaned.csv')

# Separate the identifier column from the features: `ids` is kept for the
# output table, `X` holds every remaining column.
id_column = 'building_id'
ids = data[id_column]
X = data.drop(columns=[id_column])

# Geographic columns that are passed through the target encoder
geo_target = [
    'geo__geo_level_2_id',
    'geo__geo_level_3_id',
]

In [3]:
import pickle
# Restore the final XGBoost model from disk.
# NOTE: unpickling executes arbitrary code -- only load artifacts you trust.
with open("artifacts/final_model_xgb.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Show the parameters carried by the restored model
best_params = model.get_params()
print(best_params)

# Restore the target encoder stored next to the model
with open("artifacts/final_target_encoder.pkl", "rb") as encoder_file:
    te_final = pickle.load(encoder_file)

{'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.5025969909985456, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'mlogloss', 'feature_types': None, 'feature_weights': None, 'gamma': 0.41127960629317606, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.05663399296149698, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 9, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 1258, 'n_jobs': 1, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': 1.0712376905472907, 'reg_lambda': 0.7143681038483615, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.8519852631186717, 'tree_method': None, 'validate_parameters': None, 'verbosity'

In [4]:
# -----------------------------
# Transform evaluation data based on trained transformer
# -----------------------------
# Encode the geographic columns with the loaded encoder and wrap the result
# in a DataFrame that shares X's row index.
encoded_names = te_final.get_feature_names_out(geo_target)
X_enc = pd.DataFrame(
    te_final.transform(X[geo_target]),
    index=X.index,
    columns=encoded_names,
)

# Swap the raw geographic columns for their encoded counterparts; the
# encoded columns are appended to the right of the remaining features.
X_full = pd.concat([X.drop(columns=geo_target), X_enc], axis=1)

# -----------------------------
# Generate predictions
# -----------------------------
# The predicted labels are shifted up by one.
# NOTE(review): presumably the model's classes are 0-based while the
# reported labels start at 1 -- confirm against the training notebook.
y_pred = model.predict(X_full) + 1

In [5]:
import numpy as np

# Tally how many rows were assigned to each predicted label
unique_values, counts = np.unique(y_pred, return_counts=True)

# First line: the distinct labels; second line: their frequencies
print(unique_values, counts, sep="\n")

[1 2 3]
[ 4544 39309 16748]


In [6]:
# Pair every building identifier with its predicted label
prediction_columns = {'Building ID': ids, 'Prediction': y_pred}
df_preds = pd.DataFrame(prediction_columns)

In [7]:
# Display the assembled prediction table as this cell's output
df_preds

Unnamed: 0,Building ID,Prediction
0,28830,2
1,94947,3
2,333020,2
3,728451,2
4,441126,2
...,...,...
60596,858025,1
60597,695987,2
60598,827012,2
60599,688636,2


In [8]:
# Write the prediction table to disk without the DataFrame index.
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists first (a no-op when it is already there).
os.makedirs("data/preds", exist_ok=True)
df_preds.to_csv("data/preds/predictions.csv", index=False)