# USCensus Model Inference

This notebook runs inference with each fitted RFR model on each cleaned version of the test set for train-test split number 0 of the USCensus data. To compute results for a different train-test split, change the 'train_test_split_num' variable.

To evaluate other model types, replace 'rfr' with 'xgb' or 'gbr' in the model and results file names.

In [None]:
import warnings
import numpy as np 
import pandas as pd
from sklearn.metrics import r2_score
import joblib
from joblib import Parallel, delayed

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Index of the train-test split whose artefacts are loaded in this notebook.
train_test_split_num = 0

# Pickled collection of cleaned test sets for the selected split.
testing_datasets = pd.read_pickle(f'uscensus_cleaned_test_df_{train_test_split_num}.pkl')

# Load one stored model per entry of `testing_datasets`, keyed by model name.
compressed_models = {}

for i in range(len(testing_datasets)):
    model_key = f'uscensus_rfr_{train_test_split_num}_{i}'
    loaded_rfr = joblib.load(f'{model_key}_compressed.joblib')
    compressed_models[model_key] = loaded_rfr

def process_dataset(i, testing_data, compressed_models):
    """Score every supplied model on one cleaned test dataset.

    Parameters
    ----------
    i
        Identifier of the test dataset; stored unchanged in the
        ``test_cleaning`` column of the result.
    testing_data
        Object exposing a ``dataset`` attribute that holds a DataFrame with
        at least the ``Education-num`` and ``Age`` columns. ``Age`` is the
        regression target; ``Education-num`` is discarded.
    compressed_models
        Mapping of model name to a fitted estimator exposing
        ``feature_names_in_`` and ``predict``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``test_cleaning``,
        ``train_cleaning``, ``pred_vals``, ``true_vals`` and ``score``
        (the R^2 of the predictions against the true ``Age`` values).
        Empty, but with those columns, when no models are supplied.
    """
    result_columns = ['model', 'test_cleaning', 'train_cleaning',
                      'pred_vals', 'true_vals', 'score']

    XY_test_pro = testing_data.dataset.drop('Education-num', axis=1)

    Y_test_pro = XY_test_pro['Age']
    X_test_pro = pd.get_dummies(XY_test_pro.drop('Age', axis=1))
    # Rename the income dummies to the names without '<' / '>' characters.
    X_test_pro = X_test_pro.rename(columns={"Income_<=50k": "Income_less_50k",
                                            "Income_>50k": "Income_greater_50k"})

    # Rows are gathered in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0.
    rows = []
    for model_name, model in compressed_models.items():
        # Align the test features with the columns the model was fitted on:
        # columns absent from this test set are added as all-zero, extra
        # columns are dropped, and the order follows the model's.
        X_test_upd = X_test_pro.reindex(columns=model.feature_names_in_, fill_value=0)

        pred_model = model.predict(X_test_upd)
        rows.append({
            'model': model_name,
            'test_cleaning': i,
            # The text after the last underscore of the model name.
            'train_cleaning': model_name.rsplit('_', 1)[-1],
            'pred_vals': pred_model.round(3),
            'true_vals': Y_test_pro.values.round(3),
            'score': r2_score(Y_test_pro, pred_model),
        })

    return pd.DataFrame(rows, columns=result_columns)

In [None]:
# Evaluate each test dataset in its own job, using one worker per CPU.
num_cores = joblib.parallel.cpu_count()
jobs = (
    delayed(process_dataset)(i, testing_datasets.loc[i], compressed_models)
    for i in range(len(testing_datasets))
)
results = Parallel(n_jobs=num_cores)(jobs)

# Stack the per-dataset result frames into one table with a fresh index.
final_result = pd.concat(results, ignore_index=True)

# Persist the combined table as a pickle file.
final_result.to_pickle(f'uscensus_rfr_{train_test_split_num}_results.pkl')