In [110]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, recall_score, r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, StratifiedKFold, cross_validate, KFold, GridSearchCV, ParameterGrid, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingRegressor,BaggingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer


In [111]:
df = pd.read_csv('clean_data.csv')

df['Start Date'] = pd.to_datetime(df['Start Date'])
df['Year'] = df['Start Date'].dt.year
df['Month'] = df['Start Date'].dt.month
df = df.groupby(['Year', 'Month', 'Geographic Cluster Name']).size().reset_index(name='Program Count')
df.sort_values(by=['Year', 'Month'])


  df = pd.read_csv('clean_data.csv')


Unnamed: 0,Year,Month,Geographic Cluster Name,Program Count
0,2020,1,NEAR WEST SIDE,2
1,2020,1,NORTH LAWNDALE,2
2,2020,6,GRAND BOULEVARD,2
3,2020,6,KENWOOD,2
4,2020,6,LINCOLN SQUARE,1
...,...,...,...,...
2281,2023,8,LOWER WEST SIDE,2
2282,2023,9,HERMOSA,2
2283,2023,9,NEAR WEST SIDE,5
2284,2023,10,HERMOSA,2


In [103]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000],
    'subsample': [0.8, 0.9, 1.0]
}

model = xgb.XGBRegressor()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, n_jobs = -1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_


Unnamed: 0,Month,50,53,29,37,40,56,62,0,1,...,74,75,76,77,11,70,34,57,13,12
0,2024-01,124.151566,72.141914,16.302088,39.473606,108.090355,232.160751,60.75087,32.306793,30.939243,...,62.566322,73.092323,128.143661,33.401825,24.162777,23.698652,4.453276,2.74439,14.386921,0.979037
1,2024-02,60.802795,19.508095,6.340447,19.218632,51.29784,59.173088,5.63489,14.396987,8.645192,...,19.802315,11.155689,15.6215,3.006507,12.87137,23.698652,1.91628,2.74439,2.910158,0.979037
2,2024-03,92.910873,27.547321,5.296434,22.304035,50.25703,59.250168,4.99903,21.616137,14.191189,...,24.478052,23.994167,17.086746,2.619292,11.905905,26.651461,2.089723,2.74439,2.894731,0.979037
3,2024-04,191.440338,61.645393,8.943563,48.492828,113.937683,180.032745,50.317074,45.807617,29.503061,...,62.454124,49.333256,129.693222,22.107607,19.254374,26.651461,6.042068,2.74439,10.415839,0.979037
4,2024-05,80.976357,30.624981,8.644614,10.555427,20.419764,24.756926,14.349779,14.957205,23.104578,...,50.945869,7.233886,21.333294,9.326042,14.741533,2.738594,1.657183,2.74439,8.035288,0.979037
5,2024-06,65.157532,42.334011,11.72945,26.457794,58.641205,36.066391,28.876619,18.294992,23.104578,...,53.866848,10.955441,41.500004,13.097939,30.815485,3.450879,2.889872,2.74439,8.939391,0.979037
6,2024-07,65.517395,43.95266,12.276443,30.777803,60.89637,119.27935,46.049507,31.135403,25.660046,...,57.080486,41.215324,46.90004,14.830243,31.091089,3.499571,7.090826,2.74439,8.939391,0.979037
7,2024-08,48.254784,50.237011,12.61857,30.092913,66.505852,131.951447,43.49316,30.982811,25.084955,...,56.459126,48.544933,63.640087,22.065981,30.607962,4.831064,4.628077,2.74439,8.795937,0.979037
8,2024-09,66.079369,55.377522,12.555725,33.684273,114.699272,140.434204,57.660034,44.267117,25.784168,...,58.76804,65.009026,103.143166,26.218794,36.964283,20.297367,9.489142,2.74439,10.301839,0.979037
9,2024-10,21.551073,27.830086,13.344534,19.733091,76.42527,77.206184,21.557518,23.545036,13.545579,...,46.00486,27.580379,32.453285,5.343553,26.515186,6.62104,1.430444,2.74439,10.301839,0.979037


In [113]:
le_cluster = LabelEncoder()
df['Geographic Cluster Name'] = le_cluster.fit_transform(df['Geographic Cluster Name'])

months_2024 = pd.date_range(start='2024-01-01', end='2024-12-31', freq='M')
unique_clusters = df['Geographic Cluster Name'].unique()

prediction_df = pd.DataFrame({'Month': months_2024.strftime('%Y-%m')})
for cluster in unique_clusters:
    X_predict_cluster = pd.DataFrame({'Year': months_2024.year, 'Month': months_2024.month})
    X_train_cluster = df.loc[df['Geographic Cluster Name'] == cluster, ['Year', 'Month']]
    y_train_cluster = df.loc[df['Geographic Cluster Name'] == cluster, 'Program Count']

    model = xgb.XGBRegressor(**best_params)
    model.fit(X_train_cluster, y_train_cluster)
    
    predictions_cluster = model.predict(X_predict_cluster)
    temp_df = pd.DataFrame({le_cluster.inverse_transform([cluster])[0]: predictions_cluster})
    prediction_df = pd.concat([prediction_df, temp_df], axis=1)

prediction_df

Unnamed: 0,Month,NEAR WEST SIDE,NORTH LAWNDALE,GRAND BOULEVARD,KENWOOD,LINCOLN SQUARE,OAKLAND,ROSELAND,ALBANY PARK,ARCHER HEIGHTS,...,WEST PULLMAN,WEST RIDGE,WEST TOWN,WOODLAWN,BRIGHTON PARK,WEST ELSDON,HYDE PARK,OHARE,CALUMET HEIGHTS,BURNSIDE
0,2024-01,124.151566,72.141914,16.302088,39.473606,108.090355,232.160751,60.75087,32.306793,30.939243,...,62.566322,73.092323,128.143661,33.401825,24.162777,23.698652,4.453276,2.74439,14.386921,0.979037
1,2024-02,60.802795,19.508095,6.340447,19.218632,51.29784,59.173088,5.63489,14.396987,8.645192,...,19.802315,11.155689,15.6215,3.006507,12.87137,23.698652,1.91628,2.74439,2.910158,0.979037
2,2024-03,92.910873,27.547321,5.296434,22.304035,50.25703,59.250168,4.99903,21.616137,14.191189,...,24.478052,23.994167,17.086746,2.619292,11.905905,26.651461,2.089723,2.74439,2.894731,0.979037
3,2024-04,191.440338,61.645393,8.943563,48.492828,113.937683,180.032745,50.317074,45.807617,29.503061,...,62.454124,49.333256,129.693222,22.107607,19.254374,26.651461,6.042068,2.74439,10.415839,0.979037
4,2024-05,80.976357,30.624981,8.644614,10.555427,20.419764,24.756926,14.349779,14.957205,23.104578,...,50.945869,7.233886,21.333294,9.326042,14.741533,2.738594,1.657183,2.74439,8.035288,0.979037
5,2024-06,65.157532,42.334011,11.72945,26.457794,58.641205,36.066391,28.876619,18.294992,23.104578,...,53.866848,10.955441,41.500004,13.097939,30.815485,3.450879,2.889872,2.74439,8.939391,0.979037
6,2024-07,65.517395,43.95266,12.276443,30.777803,60.89637,119.27935,46.049507,31.135403,25.660046,...,57.080486,41.215324,46.90004,14.830243,31.091089,3.499571,7.090826,2.74439,8.939391,0.979037
7,2024-08,48.254784,50.237011,12.61857,30.092913,66.505852,131.951447,43.49316,30.982811,25.084955,...,56.459126,48.544933,63.640087,22.065981,30.607962,4.831064,4.628077,2.74439,8.795937,0.979037
8,2024-09,66.079369,55.377522,12.555725,33.684273,114.699272,140.434204,57.660034,44.267117,25.784168,...,58.76804,65.009026,103.143166,26.218794,36.964283,20.297367,9.489142,2.74439,10.301839,0.979037
9,2024-10,21.551073,27.830086,13.344534,19.733091,76.42527,77.206184,21.557518,23.545036,13.545579,...,46.00486,27.580379,32.453285,5.343553,26.515186,6.62104,1.430444,2.74439,10.301839,0.979037


In [115]:
prediction_df.iloc[:, 1:] = prediction_df.iloc[:, 1:].apply(lambda x: x.clip(lower=0))
prediction_df.to_csv('prediction_xgb.csv', index=False)
