In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load data

In [2]:
# Hard-coded absolute Windows path to the pickled input frame.
# NOTE(review): this only resolves on the original author's machine; consider a
# configurable data directory.
path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\clean_labeled_climate_data.pkl"
# NOTE(review): unpickling can execute code embedded in the file -- only load trusted pickles.
df = pd.read_pickle(filepath_or_buffer=path)
# Bare last expression: the (rows, columns) tuple becomes the cell output.
df.shape

(470342, 125)

In [3]:
# Columns that are moved out of `df` into their own `labels` frame.
label_columns = ['Level_1', 'Level_2', 'Level_3', 'Level_4', 'ECO_NAME', 'climates_f']

# Snapshot the label columns first ...
labels = pd.DataFrame(df.loc[:, label_columns])
# ... then rebind `df` to the frame without them.
# NOTE: because `df` is rebound, running this cell a second time raises KeyError
# (the columns are already gone) -- re-run the load cell first.
df = df.drop(labels=label_columns, axis=1)

# Train test split

In [6]:
# Model inputs: the two coordinate columns plus five temperature-named columns.
feature_columns = [
    'longitude', 'latitude',
    'jan_tmin', 'annual_tmin', 'annual_meant', 'jul_maxt', 'annual_maxt',
]

# Model outputs: three *_dptmean columns followed by the twelve monthly and one
# annual *_precip columns. Order matters: it defines the column order of every
# prediction frame built from `y.columns` later on.
target_columns = [
    'jan_dptmean', 'jul_dptmean', 'annual_dptmean',
    'jan_precip', 'feb_precip', 'mar_precip', 'apr_precip',
    'may_precip', 'jun_precip', 'jul_precip', 'aug_precip',
    'sep_precip', 'oct_precip', 'nov_precip', 'dec_precip',
    'annual_precip',
]

X = df.loc[:, feature_columns]
y = df.loc[:, target_columns]

In [9]:
# Hold out half of the rows for evaluation.
# random_state pins the shuffle so the same rows land in train/test on every
# Restart Kernel -> Run All; without it every metric below changes between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Modeling

## Random forest

In [10]:
# MultiOutputRegressor fits one independent copy of the base estimator per
# target column. random_state seeds the forest's bootstrap sampling and feature
# sub-sampling so the fitted model (and the metrics computed from it) can be
# reproduced on a fresh kernel.
rfr = MultiOutputRegressor(RandomForestRegressor(random_state=42))

rfr.fit(X_train, Y_train)

In [11]:
# predict() returns a plain 2-D array with one column per target; wrap it once
# in a DataFrame labelled like `y`.
# Re-using X_test's index keeps every prediction tied to its source row. With
# the default RangeIndex, pandas-aligned operations against Y_test (which keeps
# the shuffled original index) would pair up the wrong rows or produce NaNs.
Y_pred = pd.DataFrame(
    rfr.predict(X_test),
    columns=y.columns,
    index=X_test.index,
)

In [12]:
# Per-target evaluation of the current `Y_pred` against the held-out `Y_test`.
metrics = {}

for col in y.columns:
    observed = Y_test[col]
    predicted = Y_pred[col]
    # RMSE is in the units of the target column; R² is unitless.
    rmse = np.sqrt(mean_squared_error(observed, predicted))
    r2 = r2_score(observed, predicted)
    metrics[col] = {'RMSE': rmse, 'R²': r2}

# Outer dict keys become columns, so transpose to get one row per target.
metrics_df = pd.DataFrame(metrics).transpose()
print(metrics_df)

                     RMSE        R²
jan_dptmean      0.153369  0.999303
jul_dptmean      0.159837  0.999412
annual_dptmean   0.136104  0.999441
jan_precip       6.786577  0.987652
feb_precip       5.632384  0.986882
mar_precip       5.555368  0.987863
apr_precip       4.067121  0.990770
may_precip       3.113565  0.993947
jun_precip       2.335357  0.997475
jul_precip       1.896810  0.998206
aug_precip       1.864187  0.998080
sep_precip       2.062243  0.997163
oct_precip       3.651093  0.991070
nov_precip       5.849168  0.988798
dec_precip       7.375561  0.987007
annual_precip   43.166459  0.991321


## XGBoost

In [None]:
# One XGBoost regressor per target column, bound to its own name.
# The evaluation cell that follows calls `xgb_model.predict(...)`; binding this
# model to `rfr` instead left `xgb_model` undefined (NameError on a fresh
# kernel) and silently overwrote the fitted random forest.
xgb_model = MultiOutputRegressor(XGBRegressor())

xgb_model.fit(X_train, Y_train)

In [None]:
# Predictions from `xgb_model`, labelled like `y` and indexed like X_test so
# each row stays tied to its source observation (a default RangeIndex would not
# line up with Y_test's shuffled index in pandas-aligned operations).
Y_pred = pd.DataFrame(
    xgb_model.predict(X_test),
    columns=y.columns,
    index=X_test.index,
)

# Per-target RMSE (in the target's units) and R² on the held-out rows.
metrics = {}
for col in y.columns:
    observed = Y_test[col]
    predicted = Y_pred[col]
    rmse = np.sqrt(mean_squared_error(observed, predicted))
    r2 = r2_score(observed, predicted)
    metrics[col] = {'RMSE': rmse, 'R²': r2}

# Outer dict keys become columns, so transpose to get one row per target.
metrics_df = pd.DataFrame(metrics).T
print(metrics_df)