In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from scipy.stats import linregress
from sklearn.metrics import mean_squared_error

from functions import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Install a process-wide filter that ignores every FutureWarning raised after
# this cell runs (presumably to keep library deprecation notices out of the
# cell outputs). Other warning categories are still shown.
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
def evaluate_model(xg_reg, X_test, y_test, preds, state=None):
	"""Print a short fit report for one model and return its key numbers.

	Parameters
	----------
	xg_reg : object
		Fitted estimator. Only its ``n_estimators`` attribute is read.
	X_test : any
		Not used by this function; kept so existing call sites keep working.
	y_test : array-like
		Observed target values (pandas Series or any 1-D array-like).
	preds : array-like
		Predicted values, in the same order as ``y_test``.
	state : str, optional
		Label printed in the report header. When omitted, a module-level
		variable named ``state`` is used if one exists (the previous version
		of this function read that global implicitly); otherwise the label
		is ``"unknown"``.

	Returns
	-------
	tuple
		``(r_value, n_estimators)``: the correlation coefficient from
		``scipy.stats.linregress(y_test, preds)`` and ``xg_reg.n_estimators``.
	"""
	if state is None:
		# Backward compatibility for callers that rely on the loop variable
		# ``state`` living in the notebook namespace.
		state = globals().get("state", "unknown")

	# Work on plain arrays so Series indexes cannot affect the arithmetic and
	# so non-pandas inputs are accepted as well.
	y_true = np.asarray(y_test)
	y_hat = np.asarray(preds)

	rmse = np.sqrt(np.mean((y_true - y_hat) ** 2))
	# RMSE relative to the spread of the observations (np.std default, ddof=0).
	sn = rmse / np.std(y_true)
	# Fit once and reuse the result for both the printout and the return value.
	fit = linregress(y_true, y_hat)

	print(f"State: {state}")
	print("RMSE:", rmse)
	print("SN:", sn)
	print(fit)
	print('-'*50)

	return fit.rvalue, xg_reg.n_estimators

In [4]:
# Load the tuned hyperparameters, one CSV row per state.
hyperparameters_df = pd.read_csv("best_hyperparameters.csv")

# Build {state: {hyperparameter: value}}.
# orient="index" assembles each row's dict column by column, so integer
# columns stay integers. The earlier `.T.to_dict()` transposed first, which
# upcasts mixed int/float columns to a common dtype and turned integer
# hyperparameters into floats. orient="index" also raises ValueError if a
# state appears twice instead of silently keeping only one of the rows.
hyperparameters_dict = hyperparameters_df.set_index('state').to_dict(orient="index")

# Column-oriented accumulator for the per-state summary; the training loop
# appends one entry to each list per state.
results_dict = {"State":[], "R Value":[], "Number of Trees":[]}

In [7]:
# Start from empty lists every time this cell runs, so re-executing it does not
# append a second copy of every state to the summary table.
results_dict = {"State":[], "R Value":[], "Number of Trees":[]}

# The per-state CSVs are written into this folder; create it up front so the
# first to_csv call cannot fail after a model has already been trained.
os.makedirs("outputs", exist_ok=True)

# NOTE: the loop variable must stay named `state` - evaluate_model reads it
# from the notebook namespace. The other top-level names are kept as well
# because later cells may reuse the values left over from the last iteration.
for state, subdiv in states_subdiv_mapping.items():
	# load_data / process_data come from functions.py (not shown here).
	df, df_energy = load_data(state)
	df = process_data(df, df_energy, subdiv)

	# Keep only the model inputs, the date and the target, then drop any row
	# with a missing value in one of those columns.
	df = df[feature_names_full + ['date', 'energy_met_MU']]
	df = df.dropna()

	X = df[feature_names_full]
	y = df['energy_met_MU']

	# Raises KeyError if the hyperparameter file has no row for this state.
	best_params = hyperparameters_dict[state]

	# train_model returns the fitted model plus the X_test / y_test / preds
	# used for evaluation - presumably a held-out split; confirm in functions.py.
	xg_reg, X_test, y_test, preds = train_model(X, y, df, best_params)
	r_value, n_trees = evaluate_model(xg_reg, X_test, y_test, preds)

	# Predict for every row of X (not only X_test) so the exported CSV lines
	# up one-to-one with the rows of df.
	y_pred = xg_reg.predict(X)
	eval_results = {"date":df['date'].values, "observed":y, "predicted":y_pred}
	eval_df = pd.DataFrame.from_dict(eval_results)
	eval_df.to_csv(f"outputs/{state}.csv", index=False)

	results_dict["State"].append(state)
	results_dict["R Value"].append(r_value)
	results_dict["Number of Trees"].append(n_trees)

State: Andhra_Pradesh
RMSE: 14.982868193070361
SN: 0.4433607142471069
LinregressResult(slope=0.7409118121668669, intercept=1.0511315918210389, rvalue=0.9009358267110372, pvalue=1.7832260079007572e-196, stderr=0.015414601582678811, intercept_stderr=0.5217381273251022)
--------------------------------------------------
State: Arunachal_Pradesh
RMSE: 0.2217809878190346
SN: 0.8855399254438959
LinregressResult(slope=0.20445588748501592, intercept=0.0043351106549375, rvalue=0.4657728986986308, pvalue=2.526574203630508e-30, stderr=0.01677796610360168, intercept_stderr=0.004206652755966903)
--------------------------------------------------
State: Assam
RMSE: 6.196827577059516
SN: 0.42541256067679895
LinregressResult(slope=0.8378734458033645, intercept=-0.05063674960962572, rvalue=0.9052362446598187, pvalue=2.221414771992275e-201, stderr=0.016987506913038578, intercept_stderr=0.24769961717523578)
--------------------------------------------------
State: Bihar
RMSE: 60.62606986924746
SN: 0.6589

In [8]:
# Turn the column-oriented accumulator into a table (one row per state).
results_df = pd.DataFrame(results_dict)
# Show the table as plain text in the cell output.
print(results_df)
# Persist the summary next to the per-state CSVs; the row index is not written.
results_df.to_csv("outputs/model_results_xg.csv", index=False)

                State   R Value  Number of Trees
0      Andhra_Pradesh  0.900936              200
1   Arunachal_Pradesh  0.465773              200
2               Assam  0.905236              200
3               Bihar  0.753971              200
4        Chhattisgarh  0.912108              200
5          Chandigarh  0.972473              200
6        NCT_of_Delhi  0.980916              200
7                 Goa  0.730916              200
8             Gujarat  0.905806              200
9    Himachal_Pradesh  0.686400              200
10            Haryana  0.963931              200
11          Jharkhand  0.641986              200
12  Jammu_and_Kashmir  0.795268              200
13          Karnataka  0.930933              200
14             Kerala  0.908326              200
15        Maharashtra  0.917161              200
16          Meghalaya  0.759368              200
17            Manipur  0.628516              200
18     Madhya_Pradesh  0.967082              200
19            Mizora