In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import warnings
from functions import *  
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
def train_model(X, y, df, **kwargs):
	"""Fit a multivariate linear regression and predict on a held-out split.

	Parameters
	----------
	X : feature table; must support boolean-mask indexing.
	y : target values, aligned row-for-row with ``X``.
	df : frame with a datetime-like ``date`` column. Only read when ``year``
		is supplied.
	**kwargs :
		year : optional. When present, every row whose ``df['date']`` falls in
			that calendar year is excluded from both the training and the test
			split, and the remaining rows are split 80/20. Without it, all
			rows are split 70/30. Other keyword arguments are ignored.

	Returns
	-------
	tuple
		``(model, X_test, y_test, preds)``: the fitted ``LinearRegression``,
		the test features and targets, and the predictions for ``X_test``.
	"""
	if 'year' in kwargs:
		# Rows from the requested year are left out of the split entirely.
		# NOTE(review): the excluded year is neither evaluated nor returned
		# here; a caller wanting a per-year validation must slice it itself.
		keep_mask = ~(df['date'].dt.year == kwargs['year'])
		X_pool, y_pool, test_size = X[keep_mask], y[keep_mask], 0.2
	else:
		X_pool, y_pool, test_size = X, y, 0.3

	# Fixed random_state keeps the split reproducible between runs.
	X_train, X_test, y_train, y_test = train_test_split(
		X_pool, y_pool, test_size=test_size, random_state=123
	)

	# Define and train the multivariate linear regression model
	model = LinearRegression()
	model.fit(X_train, y_train)

	# Make predictions on the held-out test rows
	preds = model.predict(X_test)

	return model, X_test, y_test, preds


In [4]:
def evaluate_model(model, X_test, y_test, preds):
	"""Print and return the RMSE and correlation of predictions vs. targets.

	``model`` and ``X_test`` are accepted for call-site symmetry with
	``train_model`` but are not read here.

	Returns
	-------
	tuple
		``(rmse, r_value)`` where ``rmse`` is the square root of the mean
		squared error and ``r_value`` is the off-diagonal entry of
		``np.corrcoef`` (the correlation coefficient between ``y_test`` and
		``preds``).
	"""
	mse = mean_squared_error(y_test, preds)
	rmse = np.sqrt(mse)

	# squeeze() flattens single-column inputs so corrcoef sees two 1-D series.
	corr_matrix = np.corrcoef(y_test.squeeze(), preds.squeeze())
	r_value = corr_matrix[0, 1]

	for label, metric in (("RMSE:", rmse), ("R Value:", r_value)):
		print(label, metric)
	print('-' * 50)

	return rmse, r_value

In [5]:
# Per-state metrics, accumulated column-wise so the dict converts directly
# into a DataFrame afterwards.
results_dict = {"State": [], "RMSE": [], "R Value": []}

# states_subdiv_mapping, load_data, process_data and feature_names_full are
# not defined in this notebook; presumably they come from the star-imported
# `functions` module.
for state, subdiv in states_subdiv_mapping.items():
	print(state)

	df, df_energy = load_data(state)
	df = process_data(df, df_energy, subdiv)

	# Keep only the modelling columns and drop rows with any missing value.
	df = df[feature_names_full + ['date', 'energy_met_MU']].dropna()

	X, y = df[feature_names_full], df['energy_met_MU']

	# No `year` is passed, so train_model takes its no-`year` branch.
	model, X_test, y_test, preds = train_model(X, y, df)
	rmse, r_value = evaluate_model(model, X_test, y_test, preds)

	for _key, _val in (("State", state), ("RMSE", rmse), ("R Value", r_value)):
		results_dict[_key].append(_val)

Andhra_Pradesh
RMSE: 23.131496785937365
R Value: 0.7127435177321153
--------------------------------------------------
Arunachal_Pradesh
RMSE: 0.2329721055233904
R Value: 0.30824622383686623
--------------------------------------------------
Assam
RMSE: 10.149462934139676
R Value: 0.7701534922542231
--------------------------------------------------
Bihar
RMSE: 79.46913647612523
R Value: 0.4972452286815522
--------------------------------------------------
Chhattisgarh
RMSE: 14.676745403027779
R Value: 0.6924971226123652
--------------------------------------------------
Chandigarh
RMSE: 0.930106590715636
R Value: 0.8418875297183736
--------------------------------------------------
NCT_of_Delhi
RMSE: 15.46654546118503
R Value: 0.864876357904141
--------------------------------------------------
Goa
RMSE: 1.414740909764046
R Value: 0.5871610259383757
--------------------------------------------------
Gujarat
RMSE: 41.771460992393685
R Value: 0.6723398033438411
-------------------------

In [7]:
# Tabulate the per-state metrics, show them, and persist them as CSV.
# The "outputs/" directory must already exist; to_csv does not create it.
results_df = pd.DataFrame(results_dict)
print(results_df)
results_df.to_csv(
	"outputs/model_results_linear_regression.csv",
	index=False,
)

                State        RMSE   R Value
0      Andhra_Pradesh   23.131497  0.712744
1   Arunachal_Pradesh    0.232972  0.308246
2               Assam   10.149463  0.770153
3               Bihar   79.469136  0.497245
4        Chhattisgarh   14.676745  0.692497
5          Chandigarh    0.930107  0.841888
6        NCT_of_Delhi   15.466545  0.864876
7                 Goa    1.414741  0.587161
8             Gujarat   41.771461  0.672340
9    Himachal_Pradesh    3.744226  0.341318
10            Haryana   36.235686  0.800825
11          Jharkhand    8.586752  0.330498
12  Jammu_and_Kashmir    3.568084  0.712411
13          Karnataka   26.062472  0.816625
14             Kerala    6.655408  0.824117
15        Maharashtra   32.714288  0.786650
16          Meghalaya    0.688960  0.673414
17            Manipur    0.366682  0.552897
18     Madhya_Pradesh   28.135054  0.830906
19            Mizoram    0.196440  0.476164
20           Nagaland    0.476736  0.355114
21             Odisha   15.01000