<a href="https://colab.research.google.com/github/iamakashkumar09/Solar_Power_Analysis_ML/blob/main/Project_Notebook_checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Solar Power Generation Analysis

## Team Members
### 1.Ashish Chauhan
### 2.Akash Kumar Gaud
### 3.Anish Chauhan
### 4.Ashish Gautam
### 5.Aryan Shrikant Jadhao

# Step 1. Data importing and preprocessing

In [1]:
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [2]:
# Read the power-plant measurements from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="powerPlantDataBerkeley.csv")

# Preview the first ten rows.
df.head(n=10)

FileNotFoundError: [Errno 2] No such file or directory: 'powerPlantDataBerkeley.csv'

In [None]:
# Summarise the columns (dtypes and non-null counts) to spot missing values.
df.info()

* The dataset is a time series. It was not re-sorted here because the rows are already in chronological order.
* None of the columns has the `object` dtype, so there was no need to convert any column to a pandas categorical and replace it with its integer codes to make it numeric.

In [None]:
# Separate the regression target from the predictors:
# y is the "Power Generated" column, X is every other column.
y = df["Power Generated"]
X = df.drop(columns="Power Generated")


In [None]:
# Chronological 70% / 15% / 15% split into train / validation / test sets.
# (The earlier note said "8:2", which did not match the sizes computed here.)
train_size = round(0.7 * len(df))              # position where the training rows end
val_size = round(train_size + 0.15 * len(df))  # position where the validation rows end

# train_test_split shuffles rows by default, which would leak future
# observations into the training set of a time series, so the rows are
# sliced by position instead to preserve their order.
# The feature frames are copied so that later imputation works on independent
# objects rather than on views of X.
X_train, y_train = X.iloc[:train_size].copy(), y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:val_size].copy(), y.iloc[train_size:val_size]
X_test, y_test = X.iloc[val_size:].copy(), y.iloc[val_size:]



In [None]:
# Sanity check: each feature/target pair must have the same number of rows.
(
    X_train.shape, y_train.shape,
    X_val.shape, y_val.shape,
    X_test.shape, y_test.shape,
)

## Filling missing data in the train, validation and test sets

### Train set

In [None]:

# Number of missing values in each feature column of the training split.
X_train.isna().sum(axis=0)

In [None]:
# Fill the missing "Average Wind Speed (Period)" readings with the column mean
# computed on the training split.
#
# The fill is applied at frame level and the result is assigned back to
# X_train. Calling fillna(..., inplace=True) on a selected column acts on an
# intermediate Series: pandas warns about it and, with Copy-on-Write enabled,
# X_train is left unchanged.
wind_col = "Average Wind Speed (Period)"
X_train = X_train.fillna({wind_col: X_train[wind_col].mean()})

# Confirm that no missing values remain in the training features.
X_train.isna().sum()

In [None]:
# Count the missing target values in the training split.
y_train.isna().sum()

### Validation set

In [None]:
# Number of missing values in each feature column of the validation split.
X_val.isna().sum(axis=0)

In [None]:
# Count the missing target values in the validation split.
y_val.isna().sum()

### Test Set

In [None]:
# Number of missing values in each feature column of the test split.
X_test.isna().sum(axis=0)

In [None]:
# Count the missing target values in the test split.
y_test.isna().sum()

# Step 2. Modelling & Evaluation

In [None]:
'''Custom evaluation function'''

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def model_score(y_true, y_pred):
    """Print four regression metrics for a set of predictions.

    Reports the mean absolute error, mean squared error, root mean squared
    error and R-squared, one per line. Nothing is returned.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
    """
    # All metrics are computed first; the report is printed afterwards.
    scores = {"Mean Absolute Error (MAE):": mean_absolute_error(y_true, y_pred)}
    scores["Mean Squared Error (MSE):"] = mean_squared_error(y_true, y_pred)
    scores["Root Mean Squared Error (RMSE):"] = np.sqrt(scores["Mean Squared Error (MSE):"])
    scores["R-squared (R2):"] = r2_score(y_true, y_pred)

    for label, value in scores.items():
        print(label, value)

In [None]:
# Baseline: ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Score the baseline on the validation split.
y_pred_lr = model.predict(X_val)
model_score(y_true=y_val, y_pred=y_pred_lr)

In [None]:
# Random forest with 100 trees, a fixed seed and all CPU cores.
from sklearn.ensemble import RandomForestRegressor

model2 = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the forest on the validation split.
y_pred_rf = model2.predict(X_val)
model_score(y_true=y_val, y_pred=y_pred_rf)


# Hyperparameter Tuning of RandomForestRegressor

### We will use `RandomizedSearchCV` to find the best values of the hyperparameters

#### The following hyperparameters are tuned
* max_depth
* min_samples_leaf
* min_samples_split
* n_estimators

In [None]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor

# Candidate values for each tuned hyperparameter; the search samples
# combinations from this grid rather than trying all of them.
grid = {
    "n_estimators": [10, 100, 200, 500, 1000, 1200],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4]
}

# Fixed seeds make both the sampled candidates and the fitted forests
# reproducible across runs.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# The rows are time-ordered, so an expanding-window splitter is used: every
# fold trains on earlier rows and validates on later ones. Plain 5-fold CV
# would train on rows that come after the ones it is scored on.
rs = RandomizedSearchCV(
    estimator=rf,
    param_distributions=grid,
    n_iter=20,
    cv=TimeSeriesSplit(n_splits=5),
    random_state=42,
)

rs.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the randomized search.
rs.best_params_


In [None]:
# Predict on the validation split with the estimator the search refitted on
# the full training data using the best parameters found.
rs_y_pred = rs.best_estimator_.predict(X_val)

In [None]:
# Report the validation metrics of the tuned random forest.
model_score(y_true=y_val, y_pred=rs_y_pred)

# RNN (Recurrent Neural Network)

This section trains a stacked LSTM (long short-term memory) network on sliding windows of the scaled features.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Step 1: Min-max scale the input features. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: The target gets its own scaler so predictions can later be mapped
# back with y_scaler.inverse_transform. The scaler expects a 2-D input, hence
# the reshape to a single column.
y_scaler = MinMaxScaler()
y_scaler.fit(y_train.to_numpy().reshape(-1, 1))
y_train_scaled = y_scaler.transform(y_train.to_numpy().reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.to_numpy().reshape(-1, 1))

# Step 3: Create sequences
def create_sequences(X, y, seq_len):
    """Build sliding-window samples for sequence models.

    Window ``i`` holds rows ``i .. i + seq_len - 1`` of ``X`` and is paired
    with the target at row ``i + seq_len``, i.e. the step right after the
    window.

    Args:
        X: Array-like of features with one row per time step.
        y: Array-like of targets aligned row-for-row with ``X``.
        seq_len: Number of consecutive time steps in each window.

    Returns:
        A tuple ``(Xs, ys)`` of NumPy arrays: the stacked windows and the
        matching targets. There are ``len(X) - seq_len`` samples.
    """
    # Convert to plain arrays so indexing is always positional. A pandas
    # Series indexed with an integer looks the value up by label, which fails
    # or picks the wrong row when the index does not start at 0.
    features = np.asarray(X)
    targets = np.asarray(y)

    Xs, ys = [], []
    for start in range(len(features) - seq_len):
        stop = start + seq_len
        Xs.append(features[start:stop])
        ys.append(targets[stop])
    return np.array(Xs), np.array(ys)

# Each sample is a window of SEQ_LEN consecutive rows; its target is the row
# that immediately follows the window.
SEQ_LEN = 60

X_train_seq, y_train_seq = create_sequences(
    X=X_train_scaled, y=y_train_scaled, seq_len=SEQ_LEN
)
X_test_seq, y_test_seq = create_sequences(
    X=X_test_scaled, y=y_test_scaled, seq_len=SEQ_LEN
)


In [None]:
# Validation windows are built from the validation split, scaled with the
# scalers fitted on the training data. Monitoring training on the test split
# would expose the test data before the final evaluation.
X_val_seq, y_val_seq = create_sequences(
    scaler.transform(X_val),
    y_scaler.transform(y_val.values.reshape(-1, 1)),
    SEQ_LEN,
)

# Stacked LSTM regressor: the first layer returns the full sequence so the
# second LSTM can consume it; a single Dense unit outputs the scaled target.
model_rnn = Sequential()
model_rnn.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))
model_rnn.add(LSTM(32, activation='relu'))
model_rnn.add(Dense(1))
model_rnn.compile(optimizer='adam', loss='mse')

model_rnn.fit(X_train_seq, y_train_seq, epochs=50, batch_size=32, validation_data=(X_val_seq, y_val_seq))

Training

Prediction and Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

In [None]:
# Predict on the test windows, then undo the target scaling so the metrics
# below are computed on unscaled values.
y_pred_rnn = model_rnn.predict(X_test_seq)
y_test_seq_inv = y_scaler.inverse_transform(y_test_seq.reshape(-1, 1))
y_pred_rnn_inv = y_scaler.inverse_transform(y_pred_rnn)

# Error metrics for the LSTM on the test split.
rmse_rnn = np.sqrt(mean_squared_error(y_true=y_test_seq_inv, y_pred=y_pred_rnn_inv))
mae_rnn = mean_absolute_error(y_true=y_test_seq_inv, y_pred=y_pred_rnn_inv)
r2_rnn = r2_score(y_true=y_test_seq_inv, y_pred=y_pred_rnn_inv)

print("Root Mean Squared Error is : ", rmse_rnn)
print("Mean Absolute Error is : ", mae_rnn)
print("R2 Score is : ", r2_rnn)


In [None]:
# Validation windows are built from the validation split, scaled with the
# scalers fitted on the training data. Monitoring training on the test split
# would expose the test data before the final evaluation.
X_val_seq, y_val_seq = create_sequences(
    scaler.transform(X_val),
    y_scaler.transform(y_val.values.reshape(-1, 1)),
    SEQ_LEN,
)

# Same stacked LSTM architecture as before, but with tanh activations in the
# recurrent layers instead of relu.
model_rnn = Sequential()
model_rnn.add(LSTM(64, return_sequences=True, activation='tanh', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))
model_rnn.add(LSTM(32, activation='tanh'))
model_rnn.add(Dense(1))
model_rnn.compile(optimizer='adam', loss='mse')

model_rnn.fit(X_train_seq, y_train_seq, epochs=50, batch_size=32, validation_data=(X_val_seq, y_val_seq))

In [None]:
# Evaluate the tanh-activated LSTM: predict on the test windows and map both
# predictions and targets back through the target scaler before scoring.
y_pred_rnn = model_rnn.predict(X_test_seq)
y_test_seq_inv = y_scaler.inverse_transform(y_test_seq.reshape(-1, 1))
y_pred_rnn_inv = y_scaler.inverse_transform(y_pred_rnn)

# Error metrics for the LSTM on the test split.
rmse_rnn = np.sqrt(mean_squared_error(y_true=y_test_seq_inv, y_pred=y_pred_rnn_inv))
mae_rnn = mean_absolute_error(y_true=y_test_seq_inv, y_pred=y_pred_rnn_inv)
r2_rnn = r2_score(y_true=y_test_seq_inv, y_pred=y_pred_rnn_inv)

print("Root Mean Squared Error is : ", rmse_rnn)
print("Mean Absolute Error is : ", mae_rnn)
print("R2 Score is : ", r2_rnn)