# DS 3000 Project

### Analyzing Renewable Power and Weather Conditions

Khalid Altahan, Rohan Datta, Joao Langlois, Nicholas Moniz, Ayush Sharma

Methodology:

ETL
- Import data
- Clean data (check for missing values and duplicates, inspect data types, count unique values, etc.)

Exploratory Data Analysis
- Correlation Matrix (find high correlations)
- Analysis of Energy Delta vs. Time
- Analysis of Energy Delta vs. GHI
- Analysis of Energy Delta vs. SunlightTime

- Polynomial expansion
- Derive columns

Data Preparation
- K-Fold Cross Validation
- Drop Columns


In [5]:
# Import libraries
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# Load the CSV and drop the Time, sunlightTime and dayLength columns in one chained step
df = pd.read_csv('./Renewable.csv').drop(columns=['Time', 'sunlightTime', 'dayLength'])

# Preview the first rows of the resulting frame
df.head()

Unnamed: 0,Energy delta[Wh],GHI,temp,pressure,humidity,wind_speed,rain_1h,snow_1h,clouds_all,isSun,SunlightTime/daylength,weather_type,hour,month
0,0.0,0.0,1.6,1021.0,100.0,4.9,0.0,0.0,100.0,0.0,0.0,4.0,0.0,1.0
1,0.0,0.0,1.6,1021.0,100.0,4.9,0.0,0.0,100.0,0.0,0.0,4.0,0.0,1.0
2,0.0,0.0,1.6,1021.0,100.0,4.9,0.0,0.0,100.0,0.0,0.0,4.0,0.0,1.0
3,0.0,0.0,1.6,1021.0,100.0,4.9,0.0,0.0,100.0,0.0,0.0,4.0,0.0,1.0
4,0.0,0.0,1.7,1020.0,100.0,5.2,0.0,0.0,100.0,0.0,0.0,4.0,1.0,1.0


In [7]:
# Inspect the dtype of each column to see whether any of them need converting.
# NOTE: the datetime conversion below stays disabled because the `Time` column is
# dropped right after the CSV is read, so it no longer exists in df.
# df.Time = pd.to_datetime(df['Time'])
df.dtypes

Unnamed: 0,0
Energy delta[Wh],float64
GHI,float64
temp,float64
pressure,float64
humidity,float64
wind_speed,float64
rain_1h,float64
snow_1h,float64
clouds_all,float64
isSun,float64


In [8]:
# Tally the missing (NaN) entries per column and print the resulting Series
count = df.isna().sum()
print(count)

Energy delta[Wh]          1
GHI                       1
temp                      1
pressure                  1
humidity                  1
wind_speed                1
rain_1h                   1
snow_1h                   1
clouds_all                1
isSun                     1
SunlightTime/daylength    1
weather_type              1
hour                      1
month                     1
dtype: int64


In [9]:
# Count the rows that are exact repeats of an earlier row
# (with pandas' default keep='first', the first occurrence is not counted)
df.duplicated().sum()

33661

In [None]:
# Columns to compare against each other in the scatter-plot matrix
numerical_cols = ["Energy delta[Wh]","GHI","temp","pressure","humidity","wind_speed","rain_1h","snow_1h","SunlightTime/daylength","weather_type","hour","month"]

# sns.pairplot creates and sizes its own figure (one `height`-inch panel per
# column pair), so no plt.figure() call precedes it: opening a figure first
# only renders an extra, empty "Figure ... with 0 Axes".
sns.pairplot(df[numerical_cols], height=2.5)
plt.tight_layout()
plt.show()

<Figure size 1500x1000 with 0 Axes>

In [None]:
# Correlation between every pair of columns in df
correlation_matrix = df.corr()
axis_labels = correlation_matrix.columns

# Annotated heatmap of the matrix, drawn on an explicit 15x15 figure/axes pair
corr_fig, corr_ax = plt.subplots(figsize=(15, 15))
heatmap_options = dict(
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    cbar_kws={'shrink': 0.8},
    yticklabels=axis_labels,
    xticklabels=axis_labels,
)
hm = sns.heatmap(correlation_matrix, ax=corr_ax, **heatmap_options)

corr_ax.set_title("Correlation Matrix", fontsize=16)
corr_ax.set_xlabel("Features", fontsize=12)
corr_ax.set_ylabel("Features", fontsize=12)
plt.show()

In [None]:
#1 -- Simple linear regression: GHI as a function of Energy delta[Wh]

# The null check earlier in the notebook reports one missing value in every
# column, and LinearRegression (as well as the sklearn metrics) rejects NaN
# input. Incomplete rows are therefore removed for just the two columns used
# here; df itself is left untouched.
pair_df = df[['Energy delta[Wh]', 'GHI']].dropna()

# Feature / target pair (chosen in the original analysis as the most correlated pair)
X = pair_df[['Energy delta[Wh]']].values
Y = pair_df[['GHI']].values

# Split data into train and test (80% / 20%, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Simple linear model on non-normalized data
non_norm_model = LinearRegression()
non_norm_model.fit(X_train, Y_train)

Y_test_non_norm_predict = non_norm_model.predict(X_test)

# RMSE and R2 of the non-normalized model on the held-out test set
rmse_test_non_norm = np.sqrt(mean_squared_error(Y_test, Y_test_non_norm_predict))
r2_test_non_norm = r2_score(Y_test, Y_test_non_norm_predict)

# Test points with the fitted line on top. A single-axes figure is used:
# the previous 1x2 subplot grid only ever filled its left half.
plt.figure(figsize=(8, 6))
plt.scatter(X_test, Y_test, color='green', label='Test Data')
plt.plot(X_test, Y_test_non_norm_predict, color='black', lw=2, label='Prediction')
plt.xlabel('Energy delta[Wh]')
plt.ylabel('GHI')
plt.title('Non-Normalized: Actual vs Predicted')
plt.legend()

plt.tight_layout()
plt.show()

print(f"RMSE: {rmse_test_non_norm:.2f} R2: {r2_test_non_norm:.2f}")

In [None]:
#3 -- Gradient-boosted trees: predict GHI from weather and calendar features

feature_cols = ['temp', 'pressure', 'wind_speed', 'humidity', 'clouds_all', 'SunlightTime/daylength', 'month']

# A row without a GHI value cannot serve as a training label or be scored
# (the null check earlier in the notebook reports one missing value per column),
# so such rows are excluded here. df itself is left untouched.
labelled_df = df.dropna(subset=['GHI'])
X = labelled_df[feature_cols]
Y = labelled_df[['GHI']].values

# Hold out 20% of the rows as the final test set
X_train_full, X_test, Y_train_full, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Carve a validation set out of the training data for early stopping.
# Monitoring early stopping on the test set would tune the number of trees on
# the very data used for the final score and make that score optimistic.
X_train, X_val, Y_train, Y_val = train_test_split(X_train_full, Y_train_full, test_size=0.2, random_state=42)

# Initialize the XGBoost Regressor
model = xgb.XGBRegressor(
    n_estimators=200,           # Upper bound on the number of trees
    learning_rate=0.2,          # Step size shrinkage
    max_depth=7,                # Maximum depth of a tree
    subsample=0.8,              # Row sampling
    colsample_bytree=0.8,       # Feature sampling
    early_stopping_rounds=10,   # Stop after 10 rounds without validation improvement
    reg_lambda=1000,            # L2 regularization strength on leaf weights
    random_state=42
)

# Train the model; only the validation split is watched for early stopping
model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)])

# Make predictions
Y_pred = model.predict(X_test)
Y_train_pred = model.predict(X_train)

# Calculate metrics on the untouched test set and on the rows used for fitting
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
r2 = r2_score(Y_test, Y_pred)

rmse_train = np.sqrt(mean_squared_error(Y_train, Y_train_pred))
r2_train = r2_score(Y_train, Y_train_pred)

print(f"Testing: RMSE: {rmse} r2: {r2}")
print(f"Training: RMSE: {rmse_train} r2: {r2_train}")

# Plot feature importance ("weight" = number of times a feature is used in a split)
xgb.plot_importance(model, importance_type="weight")
plt.show()