In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from windrose import WindroseAxes
import seaborn as sns
from scipy.stats import zscore

In [None]:
# Load the raw measurements CSV; the path is resolved against the current working directory.
benin_csv_path = '../data/benin-malanville.csv'
data_benin = pd.read_csv(benin_csv_path)

## Data Overview

In [None]:
# Preview the first five rows of the raw frame (displayed as the cell's output).
data_benin.head(n=5)

## Data Cleaning

In [None]:
# Per-column count of missing entries in the raw frame, printed under a header line.
missing_values = data_benin.isna().sum()
print("missing_values in each column", missing_values, sep="\n")

In [None]:
# Columns scanned for negative readings.
columns_to_check = [
    'GHI', 'DNI', 'DHI',
    'ModA', 'ModB',
    'WS', 'WSgust',
    'Precipitation',
]

# Boolean mask of negative entries, summed column-wise to get one count per column.
is_negative = data_benin[columns_to_check] < 0
negative_values_report = is_negative.sum()
print("Number of negative values in each column:", negative_values_report, sep="\n")

In [None]:
# Keep only rows in which every checked column is non-negative. NaN compares
# False against `>= 0`, so rows with a missing value in any checked column are
# dropped as well.
non_negative_rows = (data_benin[columns_to_check] >= 0).all(axis=1)

# .copy() makes the result an explicitly independent frame. Later cells add and
# replace columns on data_benin_cleaned; without the copy, pandas (< 3.0) flags
# the boolean-filtered frame as a possible copy of data_benin and those
# assignments emit SettingWithCopyWarning.
data_benin_cleaned = data_benin[non_negative_rows].copy()

print(f"Original DataFrame size: {data_benin.shape}")
print(f"Cleaned DataFrame size: {data_benin_cleaned.shape}")

## Exploratory Data Analysis (EDA)

In [None]:

# Promote 'Timestamp' to a datetime index.
# The guard keeps this cell re-runnable: once the column has been moved into the
# index, data_benin_cleaned['Timestamp'] would raise KeyError on a second run.
# Building a new frame with assign()/set_index() (instead of column assignment
# plus inplace=True) also avoids SettingWithCopyWarning on the filtered frame.
if 'Timestamp' in data_benin_cleaned.columns:
    data_benin_cleaned = data_benin_cleaned.assign(
        Timestamp=pd.to_datetime(data_benin_cleaned['Timestamp'])
    ).set_index('Timestamp')

# Plot GHI over time
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data_benin_cleaned.index, data_benin_cleaned['GHI'], color='orange', label='GHI (W/m²)')
ax.set_xlabel('Time')
ax.set_ylabel('GHI (W/m²)')
ax.set_title('Global Horizontal Irradiance (GHI) Over Time')
ax.tick_params(axis='x', labelrotation=45)  # slant the tick labels so they do not overlap
ax.legend()
plt.show()


In [None]:
# Correlation heatmap: pairwise correlation (DataFrame.corr default method)
# between the selected columns, annotated with two-decimal coefficients.
columns_to_correlate = ['GHI', 'DNI', 'DHI', 'Tamb', 'RH', 'WS', 'TModA', 'TModB']
correlation_matrix = data_benin_cleaned.loc[:, columns_to_correlate].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Key Variables')
plt.show()

## Wind Analysis

In [None]:
# ----------------------- Wind Analysis -----------------------
# Wind Rose Plot
def wind_rose_plot(data_benin_cleaned):
    """Draw and show a wind rose from the frame's 'WD' and 'WS' columns.

    Parameters
    ----------
    data_benin_cleaned : pandas.DataFrame
        Frame providing a 'WD' column (passed to the wind rose as the
        direction argument) and a 'WS' column (passed as the binned variable;
        the legend labels it as wind speed in m/s).

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    wind_direction = data_benin_cleaned['WD']
    wind_speed = data_benin_cleaned['WS']

    rose_ax = WindroseAxes.from_ax()
    # normed=True draws the bars as normalised frequencies instead of raw counts.
    rose_ax.bar(wind_direction, wind_speed, normed=True, opening=0.8, edgecolor='white')
    rose_ax.set_legend(title="Wind Speed (m/s)")
    rose_ax.set_title("Wind Rose")
    plt.show()


wind_rose_plot(data_benin_cleaned)

## Temperature Analysis

In [None]:

# ----------------------- Temperature Analysis -----------------------
# Scatter of relative humidity against ambient temperature, with points coloured by GHI.
rh_temp_ax = sns.scatterplot(
    data=data_benin_cleaned,
    x='RH',
    y='Tamb',
    hue='GHI',
    palette='coolwarm',
)
rh_temp_ax.set_title("RH vs Temperature with GHI as hue")
rh_temp_ax.set_xlabel("Relative Humidity (%)")
rh_temp_ax.set_ylabel("Temperature (°C)")
plt.show()

## Histograms 

In [None]:
# ----------------------- Histograms -----------------------
# One 30-bin histogram per variable, each drawn on its own figure.
variables = ['GHI', 'DNI', 'DHI', 'WS', 'Tamb']
for column in variables:
    values = data_benin_cleaned[column].dropna()  # missing values are excluded before binning
    fig, ax = plt.subplots()
    ax.hist(values, bins=30, alpha=0.7, label=column)
    ax.set_title(f"Histogram of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    plt.show()


## Z-Score Analysis

In [None]:
# ----------------------- Z-Score Analysis -----------------------
# Flagging Anomalous Points
# A GHI reading is flagged when its z-score magnitude exceeds this threshold.
GHI_Z_THRESHOLD = 3

# assign() builds a new frame instead of writing a column into the filtered
# frame, which would emit SettingWithCopyWarning on pandas < 3.0. Re-running the
# cell simply recomputes GHI_Z.
# nan_policy='omit' keeps one missing GHI value from turning every z-score into
# NaN (the default policy propagates NaN through the mean and std).
data_benin_cleaned = data_benin_cleaned.assign(
    GHI_Z=zscore(data_benin_cleaned['GHI'], nan_policy='omit')
)

anomalies = data_benin_cleaned[data_benin_cleaned['GHI_Z'].abs() > GHI_Z_THRESHOLD]
print(f"Number of anomalies in GHI: {len(anomalies)}")

## Bubble Chart

In [None]:
# ----------------------- Bubble Chart -----------------------
# GHI vs Tamb vs WS with RH as bubble size.
# The previous version named WS in the title but never plotted it, and set a
# label without drawing a legend. WS is now encoded as marker colour with a
# colorbar, so all four variables are actually visible.
fig, ax = plt.subplots()
bubbles = ax.scatter(
    data_benin_cleaned['Tamb'],
    data_benin_cleaned['GHI'],
    s=data_benin_cleaned['RH'],   # marker area follows relative humidity
    c=data_benin_cleaned['WS'],   # marker colour follows wind speed
    cmap='viridis',
    alpha=0.5,
)
fig.colorbar(bubbles, ax=ax, label="Wind Speed (m/s)")
ax.set_title("GHI vs Tamb vs WS (Bubble size = RH, color = WS)")
ax.set_xlabel("Temperature (°C)")
ax.set_ylabel("Global Horizontal Irradiance (GHI)")
plt.show()