## 📘 Solar Resource Data – Profiling, Cleaning & EDA Notebook

---

### 1. Import Libraries 



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


### 2. Load Dataset (Benin)

In [None]:
# Relative path to the raw Benin (Malanville) CSV file.
path = "../data/benin-malanville.csv"

# Read the CSV into the working DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

### 3. Validate Timestamp & Data Types
#### Timestamp Parsing
Convert timestamp to datetime and check for issues.


In [None]:
# Parse the Timestamp column into datetime values. errors="coerce" turns any
# value that cannot be parsed into NaT instead of raising, so count how many
# new NaT values the conversion produced to make bad rows visible.
missing_before = int(df["Timestamp"].isna().sum())
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
unparseable = int(df["Timestamp"].isna().sum()) - missing_before
print(f"Timestamps that could not be parsed (now NaT): {unparseable}")

# Column dtypes and non-null counts after the conversion.
df.info()

### 4. Summary Statistics & Missing-Value Report

In [None]:
# Descriptive statistics for every column, numeric and non-numeric alike.
summary_stats = df.describe(include="all")
summary_stats

In [None]:
# Number of missing entries in each column.
missing = df.isnull().sum(axis=0)
missing


In [None]:
# Keep only the columns whose missing count exceeds 5% of the row count.
missing_threshold = len(df) * 0.05
missing_5 = missing.loc[missing.gt(missing_threshold)]
missing_5


In [None]:
# Preview the final five rows of the dataset.
df.tail(n=5)

In [None]:
# Preview five rows drawn at random from the dataset.
df.sample(n=5)

In [None]:
# Overview of the dataset: its shape, followed by per-column dtypes and
# non-null counts. DataFrame.info() prints its report itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
n_rows, n_cols = df.shape
print(f"Rows: {n_rows}, Columns: {n_cols}")
df.info()

### 5. Outlier Detection & Basic Cleaning

### Outlier Detection (Z-score Method)
Check GHI, DNI, DHI, ModA, ModB, WS, WSgust.

In [None]:
# Columns screened for outliers with the Z-score method.
cols = ["GHI", "DNI", "DHI", "ModA", "ModB", "WS", "WSgust"]

# Absolute Z-score of every value, computed column-wise; NaNs are ignored when
# estimating each column's mean and standard deviation. Passing a plain float
# array keeps the result an ndarray regardless of the installed SciPy version.
z_scores = np.abs(
    stats.zscore(df[cols].to_numpy(dtype=float), axis=0, nan_policy="omit")
)

# Count values with |z| > 3 per column (axis=0). A bare .sum() on an ndarray
# would collapse everything into a single total. NaN compares False, so missing
# values are never counted as outliers.
outliers = pd.Series((z_scores > 3).sum(axis=0), index=cols, name="outlier_count")
outliers


### Remove/Clean Outliers

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()

# Ensure Timestamp is datetime (unparseable values become NaT instead of
# raising) and order the rows chronologically.
df_clean["Timestamp"] = pd.to_datetime(df_clean["Timestamp"], errors="coerce")
df_clean = df_clean.sort_values("Timestamp")

# Replace negative irradiance values with zero.
irradiance_cols = ["GHI", "DNI", "DHI", "ModA", "ModB"]
df_clean[irradiance_cols] = df_clean[irradiance_cols].clip(lower=0)

# Constrain relative humidity to 0-100 and wrap wind direction into [0, 360).
# The vectorised modulo leaves NaN as NaN, like the element-wise version did.
df_clean["RH"] = df_clean["RH"].clip(lower=0, upper=100)
df_clean["WD"] = df_clean["WD"] % 360

# Normalise the Cleaning flag to 0/1: anything other than 1 (NaN included)
# becomes 0.
df_clean["Cleaning"] = (df_clean["Cleaning"] == 1).astype(int)

# Impute remaining missing numeric values with each column's median.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))

# Remove rows containing an outlier (|z| > 3σ) in any of these columns.
outlier_cols = ["GHI", "DNI", "DHI", "ModA", "ModB", "WS", "WSgust"]

# Passing a plain float array keeps the result an ndarray regardless of the
# installed SciPy version, so the row mask below is always positional.
z_scores = np.abs(
    stats.zscore(df_clean[outlier_cols].to_numpy(dtype=float), axis=0, nan_policy="omit")
)

# Flag rows with at least one |z| > 3. A NaN Z-score (e.g. from a constant
# column) compares False here, so it does not cause every row to be dropped.
has_outlier = (z_scores > 3).any(axis=1)

# .copy() makes the result an independent frame, so later assignments to
# df_clean do not trigger chained-assignment warnings.
df_clean = df_clean.loc[~has_outlier].copy()


### Export Cleaned File

In [None]:
# Write the cleaned frame to disk without the index column, then confirm.
df_clean.to_csv("../data/benin_clean.csv", index=False)
print("✅ Cleaned dataset saved successfully!")


### 6. Time Series Plots

In [None]:
# Line plot of cleaned GHI against time.
plt.figure(figsize=(15, 5))
plt.plot(df_clean["Timestamp"], df_clean["GHI"])
plt.title("GHI Over Time")
plt.xlabel("Timestamp")
plt.ylabel("GHI (W/m²)")
plt.show()


### 7. Cleaning Impact (ModA/ModB vs Cleaning Flag)

In [None]:
# Mean ModA and ModB readings for each value of the Cleaning flag.
module_cols = ["ModA", "ModB"]
grouped_by_flag = df_clean.groupby("Cleaning")
cleaning_effect = grouped_by_flag[module_cols].mean()
cleaning_effect


In [None]:
# Bar chart comparing the average ModA/ModB reading per Cleaning flag value.
ax = cleaning_effect.plot(
    kind="bar",
    figsize=(8, 4),
    title="Effect of Cleaning on ModA & ModB",
)
ax.set_ylabel("Average Reading")
plt.show()


### 8. Correlation Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between the selected columns.
corr_cols = ["GHI", "DNI", "DHI", "TModA", "TModB"]
corr_matrix = df_clean[corr_cols].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


### Scatter Plots

In [None]:
# Scatter of wind speed (x) against GHI (y) on the cleaned data.
ax = sns.scatterplot(x="WS", y="GHI", data=df_clean)
ax.set_title("Wind Speed vs GHI")
plt.show()


In [None]:
# Scatter of relative humidity (x) against ambient temperature (y). Uses the
# cleaned frame so this plot is based on the same rows as the other
# post-cleaning plots.
sns.scatterplot(data=df_clean, x="RH", y="Tamb")
plt.title("RH vs Temperature")
plt.show()


### 9. Wind & Distribution Analysis

#### Wind Direction & Speed
Histogram and distribution.

Histogram for GHI

In [None]:
# Histogram of GHI on the cleaned frame, so negative readings that the
# cleaning step clipped to zero do not distort the distribution.
df_clean["GHI"].plot(kind="hist", bins=40, figsize=(8, 5), title="GHI Distribution")
plt.show()

Histogram for WS

In [None]:
# Histogram of wind speed on the cleaned frame, consistent with the other
# post-cleaning plots.
df_clean["WS"].plot(kind="hist", bins=40, figsize=(8, 5), title="Wind Speed Distribution")
plt.show()


### 10. Temperature Analysis

Relationship Between RH, Temperature, & Solar Radiation


In [None]:
# Scatter of relative humidity (x) against GHI (y). Uses the cleaned frame so
# clipped irradiance values and removed outlier rows are reflected in the plot.
sns.scatterplot(data=df_clean, x="RH", y="GHI")
plt.title("RH vs GHI")
plt.show()


### 11. Bubble Chart

Bubble Chart: GHI vs Temperature (Bubble Size = RH)


In [None]:
# Bubble chart of GHI against ambient temperature, with marker area taken from
# RH. Uses the cleaned frame, where RH has been clipped to 0-100 and missing
# values imputed, so every marker has a valid size.
plt.figure(figsize=(10, 6))
plt.scatter(df_clean["Tamb"], df_clean["GHI"], s=df_clean["RH"], alpha=0.4)
plt.xlabel("Temperature (°C)")
plt.ylabel("GHI (W/m²)")
plt.title("GHI vs Temperature (Bubble = RH)")
plt.show()


## References

### 🔗 References
- Z-Score Outlier Detection: https://www.statisticshowto.com/probability-and-statistics/z-score/
- Seaborn Visualization Docs: https://seaborn.pydata.org/
- Time Series EDA Guide: https://pandas.pydata.org/docs/
- Wind Rose Examples: https://matplotlib.org/stable/gallery/specialty_plots/windrose.html
