In [None]:
from pathlib import Path

# Dataset identifier; it is also the stem of the raw and cleaned CSV file names.
COUNTRY = "togo-dapaong_qc"
# Both locations are relative to the notebook's working directory
# (adjust them if the notebook is not inside the 'notebooks/' folder).
RAW_PATH = "../data/raw/" + COUNTRY + ".csv"
CLEAN_PATH = "../data/" + COUNTRY + "_clean.csv"

# --- IMPORTS ---
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- LOAD AND CLEAN DATA ---
# Quick readability check of the raw file: load it and count the rows that are
# fully populated. A missing file is reported instead of raising.
try:
    df = pd.read_csv(RAW_PATH)
    # A column that is empty in every row makes a row-wise dropna() discard the
    # whole frame, so drop all-NaN columns first and only then drop the rows
    # that still contain a missing value.
    df_clean = df.dropna(axis="columns", how="all").dropna()
    print(f"Data loaded successfully: {df_clean.shape[0]} rows")
except FileNotFoundError:
    print(f"File not found: {RAW_PATH}")



Data loaded successfully: 0 rows


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
# Apply the "seaborn-v0_8-whitegrid" matplotlib style sheet to every figure created after this point.
# NOTE(review): this style name is only registered in newer matplotlib releases — confirm the pinned version.
plt.style.use("seaborn-v0_8-whitegrid")



In [None]:
# Read the raw CSV again, this time parsing the 'Timestamp' column as datetimes.
df = pd.read_csv(RAW_PATH, parse_dates=["Timestamp"])
print(f"✅ Loaded {COUNTRY}: {len(df)} rows, {len(df.columns)} columns")
# info() prints dtypes and non-null counts; head() is the cell's rendered output.
df.info()
df.head()


In [None]:
# Summary stats for numeric columns.
# Only a cell's last expression is rendered automatically, so this table has to
# be displayed explicitly; as a bare mid-cell expression it was silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df.describe().T)

# Missing values: per-column count and share of all rows (in percent)
na = df.isna().sum().to_frame("n_missing")
na["pct_missing"] = 100 * na["n_missing"] / len(df)
print("Columns with >5% missing:")
# Kept under this name because the final summary cell reads len(na_gt5).
na_gt5 = na[na["pct_missing"] > 5]
na_gt5


In [None]:
# Numeric columns that are screened for outliers and median-imputed.
cols = ['GHI','DNI','DHI','ModA','ModB','WS','WSgust']

# Compute Z-score per column. Missing values stay NaN here and therefore never
# satisfy the |z| > 3 test, so a row is only flagged by values that are present.
z = (df[cols] - df[cols].mean()) / df[cols].std()
df["is_outlier"] = (z.abs() > 3).any(axis=1)
print("Outlier rows:", df["is_outlier"].sum())

# Fill missing values with each column's median
df[cols] = df[cols].fillna(df[cols].median())

# Drop outliers (copy so later column assignments do not touch `df`)
df_clean = df.loc[~df["is_outlier"]].copy()

# Save cleaned file.
# Create the directory CLEAN_PATH actually points to; the previous
# Path("data").mkdir() created ./data in the working directory, which is not
# where the file is written.
Path(CLEAN_PATH).parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(CLEAN_PATH, index=False)
print(f"✅ Cleaned data saved to {CLEAN_PATH}")


In [None]:
# Order the cleaned data chronologically and index it by Timestamp.
# The guard keeps this cell re-runnable: after the first run 'Timestamp' is the
# index rather than a column, and calling set_index again would raise KeyError.
if "Timestamp" in df_clean.columns:
    df_clean = df_clean.sort_values("Timestamp").set_index("Timestamp")

# Irradiance components and Tamb over the whole period, on one shared axis.
ax = df_clean[['GHI','DNI','DHI','Tamb']].plot(figsize=(12,5), title=f"{COUNTRY}: GHI, DNI, DHI, Tamb over Time")
ax.set_ylabel("Value")
plt.show()

# Calendar month taken from the timestamp index, then the mean per month.
df_clean["month"] = df_clean.index.month
monthly_mean = df_clean.groupby("month")[["GHI","DNI","DHI"]].mean()
ax = monthly_mean.plot(kind="bar", figsize=(10,4), title=f"{COUNTRY}: Average Monthly Irradiance")
ax.set_ylabel("Irradiance (W/m²)")
plt.show()


In [None]:
# Mean ModA/ModB reading for each value of the 'Cleaning' column; the plot is
# skipped entirely when the dataset has no such column.
if "Cleaning" in df_clean.columns:
    module_means = df_clean.groupby("Cleaning")[["ModA","ModB"]].mean()
    ax = module_means.plot.bar(title=f"{COUNTRY}: ModA & ModB pre/post cleaning")
    ax.set_ylabel("Module Output")
    plt.show()


In [None]:
# Pairwise correlation between the selected columns, drawn as an annotated heatmap.
corr_cols = ['GHI','DNI','DHI','TModA','TModB','Tamb','RH','WS','WSgust']
corr_matrix = df_clean[corr_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title(f"{COUNTRY}: Correlation Heatmap")
plt.show()

# One scatter plot per (x column, y column, title), each shown as its own figure.
for x_col, y_col, plot_title in [
    ('WS', 'GHI', 'Wind Speed vs GHI'),
    ('RH', 'Tamb', 'Relative Humidity vs Temperature'),
    ('RH', 'GHI', 'Relative Humidity vs GHI'),
]:
    sns.scatterplot(x=x_col, y=y_col, data=df_clean)
    plt.title(plot_title)
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Wind rose of the WS column binned by the WD column
# (presumably wind speed by wind direction — confirm against the data dictionary).
# This cell is best-effort: failures are reported, not raised, so the rest of
# the notebook still runs.
try:
    # Imported inside the try so that a missing package produces the install
    # hint below instead of an uncaught ImportError.
    from windrose import WindroseAxes

    ax = WindroseAxes.from_ax()
    ax.bar(df_clean['WD'], df_clean['WS'],
           normed=True,
           opening=0.8,
           edgecolor='white')  # keep as keyword args
    ax.set_title(f"{COUNTRY}: Wind Rose")
    ax.set_legend()
    plt.show()
except ImportError as e:
    print("⚠️ Wind rose plot failed. Try installing windrose:")
    print("    pip install windrose==1.6.8")
    print("Error details:", e)
except Exception as e:
    # Any other failure (e.g. a missing column) is not an installation problem,
    # so report it without the misleading install hint.
    print("⚠️ Wind rose plot failed.")
    print("Error details:", e)


In [None]:
# Scatter plus fitted regression line of RH against Tamb and against GHI,
# each drawn as a separate figure with semi-transparent points.
for y_col, plot_title in [
    ('Tamb', "Relative Humidity vs Temperature"),
    ('GHI', "Relative Humidity vs GHI"),
]:
    sns.regplot(x='RH', y=y_col, data=df_clean, scatter_kws={'alpha':0.4})
    plt.title(plot_title)
    plt.show()


In [None]:
# Bubble chart: GHI against Tamb, with each marker's area scaled by RH (factor 0.4).
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(
    df_clean['Tamb'],
    df_clean['GHI'],
    s=df_clean['RH']*0.4,
    alpha=0.4,
    c='orange',
    edgecolors='k',
)
ax.set_xlabel('Tamb (°C)')
ax.set_ylabel('GHI (W/m²)')
ax.set_title(f"{COUNTRY}: GHI vs Tamb (Bubble size = RH)")
plt.show()


In [None]:
# Headline figures for this dataset, collected into a one-row table indexed by COUNTRY.
outlier_pct = round(100*df["is_outlier"].mean(), 2)  # share of raw rows flagged as outliers
mean_ghi = round(df_clean["GHI"].mean(), 2)
mean_tamb = round(df_clean["Tamb"].mean(), 2)
ghi_dni_corr = round(df_clean["GHI"].corr(df_clean["DNI"]), 3)

# Built from ordered pairs so the column order of the table is explicit.
summary = dict([
    ("Rows (original)", len(df)),
    ("Rows (cleaned)", len(df_clean)),
    ("Outlier % removed", outlier_pct),
    ("Columns >5% missing", len(na_gt5)),
    ("Mean GHI (W/m²)", mean_ghi),
    ("Mean Tamb (°C)", mean_tamb),
    ("Corr(GHI,DNI)", ghi_dni_corr),
])
pd.DataFrame(summary, index=[COUNTRY])
