In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the Togo-Dapaong CSV, parsing the "Timestamp" column as datetimes.
raw_csv_path = "../data/togo-dapaong_qc.csv"
df = pd.read_csv(raw_csv_path, parse_dates=["Timestamp"])

# Preview the first rows as the cell output.
df.head()


In [None]:
# Only the last expression of a cell is rendered automatically, so the summary
# table must be shown explicitly or it is computed and silently discarded.
# `display` is provided by the IPython/Jupyter kernel.
display(df.describe(include=[np.number]))   # summary stats

# Per-column count of missing values (rendered as the cell output).
df.isna().sum()                             # missing values


In [None]:
z_cols = ["GHI","DNI","DHI","ModA","ModB","WS","WSgust"]


def _zscore(series):
    """Standardise one column using its mean and population std (ddof=0)."""
    return (series - series.mean()) / series.std(ddof=0)


# Z-scores are computed from the values as loaded, i.e. before any imputation.
z = df[z_cols].apply(_zscore)

# Flag every cell lying more than 3 standard deviations from its column mean.
# Missing values produce NaN z-scores, which compare False and are not flagged.
outliers = z.abs().gt(3)

# Median imputation (in place on `df`) for whichever target columns are present.
impute_cols = [name for name in z_cols + ["Tamb","RH"] if name in df.columns]
for name in impute_cols:
    column_median = df[name].median()
    df[name] = df[name].fillna(column_median)

# Drop extreme outliers: keep only rows with no flagged cell in any z-scored column.
df_clean = df.loc[~outliers.any(axis=1)].copy()


In [None]:
# Persist the cleaned frame next to the input file. The raw CSV is read from
# "../data/", so the export uses the same directory rather than a "data/" folder
# relative to the notebook's working directory. The row index is not written.
df_clean.to_csv("../data/togo-dapaong_qc_clean.csv", index=False)


In [None]:
# Overlay the irradiance series that exist in the cleaned frame on one time axis.
fig, ax = plt.subplots(figsize=(12, 6))
irradiance_cols = [name for name in ("GHI", "DNI", "DHI") if name in df_clean.columns]
for name in irradiance_cols:
    ax.plot(df_clean["Timestamp"], df_clean[name], label=name)
ax.legend()
ax.set_title("Solar Irradiance vs Time")


In [None]:
# Give each chart its own axes. Seaborn draws onto the current axes when `ax`
# is not passed, so issuing all three calls in one cell without it would paint
# both scatter plots on top of the correlation heatmap.
fig, (ax_corr, ax_ws, ax_rh) = plt.subplots(1, 3, figsize=(18, 5))

# Pairwise correlation between irradiance and module-temperature columns.
corr_cols = ["GHI","DNI","DHI","TModA","TModB"]
sns.heatmap(df_clean[corr_cols].corr(),
            annot=True, cmap="coolwarm", ax=ax_corr)
ax_corr.set_title("Correlation Heatmap")

# GHI plotted against WS.
sns.scatterplot(data=df_clean, x="WS", y="GHI", ax=ax_ws)
ax_ws.set_title("GHI vs WS")

# Tamb plotted against RH.
sns.scatterplot(data=df_clean, x="RH", y="Tamb", ax=ax_rh)
ax_rh.set_title("Tamb vs RH")

fig.tight_layout()


In [None]:
# One axes per chart. Without an explicit `ax`, both histograms and the bubble
# scatter are drawn on the same current axes, producing a single overlapping
# plot whose "Tamb"/"GHI" axis labels do not describe the histograms.
fig, (ax_ghi, ax_ws, ax_bubble) = plt.subplots(1, 3, figsize=(18, 5))

# Distribution of GHI.
df_clean["GHI"].hist(bins=30, ax=ax_ghi)
ax_ghi.set(title="GHI Distribution", xlabel="GHI", ylabel="Count")

# Distribution of WS.
df_clean["WS"].hist(bins=30, ax=ax_ws)
ax_ws.set(title="WS Distribution", xlabel="WS", ylabel="Count")

# Bubble chart: marker area is taken directly from the RH column.
ax_bubble.scatter(df_clean["Tamb"], df_clean["GHI"],
                  s=df_clean["RH"], alpha=0.5)
ax_bubble.set_xlabel("Tamb")
ax_bubble.set_ylabel("GHI")
ax_bubble.set_title("Bubble Chart: GHI vs Tamb (size=RH)")

fig.tight_layout()
