In [1]:
import pandas as pd

# --- 1. Load the raw Sierra Leone data ---
sl = pd.read_csv('../data/solar-measurements_sierraleone-bumbuna_qc.csv', encoding='latin1', low_memory=False)
print("✅ Raw Sierra Leone data loaded!")
display(sl.head())

# --- 2. Convert Timestamp safely ---
sl['Timestamp'] = pd.to_datetime(sl['Timestamp'], errors='coerce')

# --- 3. Force numeric conversion for key columns ---
numeric_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust',
                'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation', 'TModA', 'TModB']

for col in numeric_cols:
    if col in sl.columns:
        sl[col] = pd.to_numeric(sl[col], errors='coerce')

# --- 4. Drop rows with all solar metrics missing ---
sl = sl.dropna(subset=['GHI', 'DNI', 'DHI'], how='all')

# --- 5. Remove negative solar values (only works after numeric conversion) ---
for col in ['GHI', 'DNI', 'DHI']:
    if col in sl.columns:
        sl.loc[sl[col] < 0, col] = None

# --- 6. Sort by timestamp and reset index ---
sl = sl.sort_values('Timestamp').reset_index(drop=True)

# --- 7. Save cleaned dataset ---
sl.to_csv('../data/sierraleone_clean.csv', index=False)
print("✅ Sierra Leone dataset cleaned and saved successfully!")


✅ Raw Sierra Leone data loaded!


Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,yyyy-mm-dd hh:mm,W/m²,W/m²,W/m²,W/m²,W/m²,°C,%,m/s,m/s,m/s,°N (to east),°,hPa,1 or 0,mm/min,°C,°C,
1,2021-10-30 00:01,-0.7,-0.1,-0.8,0,0,21.9,99.1,0,0,0,0,0,1002,0,0,22.3,22.6,
2,2021-10-30 00:02,-0.7,-0.1,-0.8,0,0,21.9,99.2,0,0,0,0,0,1002,0,0,22.3,22.6,
3,2021-10-30 00:03,-0.7,-0.1,-0.8,0,0,21.9,99.2,0,0,0,0,0,1002,0,0,22.3,22.6,
4,2021-10-30 00:04,-0.7,0,-0.8,0,0,21.9,99.3,0,0,0,0,0,1002,0,0.1,22.3,22.6,


  sl['Timestamp'] = pd.to_datetime(sl['Timestamp'], errors='coerce')


✅ Sierra Leone dataset cleaned and saved successfully!
