In [2]:
import pandas as pd

# --- 1. Load the raw Togo data ---
togo = pd.read_csv('../data/solar-measurements_togo-davie_qc.csv', encoding='latin1', low_memory=False)
print("✅ Raw Togo data loaded!")
display(togo.head())

# --- 2. Convert Timestamp safely ---
togo['Timestamp'] = pd.to_datetime(togo['Timestamp'], errors='coerce')

# --- 3. Force numeric conversion for key columns ---
numeric_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 
                'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation', 'TModA', 'TModB']

for col in numeric_cols:
    if col in togo.columns:
        togo[col] = pd.to_numeric(togo[col], errors='coerce')

# --- 4. Drop rows with all solar metrics missing ---
togo = togo.dropna(subset=['GHI', 'DNI', 'DHI'], how='all')

# --- 5. Remove negative solar values ---
for col in ['GHI', 'DNI', 'DHI']:
    if col in togo.columns:
        togo.loc[togo[col] < 0, col] = None

# --- 6. Sort by timestamp ---
togo = togo.sort_values('Timestamp').reset_index(drop=True)

# --- 7. Save cleaned dataset ---
togo.to_csv('../data/togo_clean.csv', index=False)
print("✅ Togo dataset cleaned and saved successfully!")


✅ Raw Togo data loaded!


Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,yyyy-mm-dd hh:mm,W/m²,W/m²,W/m²,W/m²,W/m²,°C,%,m/s,m/s,m/s,°N (to east),°,hPa,1 or 0,mm/min,°C,°C,
1,2021-11-03 00:01,-1.4,-0.3,-1.3,0,0,23.1,98.2,0,0,0,0,0,1006,0,0,22.9,22.9,
2,2021-11-03 00:02,-1.4,-0.3,-1.3,0,0,23.1,98.2,0,0,0,0,0,1006,0,0,22.9,22.9,
3,2021-11-03 00:03,-1.4,-0.3,-1.3,0,0,23.1,98.2,0,0,0,0,0,1006,0,0,22.8,22.8,
4,2021-11-03 00:04,-1.4,-0.3,-1.3,0,0,23.1,98.1,0,0,0,0,0,1006,0,0,22.8,22.8,


  togo['Timestamp'] = pd.to_datetime(togo['Timestamp'], errors='coerce')


✅ Togo dataset cleaned and saved successfully!


In [3]:
togo_clean = pd.read_csv('../data/togo_clean.csv')
togo_clean.info()
togo_clean.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525600 entries, 0 to 525599
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Timestamp      525600 non-null  object 
 1   GHI            262595 non-null  float64
 2   DNI            254310 non-null  float64
 3   DHI            261533 non-null  float64
 4   ModA           525600 non-null  float64
 5   ModB           525600 non-null  float64
 6   Tamb           525600 non-null  float64
 7   RH             525600 non-null  float64
 8   WS             525600 non-null  float64
 9   WSgust         525600 non-null  float64
 10  WSstdev        525600 non-null  float64
 11  WD             525600 non-null  float64
 12  WDstdev        525600 non-null  float64
 13  BP             525600 non-null  float64
 14  Cleaning       525600 non-null  float64
 15  Precipitation  525600 non-null  float64
 16  TModA          525600 non-null  float64
 17  TModB          525600 non-nul

Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,2021-11-03 00:01:00,,,,0.0,0.0,23.1,98.2,0.0,0.0,0.0,0.0,0.0,1006.0,0.0,0.0,22.9,22.9,
1,2021-11-03 00:02:00,,,,0.0,0.0,23.1,98.2,0.0,0.0,0.0,0.0,0.0,1006.0,0.0,0.0,22.9,22.9,
2,2021-11-03 00:03:00,,,,0.0,0.0,23.1,98.2,0.0,0.0,0.0,0.0,0.0,1006.0,0.0,0.0,22.8,22.8,
3,2021-11-03 00:04:00,,,,0.0,0.0,23.1,98.1,0.0,0.0,0.0,0.0,0.0,1006.0,0.0,0.0,22.8,22.8,
4,2021-11-03 00:05:00,,,,0.0,0.0,23.1,98.1,0.0,0.0,0.0,0.0,0.0,1006.0,0.0,0.0,22.8,22.7,
