In [1]:
import pandas as pd
import os
import numpy as np

# Relative directory that holds the per-site CSV files.
data_dir = "../data"

# Map each site key to the full path of its CSV file under data_dir.
file_paths = {
    site: os.path.join(data_dir, filename)
    for site, filename in (
        ("benin", "benin-malanville.csv"),
        ("sierra_leone", "sierraleone-bumbuna.csv"),
        ("togo", "togo-dapaong_qc.csv"),
    )
}

## Impute missing values in the numeric columns of a DataFrame.

In [7]:
def handle_missing_values(df: pd.DataFrame, imputation_strategy: str = "mean") -> pd.DataFrame:
    """Return a copy of ``df`` with missing values in numeric columns imputed.

    Only columns with a numeric dtype are touched; all other columns are
    carried over unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute.
    imputation_strategy : str, default "mean"
        One of:
        - ``"mean"``   -- fill each numeric column with its own mean.
        - ``"median"`` -- fill each numeric column with its own median.
        - ``"ffill"``  -- propagate the last valid observation forward.
        - ``"bfill"``  -- propagate the next valid observation backward.

    Returns
    -------
    pd.DataFrame
        A new frame with the same shape, index and columns as ``df``.

    Raises
    ------
    ValueError
        If ``imputation_strategy`` is not one of the supported values.

    Notes
    -----
    A numeric column that is entirely NaN has no mean/median and nothing to
    propagate, so it remains NaN under every strategy. Likewise, leading NaNs
    survive ``"ffill"`` and trailing NaNs survive ``"bfill"``.
    """
    # Validate up front so an invalid strategy fails before any copying.
    if imputation_strategy not in ("mean", "median", "ffill", "bfill"):
        raise ValueError(f"Invalid imputation strategy: {imputation_strategy}")

    # Work on a copy so the caller's frame is never mutated as a side effect.
    result = df.copy()
    numeric_cols = result.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return result

    numeric = result[numeric_cols]
    if imputation_strategy == "mean":
        filled = numeric.fillna(numeric.mean())
    elif imputation_strategy == "median":
        filled = numeric.fillna(numeric.median())
    elif imputation_strategy == "ffill":
        # fillna(method=...) is deprecated since pandas 2.1; use ffill()/bfill().
        filled = numeric.ffill()
    else:
        filled = numeric.bfill()

    result[numeric_cols] = filled
    return result

## Identify outliers in the numeric columns with the IQR method and remove the rows that contain them.

In [8]:
def handle_outliers(df: pd.DataFrame, iqr_multiplier: float = 1.5) -> pd.DataFrame:
    """Drop every row that holds an IQR outlier in at least one numeric column.

    For each numeric column the fences are ``Q1 - k * IQR`` and
    ``Q3 + k * IQR`` with ``k = iqr_multiplier``. A row is removed when any of
    its numeric values lies strictly outside its column's fences.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter. It is not modified; a filtered selection is returned.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the inter-quartile range.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` with no out-of-fence numeric value, with their
        original index labels.

    Notes
    -----
    NaN values compare as neither below nor above a fence, so a NaN never
    causes a row to be dropped. A column whose IQR is zero has both fences at
    the quartile value, so any row deviating from it in that column is dropped.
    """
    numeric = df[df.select_dtypes(include=[np.number]).columns]

    first_quartile = numeric.quantile(0.25)
    third_quartile = numeric.quantile(0.75)
    fence_width = iqr_multiplier * (third_quartile - first_quartile)

    # Per-cell flags: True where the value falls outside its column's fences.
    too_low = numeric.lt(first_quartile - fence_width, axis="columns")
    too_high = numeric.gt(third_quartile + fence_width, axis="columns")
    row_has_outlier = (too_low | too_high).any(axis=1)

    return df.loc[~row_has_outlier]

## Clean data by handling missing values and outliers.

In [9]:
def clean_data(df: pd.DataFrame, imputation_strategy: str = "mean", apply_outliers: bool = True) -> pd.DataFrame:
    """Impute missing values and, optionally, drop rows with outliers.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame to clean. A copy is handed to the imputation step, so that
        step operates on the copy rather than on the caller's frame.
    imputation_strategy : str, default "mean"
        Forwarded to ``handle_missing_values``.
    apply_outliers : bool, default True
        When true, the imputed frame is additionally passed through
        ``handle_outliers`` with its default multiplier.

    Returns
    -------
    pd.DataFrame
        The imputed frame, filtered for outliers if requested.
    """
    imputed = handle_missing_values(df.copy(), imputation_strategy)
    return handle_outliers(imputed) if apply_outliers else imputed

In [10]:
# Load each site's CSV into a DataFrame, keyed by the same site name as file_paths.
dataframes = {
    site: pd.read_csv(csv_path)
    for site, csv_path in file_paths.items()
}

In [6]:
# Preview the raw data for each site: first rows plus the dtype/null summary.
for name, df in dataframes.items():
    print(f"\n** {name.upper()} Data **")
    print(df.head())
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()

# Clean every site's frame with clean_data's defaults
# (mean imputation followed by IQR outlier removal).
cleaned_dataframes = {name: clean_data(df) for name, df in dataframes.items()}

# Report the remaining null count per column after cleaning.
# Note: a numeric column that is entirely NaN has no mean to fill with,
# so it will still report nulls here.
for name, df in cleaned_dataframes.items():
    print(f"\n** {name.upper()} Data - Null Values After Cleaning **")
    print(df.isnull().sum())


** BENIN Data **
          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-08-09 00:01 -1.2 -0.2 -1.1   0.0   0.0  26.2  93.4  0.0     0.4   
1  2021-08-09 00:02 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.6  0.0     0.0   
2  2021-08-09 00:03 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.7  0.3     1.1   
3  2021-08-09 00:04 -1.1 -0.1 -1.0   0.0   0.0  26.2  93.3  0.2     0.7   
4  2021-08-09 00:05 -1.0 -0.1 -1.0   0.0   0.0  26.2  93.3  0.1     0.7   

   WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0      0.1  122.1      0.0  998         0            0.0   26.3   26.2   
1      0.0    0.0      0.0  998         0            0.0   26.3   26.2   
2      0.5  124.6      1.5  997         0            0.0   26.4   26.2   
3      0.4  120.3      1.3  997         0            0.0   26.4   26.3   
4      0.3  113.2      1.0  997         0            0.0   26.4   26.3   

   Comments  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4       NaN  
<c