In [8]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Directory holding the raw CSV files (a relative path).
data_dir = "../data"

# Dataset label -> CSV path. The labels double as dictionary keys for the
# loaded frames and appear (upper-cased) in the printed section headers.
file_paths = {
    label: os.path.join(data_dir, csv_name)
    for label, csv_name in (
        ("benin", "benin-malanville.csv"),
        ("sierra_leone", "sierraleone-bumbuna.csv"),
        ("togo", "togo-dapaong_qc.csv"),
    )
}


def handle_missing_values(df, imputation_strategy="mean"):
    """Return a copy of *df* with missing values in numeric columns imputed.

    Only columns with a numeric dtype are touched; every other column is
    passed through unchanged. The input frame is not modified and the
    original column order is preserved.

    Args:
        df: Input DataFrame.
        imputation_strategy: One of ``"mean"``, ``"median"``, ``"ffill"`` or
            ``"bfill"``. ``"mean"``/``"median"`` fill each column with its own
            statistic; ``"ffill"``/``"bfill"`` propagate the previous/next
            valid observation along the rows.

    Returns:
        A new DataFrame with the same columns as *df*. Gaps that the chosen
        strategy cannot fill (e.g. an all-NaN column, or leading NaNs with
        ``"ffill"``) remain NaN.

    Raises:
        ValueError: If *imputation_strategy* is not a supported name.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric = df[numeric_cols]

    if imputation_strategy == "mean":
        imputed = numeric.fillna(numeric.mean())
    elif imputation_strategy == "median":
        imputed = numeric.fillna(numeric.median())
    elif imputation_strategy == "ffill":
        # fillna(method=...) is deprecated; the dedicated methods are the
        # supported spelling.
        imputed = numeric.ffill()
    elif imputation_strategy == "bfill":
        imputed = numeric.bfill()
    else:
        raise ValueError(f"Invalid imputation strategy: {imputation_strategy}")

    # Write the imputed columns back in place so the caller's column order
    # survives (drop + concat would push all numeric columns to the right).
    result = df.copy()
    if len(numeric_cols):
        result[numeric_cols] = imputed
    return result


def handle_outliers(df, iqr_multiplier=1.5):
    """Drop rows in which any numeric column lies outside its IQR fences.

    For each numeric column the fences are ``Q1 - k * IQR`` and
    ``Q3 + k * IQR`` with ``k = iqr_multiplier``. A row is removed when at
    least one of its numeric values is strictly below the lower fence or
    strictly above the upper fence. NaN values never compare as outliers, so
    they do not cause a row to be dropped.

    Args:
        df: Input DataFrame.
        iqr_multiplier: Width of the fences in units of the IQR.

    Returns:
        The rows of *df* that contain no outlier, with their original index
        labels.
    """
    numeric = df[df.select_dtypes(include=[np.number]).columns]

    quartiles = numeric.quantile([0.25, 0.75])
    first_q = quartiles.loc[0.25]
    third_q = quartiles.loc[0.75]
    spread = third_q - first_q

    too_low = numeric.lt(first_q - iqr_multiplier * spread)
    too_high = numeric.gt(third_q + iqr_multiplier * spread)
    has_outlier = (too_low | too_high).any(axis=1)

    return df.loc[~has_outlier]


def clean_data(df, imputation_strategy="mean", handle_outliers_=True):
    """Impute missing numeric values and, optionally, drop outlier rows.

    Args:
        df: Input DataFrame; a copy is handed to the imputation step.
        imputation_strategy: Forwarded to ``handle_missing_values``.
        handle_outliers_: When true, the imputed frame is additionally passed
            through ``handle_outliers`` with its default multiplier.

    Returns:
        The cleaned DataFrame.
    """
    imputed = handle_missing_values(df.copy(), imputation_strategy)
    if not handle_outliers_:
        return imputed
    return handle_outliers(imputed.copy())


# Load every configured CSV into memory, keyed by dataset label.
dataframes = {}
for name, path in file_paths.items():
    dataframes[name] = pd.read_csv(path)

# Display the data for all CSV files
for name, df in dataframes.items():
    print(f"\n** {name.upper()} Data **")
    print(df.head())  # Display the first few rows
    # DataFrame.info() prints its own report and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    df.info()  # Display data type and null information
    print(df.describe())  # Display statistical summary
    print(f"\nNull values in {name.upper()} Data:\n{df.isnull().sum()}")

# Clean the data (default settings: mean imputation, outlier rows removed)
cleaned_dataframes = {}
for name, df in dataframes.items():
    cleaned_dataframes[name] = clean_data(df)

# Display the cleaned data for all CSV files
for name, df in cleaned_dataframes.items():
    print(f"\n** {name.upper()} Data - After Cleaning **")
    print(df.head())  # Display the first few rows of cleaned data
    df.info()  # Display data type and null information after cleaning
    print(f"\nNull values in {name.upper()} Data After Cleaning:\n{df.isnull().sum()}")



** BENIN Data **
          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-08-09 00:01 -1.2 -0.2 -1.1   0.0   0.0  26.2  93.4  0.0     0.4   
1  2021-08-09 00:02 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.6  0.0     0.0   
2  2021-08-09 00:03 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.7  0.3     1.1   
3  2021-08-09 00:04 -1.1 -0.1 -1.0   0.0   0.0  26.2  93.3  0.2     0.7   
4  2021-08-09 00:05 -1.0 -0.1 -1.0   0.0   0.0  26.2  93.3  0.1     0.7   

   WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0      0.1  122.1      0.0  998         0            0.0   26.3   26.2   
1      0.0    0.0      0.0  998         0            0.0   26.3   26.2   
2      0.5  124.6      1.5  997         0            0.0   26.4   26.2   
3      0.4  120.3      1.3  997         0            0.0   26.4   26.3   
4      0.3  113.2      1.0  997         0            0.0   26.4   26.3   

   Comments  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4       NaN  
<c