In [2]:
import pandas as pd
import numpy as np


# Location of the input CSV, given relative to the notebook's working directory
filepath = '../../data/togo-dapaong_qc.csv'

# Read the whole file into a DataFrame using read_csv's default settings
# (no parse_dates is requested, so 'Timestamp' is not converted on load)
df = pd.read_csv(filepath_or_buffer=filepath)

# Confirmation banner followed by a preview of the first rows
print("Data loaded successfully!", "First 5 rows of the DataFrame:", sep="\n")
print(df.head(5))

# info() writes its report straight to stdout, so it needs no print() wrapper
print("\nDataFrame Info:")
df.info()

Data loaded successfully!
First 5 rows of the DataFrame:
          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-10-25 00:01 -1.3  0.0  0.0   0.0   0.0  24.8  94.5  0.9     1.1   
1  2021-10-25 00:02 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.1     1.6   
2  2021-10-25 00:03 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.2     1.4   
3  2021-10-25 00:04 -1.2  0.0  0.0   0.0   0.0  24.8  94.3  1.2     1.6   
4  2021-10-25 00:05 -1.2  0.0  0.0   0.0   0.0  24.8  94.0  1.3     1.6   

   WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0      0.4  227.6      1.1  977         0            0.0   24.7   24.4   
1      0.4  229.3      0.7  977         0            0.0   24.7   24.4   
2      0.3  228.5      2.9  977         0            0.0   24.7   24.4   
3      0.3  229.1      4.6  977         0            0.0   24.7   24.4   
4      0.4  227.5      1.6  977         0            0.0   24.7   24.4   

   Comments  
0       NaN  
1       NaN  
2    

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) as produced by
# DataFrame.describe() with its default arguments
summary_stats = df.describe()
print(summary_stats)

                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      230.555040     151.258469     116.444352     226.144375   
std       322.532347     250.956962     156.520714     317.346938   
min       -12.700000       0.000000       0.000000       0.000000   
25%        -2.200000       0.000000       0.000000       0.000000   
50%         2.100000       0.000000       2.500000       4.400000   
75%       442.400000     246.400000     215.700000     422.525000   
max      1424.000000    1004.500000     805.700000    1380.000000   

                ModB           Tamb             RH             WS  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      219.568588      27.751788      55.013160       2.368093   
std       307.932510       4.758023      28.778732       1.462668   
min         0.000000      14.900000       3.300000       0.000000   
25%         0.000000      24.2000

In [4]:
# Number of NaN entries in each column
missing_counts = df.isna().sum()
print(missing_counts)

# Express each count as a percentage of the total number of rows
total_rows = df.shape[0]
missing_percentage = missing_counts.div(total_rows).mul(100)

# Keep only the columns where more than 5% of the values are missing
cols_with_high_missing = missing_percentage.loc[missing_percentage.gt(5)]

# Report the offending columns, or state explicitly that there are none
if cols_with_high_missing.empty:
    print("No columns found with more than 5% missing values.")
else:
    print(cols_with_high_missing)

Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64
Comments    100.0
dtype: float64


In [5]:
# Columns screened for outliers
zscore_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Per-column mean and standard deviation of the screened columns
zscore_frame = df[zscore_cols]
mean = zscore_frame.mean()
std = zscore_frame.std()

# Standardise: (value - column mean) / column standard deviation
z_scores = zscore_frame.sub(mean).div(std)

# A row is flagged when ANY screened column has |z| > 3
outlier_mask_zscore = z_scores.abs().gt(3).any(axis=1)

# Store the flag on the frame as a boolean column (True = flagged row);
# rows are only marked here, none are removed
df.loc[:, 'is_outlier_zscore'] = outlier_mask_zscore

# Preview the first few flagged rows
print("--- Rows flagged as outliers based on |Z| > 3 ---")
print(df.loc[df['is_outlier_zscore']].head())

--- Rows flagged as outliers based on |Z| > 3 ---
             Timestamp     GHI    DNI    DHI    ModA    ModB  Tamb    RH   WS  \
4985  2021-10-28 11:06  1139.0  805.1  466.1  1172.0  1154.0  29.8  70.8  2.2   
5410  2021-10-28 18:11    -1.2    0.0    0.0     0.0     0.0  29.7  63.5  6.6   
5411  2021-10-28 18:12    -1.0    0.0    0.0     0.0     0.0  29.3  62.4  6.9   
5413  2021-10-28 18:14    -0.8    0.0    0.0     0.0     0.0  28.6  63.7  7.1   
5420  2021-10-28 18:21    -1.3    0.0    0.0     0.0     0.0  27.7  64.5  6.1   

      WSgust  WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  \
4985     2.6      0.4  298.6     13.4  977         0            0.0   44.7   
5410     9.7      1.6  122.1     14.3  976         0            0.0   30.0   
5411     8.9      1.2  128.7     10.8  976         0            0.0   29.9   
5413     8.9      1.1  127.9     14.4  976         0            0.0   29.5   
5420     8.9      1.3  123.2     14.4  977         0            0.0   28.

In [6]:
# Drop the 'Comments' column, which the missing-value check reported as 100% NaN.
# The membership guard keeps this cell safe to re-run: an unconditional
# df.drop('Comments', axis=1) raises KeyError on a second execution because
# the column has already been removed from df.
if 'Comments' in df.columns:
    print("Dropping 'Comments' column due to 100% missing values.")
    df = df.drop(columns=['Comments'])
else:
    print(" - Column 'Comments' not found in DataFrame. Skipping drop.")

# Show the remaining column labels
print(df.columns)

Dropping 'Comments' column due to 100% missing values.
Index(['Timestamp', 'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS',
       'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation',
       'TModA', 'TModB', 'is_outlier_zscore'],
      dtype='object')


In [7]:
print("--- Handling Unrealistic Irradiance Values ---")

# Allowed [min, max] range for each irradiance column. Readings outside the
# range are clamped to the nearest bound; no rows are dropped.
irradiance_cleaning_rules = {
    'GHI': {'min': 0, 'max': 1200},
    'DNI': {'min': 0, 'max': 1100},
    'DHI': {'min': 0, 'max': 300},
}

# Columns to process, in order (each needs an entry in the rules above)
irradiance_cols_to_clean = ['GHI', 'DNI', 'DHI']

for col in irradiance_cols_to_clean:
    # Guard clause: report a missing column and move on to the next one
    if col not in df.columns:
        print(f" - Column '{col}' not found in DataFrame. Skipping cleaning for this column.")
        continue

    print(f"Cleaning column '{col}'...")

    # Lower bound: count the offenders first so the message reports how many
    # values were changed, then raise them to the minimum.
    min_threshold = irradiance_cleaning_rules[col]['min']
    below_min_count = df[col].lt(min_threshold).sum()
    if below_min_count > 0:
        df[col] = df[col].clip(lower=min_threshold)
        print(f" - Replaced {below_min_count} values below {min_threshold} with {min_threshold}.")
    else:
        print(f" - No values found below {min_threshold}.")

    # Upper bound: same pattern, counted after the lower-bound pass has run.
    max_threshold = irradiance_cleaning_rules[col]['max']
    above_max_count = df[col].gt(max_threshold).sum()
    if above_max_count > 0:
        df[col] = df[col].clip(upper=max_threshold)
        print(f" - Capped {above_max_count} values above {max_threshold} with {max_threshold}.")
    else:
        print(f" - No values found above {max_threshold}.")

print("\nIrradiance value cleaning complete (negatives set to 0, highs capped).")

--- Handling Unrealistic Irradiance Values ---
Cleaning column 'GHI'...
 - Replaced 257385 values below 0 with 0.
 - Capped 295 values above 1200 with 1200.
Cleaning column 'DNI'...
 - No values found below 0.
 - No values found above 1100.
Cleaning column 'DHI'...
 - No values found below 0.
 - Capped 82026 values above 300 with 300.

Irradiance value cleaning complete (negatives set to 0, highs capped).


In [8]:
# Destination of the cleaned dataset, given as a relative path
output_filepath = '../../data/togo_clean.csv'

# Write df to disk as CSV; index=False leaves the row index out of the file
df.to_csv(path_or_buf=output_filepath, index=False)