In [1]:
# 1. Imports
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [2]:
# 2. Load dataset
# Read benin-malanville.csv (path is relative to the notebook's working
# directory) into `df`, the frame the cells below operate on.
df = pd.read_csv(filepath_or_buffer="benin-malanville.csv")
# Display the first five rows as a quick sanity check of the parse.
df.head(n=5)

Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,2021-08-09 00:01,-1.2,-0.2,-1.1,0.0,0.0,26.2,93.4,0.0,0.4,0.1,122.1,0.0,998,0,0.0,26.3,26.2,
1,2021-08-09 00:02,-1.1,-0.2,-1.1,0.0,0.0,26.2,93.6,0.0,0.0,0.0,0.0,0.0,998,0,0.0,26.3,26.2,
2,2021-08-09 00:03,-1.1,-0.2,-1.1,0.0,0.0,26.2,93.7,0.3,1.1,0.5,124.6,1.5,997,0,0.0,26.4,26.2,
3,2021-08-09 00:04,-1.1,-0.1,-1.0,0.0,0.0,26.2,93.3,0.2,0.7,0.4,120.3,1.3,997,0,0.0,26.4,26.3,
4,2021-08-09 00:05,-1.0,-0.1,-1.0,0.0,0.0,26.2,93.3,0.1,0.7,0.3,113.2,1.0,997,0,0.0,26.4,26.3,


In [3]:
# 3. Preview structures
# Report the frame's (rows, columns) dimensions, then its column labels as a
# plain Python list.
print("Shape:", df.shape)
print("Columns:", list(df.columns))

Shape: (525600, 19)
Columns: ['Timestamp', 'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation', 'TModA', 'TModB', 'Comments']


In [4]:
# 4. Check for negative irradiance values
# Count the readings below zero in each irradiance column. The comparison is
# evaluated once across all listed columns and the resulting booleans are
# summed per column (True counts as 1), so no filtered copy of the full frame
# is built just to read its row count.
irradiance_cols = ['GHI', 'DNI', 'DHI']

# Series indexed by column name -> number of negative readings.
# NaN compares as False, so missing readings are not counted as negative.
negative_counts = df[irradiance_cols].lt(0).sum()

print("\n Negative Irradiance Values:")
for col, negative_count in negative_counts.items():
    print(f"{col}: {negative_count} negative values")


 Negative Irradiance Values:
GHI: 258847 negative values
DNI: 275987 negative values
DHI: 259182 negative values


In [5]:
# 5. Outlier detection using Z-score
# Standardise each selected column independently and flag readings that lie
# more than three standard deviations from that column's mean, in either
# direction (the absolute value of the z-score is what gets thresholded).
zscore_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# nan_policy="omit" leaves NaNs out of the mean/std computation. With the
# default ("propagate") a single missing reading turns every z-score in that
# column into NaN, and the column would then silently report zero outliers.
# Extra keyword arguments given to DataFrame.apply are forwarded to `zscore`.
z_scores = df[zscore_cols].apply(zscore, nan_policy="omit")

# A NaN z-score compares as False, so missing readings are never flagged.
outlier_mask = (np.abs(z_scores) > 3)
# Per-column number of flagged readings.
outlier_counts = outlier_mask.sum()

print("\n Outlier Counts (Z-score > 3):)")
# Only list the columns that contain at least one flagged reading.
print(outlier_counts[outlier_counts > 0])


 Outlier Counts (Z-score > 3):)
GHI         89
DHI       3738
ModA        27
ModB        63
WS        3109
WSgust    3500
dtype: int64


In [6]:
# 6. Missing values check
# Tally the null entries in every column, then print only the columns that
# have at least one.
missing = df.isnull().sum()
print("\n Missing Values:")
print(missing.loc[missing.gt(0)])


 Missing Values:
Comments    525600
dtype: int64


In [7]:
# 7. Check if 'Comments' is empty
# Guarded on the column's presence: when the frame has no 'Comments' column
# this cell prints nothing.
if 'Comments' in df:
    # Boolean Series: True where the comment cell is null.
    empty_comments = df['Comments'].isnull()
    empty_count = empty_comments.sum()
    print(f"\n Empty 'Comments' rows: {empty_count} out of {len(df)}")


 Empty 'Comments' rows: 525600 out of 525600
