## Importing the Dependencies

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

## Loading the Data

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory).
# pandas is already imported as `pd` in the imports cell at the top of the notebook.
df = pd.read_csv("../data/benin-malanville.csv")

# Drop the last column by position.
# NOTE(review): presumably this trailing column is empty/unused — confirm, and
# prefer dropping it by name so a change in column order cannot remove real data.
df = df.drop(columns=df.columns[-1])

print(df.shape)
df.head()


## 1) Summary Statistics & Missing-Value Report

In [None]:
# Peek at the top of the frame; five rows is the default for head().
print("An overview of the dataset: the first 5 rows")
df.head(n=5)

In [None]:
# Peek at the bottom of the frame; five rows is the default for tail().
print("An overview of the dataset: the last 5 rows")
df.tail(n=5)

In [None]:
# Show five randomly chosen rows.
# A fixed random_state makes the sample identical on every Restart & Run All,
# so the displayed rows always match any commentary written about them.
print("An overview of the dataset: a random sample of 5 rows")
df.sample(n=5, random_state=42)

In [None]:
# Report the (rows, columns) shape announced by the message below.
print("An overview of the dataset: the shape of the dataset")
print(df.shape)

# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# to stdout itself and returns None, so it must not be wrapped in print():
# doing so appends a stray "None" line to the output.
df.info()

In [None]:
# Per-column count of missing entries, printed directly under its caption.
print("Check for missing values:", df.isna().sum(), sep="\n")

In [None]:
# Summary statistics from describe() with its default column selection.
print("The description of the numeric columns:", df.describe(), sep="\n")

In [None]:
# Summary of the object-dtype (categorical/text) columns only.
print("The description of the categorical columns:")
object_summary = df.describe(include=["object"])
print(object_summary)

In [None]:
# Summary covering every column regardless of dtype.
print("The description of all columns:", df.describe(include="all"), sep="\n")

In [None]:
# Count rows that exactly repeat an earlier row.
dup_count = df.duplicated().sum()
print("Duplicate rows:", dup_count)

# Collect the object/category columns, then record how many distinct
# values each one holds.
categorical_frame = df.select_dtypes(include=["object", "category"])
cat_cols = categorical_frame.columns.tolist()
cardinality = {}
for col_name in cat_cols:
    cardinality[col_name] = df[col_name].nunique()
print("Cardinality (categoricals):", cardinality)


## 2) Univariate Analysis

In [None]:
# One histogram (with a KDE overlay) per numeric column, each on its own figure.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col_name in num_cols:
    ax = plt.figure().add_subplot()
    sns.histplot(df[col_name], kde=True, ax=ax)
    ax.set(title=f"Distribution: {col_name}", xlabel=col_name, ylabel="Count")
    plt.show()


### Box Plots (numeric)

In [None]:
# One vertical box plot per numeric column; missing values are removed first.
for col_name in num_cols:
    values = df[col_name].dropna()
    ax = plt.figure().add_subplot()
    ax.boxplot(values, vert=True)
    ax.set(title=f"Box plot: {col_name}", ylabel=col_name)
    plt.show()


### Frequency Table (categorical)

In [23]:
# Render a one-column frequency table (named "count") for each categorical column.
for col_name in cat_cols:
    frequencies = df[col_name].value_counts()
    display(frequencies.to_frame(name="count"))


Unnamed: 0_level_0,count
Timestamp,Unnamed: 1_level_1
2021-08-09 00:01,1
2021-08-09 00:02,1
2021-08-09 00:03,1
2021-08-09 00:04,1
2021-08-09 00:05,1
...,...
2022-08-08 23:56,1
2022-08-08 23:57,1
2022-08-08 23:58,1
2022-08-08 23:59,1


## 3) Outlier Detection

### Z-Score Method (|z| > 3)

In [24]:
# Absolute z-scores of the numeric columns; nan_policy="omit" leaves NaNs
# out of the mean/std computation.
numeric_values = df[num_cols]
z = np.absolute(stats.zscore(numeric_values, nan_policy="omit"))

# A row is flagged when at least one of its columns has |z| > 3.
z_outliers_mask = (z > 3).any(axis=1)
outlier_row_total = int(z_outliers_mask.sum())
print("Z-score outlier rows:", outlier_row_total)


Z-score outlier rows: 17843


In [26]:
# Keep only rows where no numeric column has |z| > 3 (i.e. every |z| <= 3).
# The mask is the exact complement of the `(z > 3).any(axis=1)` outlier test,
# so the number of rows removed always equals the reported outlier count.
# A `(z < 3).all(axis=1)` filter would additionally drop rows whose z-score is
# NaN (comparisons with NaN are False) or exactly 3, which are not outliers.
keep_mask = ~(z > 3).any(axis=1)
df_clean = df[keep_mask]

print("After removing Z-score outliers:", df_clean.shape)

# Save the cleaned dataset; the path is held in one variable so the
# confirmation message cannot drift from the file actually written.
clean_csv_path = "../data/BENIN_EDA_RESULT.csv"
df_clean.to_csv(clean_csv_path, index=False)

print("Cleaned dataset saved to:", clean_csv_path)

After removing Z-score outliers: (507757, 18)
Cleaned dataset saved to: data/iris_clean.csv
