In [1]:
import pandas as pd
import numpy as np

The code below cleans the data. The following steps are performed:
1. Read the CSV file from GitHub and describe it
2. Identify the timestamp column, convert it to a datetime index, and verify the 10-minute interval
3. Remove any exact duplicate rows
4. Handle missing values: if any are found in a numeric column, fill them with the column mean
5. Check for gaps (any missing timestamps)
6. Fill missing (NaN) categorical values with the most frequent value (mode)

In [2]:
# Name of the timestamp column and the spacing expected between consecutive rows.
DATE_COL = "date"
EXPECTED_FREQ = "10min"
# Location of the raw CSV file on GitHub.
RAW_URL = "https://raw.githubusercontent.com/ibimohamed83/Data_Analytics_project/refs/heads/main/dataset/energydata_complete.csv"

In [None]:

# Load the raw CSV into a DataFrame.
df = pd.read_csv(RAW_URL)

# Summarise the frame: dimensions, column names, and per-column dtypes.
for label, value in (
    ("Shape (rows, cols):", df.shape),
    ("\nColumns:\n", df.columns.tolist()),
    ("\nDtypes:\n", df.dtypes),
):
    print(label, value)

Shape (rows, cols): (19735, 29)

Columns:
 ['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2']

Dtypes:
 date               str
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
dtype: object


In [5]:
# Identify timestamp, parse, set index.
# Re-run safety: if a previous run of this cell already moved the timestamp
# column into the index, move it back to a regular column first so the cell
# can be executed again without raising the "column not found" error below.
if DATE_COL not in df.columns and df.index.name == DATE_COL:
    df = df.reset_index()

if DATE_COL not in df.columns:
    raise ValueError(f"Expected '{DATE_COL}' column not found.")

# errors="coerce" turns unparseable values into NaT so they can be counted
# and dropped instead of aborting the parse.
df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
bad_dates = int(df[DATE_COL].isna().sum())
print("\nBad/unparseable dates:", bad_dates)

# Drop rows with invalid timestamps (if any)
df = df.dropna(subset=[DATE_COL])

# Sort by time and set as index.
# kind="stable" keeps rows that share a timestamp in their original relative
# order; the default quicksort gives no such guarantee, which would make any
# later "keep the first duplicate timestamp" step non-deterministic.
df = df.sort_values(DATE_COL, kind="stable").set_index(DATE_COL)

# Verify 10-minute interval consistency: compare each consecutive timestamp
# difference against the expected step.
diffs = df.index.to_series().diff().dropna()
expected_delta = pd.Timedelta(EXPECTED_FREQ)
pct_expected = (diffs == expected_delta).mean() * 100

print("\nTime range:", df.index.min(), "->", df.index.max())
print(f"% intervals equal to {EXPECTED_FREQ}:", round(pct_expected, 2))
print("Most common time gaps:\n", diffs.value_counts().head(5))


Bad/unparseable dates: 0

Time range: 2016-01-11 17:00:00 -> 2016-05-27 18:00:00
% intervals equal to 10min: 100.0
Most common time gaps:
 date
0 days 00:10:00    19734
Name: count, dtype: int64


In [6]:
# Remove duplicates (rows and timestamps).
# DataFrame.duplicated() compares column values only and ignores the index.
# Since the timestamp is the index here, two readings taken at different times
# that happen to have identical values would be flagged as duplicates and one
# of them dropped, punching a hole in the time series. Bringing the index back
# as a column makes the timestamp part of the "exact duplicate" comparison.
dup_row_mask = df.reset_index().duplicated().to_numpy()
dup_rows = int(dup_row_mask.sum())
dup_timestamps = int(df.index.duplicated().sum())

print("\nDuplicate rows found:", dup_rows)
print("Duplicate timestamps found:", dup_timestamps)

if dup_rows > 0:
    # Keep the first occurrence of each fully identical (timestamp + values) row.
    df = df[~dup_row_mask]

if dup_timestamps > 0:
    # Rows that share a timestamp but differ in values: keep the first one.
    df = df[~df.index.duplicated(keep="first")]


Duplicate rows found: 0
Duplicate timestamps found: 0


In [7]:
# Missing value handling:
# - Numeric columns     -> column mean
# - Non-numeric columns -> column mode (most frequent value)
missing_before = df.isna().sum().sum()
print("\nMissing before fill:", int(missing_before))

num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns

# Fill numeric NaNs with mean
for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].mean())

# Fill categorical NaNs with mode
for c in cat_cols:
    if df[c].isna().any():
        modes = df[c].mode(dropna=True)
        if modes.empty:
            # A column with no non-missing values has no mode; indexing the
            # empty result with .iloc[0] would raise IndexError. Leave the
            # column as-is and report it instead of crashing the cell.
            print(f"Column '{c}' has no non-missing values; cannot fill with mode.")
        else:
            df[c] = df[c].fillna(modes.iloc[0])


Missing before fill: 0


In [9]:
# Gap check: build the full grid of expected timestamps between the first and
# last observation, then list the ones that are absent from the index.
# If gaps exist, document and decide interpolation vs removal.
first_ts, last_ts = df.index.min(), df.index.max()
full_index = pd.date_range(start=first_ts, end=last_ts, freq=EXPECTED_FREQ)
missing_timestamps = full_index.difference(df.index)

print("\nMissing timestamps (gaps):", len(missing_timestamps))

# Final check: the total NaN count over the whole frame must be zero.
missing_after = int(df.isna().to_numpy().sum())
print("\nMissing after fill:", missing_after)

assert missing_after == 0, "There are still missing values after cleaning!"

print("\nCleaning complete. Dataset is time-indexed and contains no missing values.")


Missing timestamps (gaps): 0

Missing after fill: 0

Cleaning complete. Dataset is time-indexed and contains no missing values.
