In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that
# later cells can read from and write to paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Read the raw power-consumption export from Drive.
# Fields are separated by ';' and a literal '?' is treated as a missing value (NaN).
df = pd.read_csv(
    '/content/drive/MyDrive/Preprocessing_68/household_power_consumption.txt',
    sep=';',
    na_values=['?'],
)

In [None]:
# Build a single `datetime` column from the separate `Date` and `Time` text
# columns (day/month/year and 24-hour HH:MM:SS), then discard the two sources.
#
# The guard keeps this cell re-runnable: the first run consumes `Date`/`Time`
# and rebinds `df`, so an unguarded second run would fail with KeyError: 'Date'.
# `assign(...).drop(...)` builds a new frame instead of mutating the loaded one
# before rebinding the name.
if {'Date', 'Time'}.issubset(df.columns):
    df = (
        df
        .assign(datetime=pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H:%M:%S'))
        .drop(columns=['Date', 'Time'])
    )
elif 'datetime' not in df.columns:
    # Neither the raw columns nor the derived one are present: wrong input frame.
    raise KeyError("expected 'Date' and 'Time' columns (or an existing 'datetime' column) in df")

# Shape is reported after the conversion (two columns removed, one added).
print(f"Original dataset shape: {df.shape}")
print(df.head())

Original dataset shape: (2075259, 8)
   Global_active_power  Global_reactive_power  Voltage  Global_intensity  \
0                4.216                  0.418   234.84              18.4   
1                5.360                  0.436   233.63              23.0   
2                5.374                  0.498   233.29              23.0   
3                5.388                  0.502   233.74              23.0   
4                3.666                  0.528   235.68              15.8   

   Sub_metering_1  Sub_metering_2  Sub_metering_3            datetime  
0             0.0             1.0            17.0 2006-12-16 17:24:00  
1             0.0             1.0            16.0 2006-12-16 17:25:00  
2             0.0             2.0            17.0 2006-12-16 17:26:00  
3             0.0             1.0            17.0 2006-12-16 17:27:00  
4             0.0             1.0            17.0 2006-12-16 17:28:00  


In [None]:
# Summarise missingness per column: the absolute NaN count and that count
# expressed as a percentage of all rows in `df`.
missing_counts = df.isnull().sum()
missing_percent = missing_counts.div(len(df)).mul(100)

# Combine both measures into one table; percentages are rounded to 2 decimals for display only.
missing_summary = pd.DataFrame({"count": missing_counts, "percent": missing_percent.round(2)})
print("Missing values per column:")
print(missing_summary)


Missing values per column:
                       count  percent
Global_active_power    25979     1.25
Global_reactive_power  25979     1.25
Voltage                25979     1.25
Global_intensity       25979     1.25
Sub_metering_1         25979     1.25
Sub_metering_2         25979     1.25
Sub_metering_3         25979     1.25
datetime                   0     0.00


In [None]:
# Remove any row in which every column is NaN, then report the resulting shape.
# NOTE(review): the missing-value summary shows `datetime` with 0 missing values,
# so no row can be entirely NaN and this step leaves the shape unchanged.
# Confirm whether the check was meant to cover only the measurement columns.
df = df.dropna(axis='index', how='all')
print("After dropping fully empty rows:", df.shape)

After dropping fully empty rows: (2075259, 8)


In [None]:
# Step 1: carry the last observation forward over at most 5 consecutive missing
# rows, then index the frame by `datetime` so the next step can interpolate on time.
# NOTE(review): per the pandas docs, `limit` caps the fill length rather than
# skipping long gaps, so the first 5 rows of a longer gap are filled too.
# Confirm that is intended.
df_ffill = df.ffill(limit=5).set_index('datetime')

# Step 2: fill whatever is still missing by interpolating along the datetime index.
df_clean = df_ffill.interpolate(method='time')

# Sanity check: per-column count of NaNs left after both steps.
print("Remaining NaN values after cleaning:", df_clean.isna().sum(), sep="\n")

Remaining NaN values after cleaning:
Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64


In [None]:
# Persist the imputed frame to Drive as CSV.
# index=True (spelled out here) writes the frame's index as the first column.
output_file = '/content/drive/MyDrive/Preprocessing_68/processed_after_impute.csv'
df_clean.to_csv(output_file, index=True)
print("Saved cleaned dataset to:", output_file)

Saved cleaned dataset to: /content/drive/MyDrive/Preprocessing_68/processed_after_impute.csv
