In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the raw telemetry CSV. The path is a raw string so the Windows
# backslashes are taken literally instead of being parsed as (invalid)
# escape sequences such as "\E", "\D" and "\i".
df = pd.read_csv(r"D:\EDGE AI\Edge-AI-1\Data\iot_telemetry_data.csv")

# Report the number of missing values in every column
print("Missing :\n", df.isnull().sum())

Missing :
 ts          0
device      0
co          0
humidity    0
light       0
lpg         0
motion      0
smoke       0
temp        0
dtype: int64


In [6]:
# Sensor columns analysed in the rest of the notebook
features = ["temp", "humidity", "co", "lpg", "smoke"]

# Parse "ts" into datetimes and promote it to the row index.
# NOTE(review): without a `unit=` argument, numeric input is read as
# nanoseconds since the epoch -- confirm how "ts" is stored in the CSV.
df["ts"] = pd.to_datetime(df["ts"])
df = df.set_index("ts")

# Feature-only view of the frame
data = df[features]

# Checking for outliers
def check_outliers(df, features, iqr_multiplier=1.5):
    """Count IQR-rule outliers in each of the given columns.

    A value is an outlier when it lies below ``Q1 - iqr_multiplier * IQR``
    or above ``Q3 + iqr_multiplier * IQR``, where Q1/Q3 are the column's
    25th/75th percentiles and ``IQR = Q3 - Q1``. NaN entries never compare
    as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect; it is not modified.
    features : iterable of str
        Names of the columns to check.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR.

    Returns
    -------
    dict
        Maps each feature name to its outlier count (a plain ``int``).
    """
    outlier_counts = {}
    for feature in features:
        column = df[feature]
        q1, q3 = column.quantile(0.25), column.quantile(0.75)
        spread = iqr_multiplier * (q3 - q1)
        is_outlier = (column < q1 - spread) | (column > q3 + spread)
        # Count straight from the boolean mask; no need to materialise a
        # filtered copy of the whole frame just to take its length.
        outlier_counts[feature] = int(is_outlier.sum())
    return outlier_counts

# Count the IQR outliers in every selected feature column and print the
# resulting {feature: count} dictionary.
outlier_counts = check_outliers(df, features)
print("Outliers:\n", outlier_counts)

Outliers:
 {'temp': 8616, 'humidity': 44, 'co': 10480, 'lpg': 12624, 'smoke': 12271}


In [7]:
from sklearn.impute import KNNImputer

# Marking outliers as NaN for imputation
def mark_outliers(df, features, iqr_multiplier=1.5):
    """Replace IQR-rule outliers in the given columns with NaN.

    A value is an outlier when it lies below ``Q1 - iqr_multiplier * IQR``
    or above ``Q3 + iqr_multiplier * IQR``, where Q1/Q3 are the column's
    25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place: each listed column is
        reassigned on this same object.
    features : iterable of str
        Names of the columns to process.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, for call chaining.
    """
    for feature in features:
        column = df[feature]
        q1, q3 = column.quantile(0.25), column.quantile(0.75)
        spread = iqr_multiplier * (q3 - q1)
        is_outlier = (column < q1 - spread) | (column > q3 + spread)
        # where() keeps entries whose condition is True and fills the rest
        # with NaN, upcasting integer columns to float when needed. Writing
        # NaN into an integer column through .loc is an incompatible-dtype
        # assignment, which recent pandas versions deprecate.
        df[feature] = column.where(~is_outlier)
    return df

# Replace the IQR outliers in the feature columns with NaN so that they can
# be filled in by the imputer below.
df = mark_outliers(df, features)

# Show how many NaN values each column holds after marking the outliers
print("Outliers replaced with NaN:\n", df.isnull().sum())
print("")

# KNN imputation: fill every NaN in the feature columns using the 5 nearest
# rows (n_neighbors=5).
# NOTE(review): the features are handed to the imputer unscaled here; if
# their value ranges differ a lot, the neighbour distances will be dominated
# by the widest-ranged columns -- confirm whether scaling first is wanted.
imputer = KNNImputer(n_neighbors=5)
df[features] = imputer.fit_transform(df[features])

# Confirm that no NaN values remain after the imputation
print("Outliers after KNN Imputation:\n", df.isnull().sum())

# Persist the cleaned frame; index=True writes the index as the first column
df.to_csv("cleaned_data.csv", index=True)


Outliers replaced with NaN:
 device          0
co          10480
humidity       44
light           0
lpg         12624
motion          0
smoke       12271
temp         8616
dtype: int64

Outliers after KNN Imputation:
 device      0
co          0
humidity    0
light       0
lpg         0
motion      0
smoke       0
temp        0
dtype: int64
