# Data cleaning 

In [9]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('../assets/data/store.csv')

# Separate numeric and non-numeric columns
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Handle missing values for numeric columns
for col in numeric_cols:
    data[col].fillna(data[col].median(), inplace=True)

# Handle missing values for categorical columns
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Remove outliers using IQR
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Replace outliers with NaN to match the original length
        df[col] = df[col].where((df[col] >= lower_bound) & (df[col] <= upper_bound), np.nan)
    return df

# Apply outlier removal
data = remove_outliers(data, numeric_cols)

# Impute missing values (including outliers treated as NaN) with median
for col in numeric_cols:
    data[col].fillna(data[col].median(), inplace=True)

# Save the cleaned data
data.to_csv('../assets/data/cleaned_data.csv', index=False)

print("Data cleaning complete. Cleaned data saved as 'cleaned_data.csv'")


Data cleaning complete. Cleaned data saved as 'cleaned_data.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin