# Extensive EDA — Data Cleaning Only (No Graphs)
This notebook performs deep exploratory data analysis focusing strictly on **data cleaning**, including:
- Missing value detection & handling
- Duplicate checking
- Outlier identification (no graphs)
- Data type correction
- String/category normalization
- Feature consistency checks
- Basic descriptive stats (non-visual)


In [None]:
import pandas as pd
import numpy as np

# Read the raw dataset; swap the placeholder path for your own CSV file.
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')
# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

## 1. Basic Information & Structure

In [None]:
# Structural overview: prints column names, non-null counts and dtypes.
df.info()
# Summary statistics; include='all' adds non-numeric columns to the
# default numeric-only summary.
df.describe(include='all')

## 2. Missing Value Analysis

In [None]:
# Number of missing cells per column, largest first.
missing_values = df.isna().sum().sort_values(ascending=False)
# The same measure expressed as a percentage of rows, largest first.
percent_missing = df.isna().mean().mul(100).sort_values(ascending=False)
# Side-by-side summary table with one row per column of df.
pd.concat(
    objs=[missing_values, percent_missing],
    axis='columns',
    keys=['Missing Count', '% Missing'],
)

In [None]:
# Example missing value handling.
# Work on a copy so the raw frame `df` stays untouched.
df_cleaned = df.copy()

# Numeric columns - fill with the per-column median.
# (A column that is entirely NaN has a NaN median and is left as is.)
num_cols = df_cleaned.select_dtypes(include=[np.number]).columns
df_cleaned[num_cols] = df_cleaned[num_cols].fillna(df_cleaned[num_cols].median())

# Categorical columns - fill with the mode (most frequent value).
cat_cols = df_cleaned.select_dtypes(include=['object']).columns
for col in cat_cols:
    modes = df_cleaned[col].mode()
    # mode() is empty when every value in the column is missing; indexing it
    # with [0] would raise, so such columns are skipped instead.
    if modes.empty:
        continue
    df_cleaned[col] = df_cleaned[col].fillna(modes.iloc[0])


## 3. Duplicate Detection & Removal

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
duplicates = df_cleaned.duplicated(keep='first').sum()
print('Total Duplicates:', duplicates)

# Keep only the first occurrence of each repeated row.
df_cleaned = df_cleaned.drop_duplicates(keep='first')

## 4. Outlier Detection (Non-Graphical) — IQR Method

In [None]:
def detect_outliers_iqr(series, k=1.5):
    """Count the values of ``series`` lying outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` where Q1/Q3 are the
    25th/75th percentiles of ``series`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to inspect. Missing values never compare as outside
        the fences, so they are not counted.
    k : float, default 1.5
        Fence multiplier; larger values flag fewer points.

    Returns
    -------
    int
        Number of values strictly below the lower fence or strictly above
        the upper fence.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    # Convert the numpy integer to a plain int so the result displays cleanly.
    return int(((series < lower) | (series > upper)).sum())

# Outlier count per numeric column, keyed by column name.
outliers = {
    name: detect_outliers_iqr(column)
    for name, column in df_cleaned[num_cols].items()
}
outliers

## 5. Data Type Corrections

In [None]:
# Inspect the current dtype of every column before converting any of them.
df_cleaned.dtypes

In [None]:
# Example conversion: parse every column whose name contains "date".
for col in df_cleaned.columns:
    # Column labels are not guaranteed to be strings (e.g. integer headers),
    # so normalise the label before the substring test.
    if 'date' not in str(col).lower():
        continue
    parsed = pd.to_datetime(df_cleaned[col], errors='coerce')
    # errors='coerce' turns unparseable entries into NaT. Report how many
    # previously non-missing values were lost so the coercion is not silent
    # (a name match such as "candidate" or "update" would wipe the column).
    lost = int(parsed.isna().sum() - df_cleaned[col].isna().sum())
    if lost:
        print(f"{col!r}: {lost} value(s) could not be parsed as dates and were set to NaT")
    df_cleaned[col] = parsed

## 6. Category/Label Normalization

In [None]:
# Re-derive the text columns from the current dtypes. The earlier `cat_cols`
# was computed before the date conversion, so it can still list columns that
# are now datetime64, and calling .str on those raises AttributeError.
cat_cols = df_cleaned.select_dtypes(include=['object']).columns
for col in cat_cols:
    # Trim surrounding whitespace and lower-case so that variants such as
    # " Male" and "male" collapse into one label.
    df_cleaned[col] = df_cleaned[col].str.strip().str.lower()

## 7. Final Cleaned Data Preview

In [None]:
# Show the first five rows of the cleaned frame as a quick sanity check.
df_cleaned.head()

## 8. Replacing Values

In [None]:
# Replace impossible (negative) ages with the median of the valid ages only;
# a median taken over the whole column would be skewed by the bad values.
valid_age_median = df.loc[df['age'] >= 0, 'age'].median()
df['age'] = df['age'].mask(df['age'] < 0, valid_age_median)

# Fix a known misspelling in the 'gender' column (other columns are untouched).
df = df.replace({'gender': {'malee': 'male'}})
# NOTE(review): these corrections are applied to `df`, while `df_cleaned` was
# copied from `df` earlier and is what the SMOTE step reads - confirm whether
# this step should run before the copy or target `df_cleaned` instead.

## 9. Class Imbalance

In [None]:
from imblearn.over_sampling import SMOTE

# Split the cleaned frame into the label column and the feature matrix.
y = df_cleaned['target']
X = df_cleaned.drop(columns='target')

# Oversample with SMOTE; the fixed random_state makes re-runs repeatable.
# NOTE(review): SMOTE presumably needs an all-numeric X - confirm no
# object/datetime columns remain in df_cleaned at this point.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

## 10. Encodings

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label encoding for certain categories.
'''
Converts each category into a unique integer.
Good for ordinal variables or binary categories.
'''
# NOTE(review): the integer codes presumably follow the sorted label order,
# not a domain-defined ranking - confirm before using this for ordinal data.
le = LabelEncoder().fit(df['color'])
df['color_label'] = le.transform(df['color'])
print(df)

In [None]:
'''
Creates dummy variables for each category.
Avoids introducing order when none exists.
'''
# Expand 'color' into indicator columns; drop_first=True omits the first
# level so the remaining columns are not perfectly collinear.
df_onehot = pd.get_dummies(data=df, columns=['color'], drop_first=True)
print(df_onehot)

In [None]:
# Ordinal Encoding, when order matters:
from sklearn.preprocessing import OrdinalEncoder

# Toy frame with a category that has a natural order.
df = pd.DataFrame({'size': ['small', 'medium', 'large', 'medium', 'small']})

# The explicit category list fixes the ranking: small=0, medium=1, large=2.
encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']])
encoder.fit(df[['size']])
df['size_encoded'] = encoder.transform(df[['size']])
print(df)

In [None]:
# Frequency encoding: replace each category by its share of the rows.
df = pd.DataFrame({'city': ['NY', 'LA', 'NY', 'SF', 'LA', 'NY']})
# Relative frequency of every city (count divided by the number of rows).
freq = df['city'].value_counts().div(len(df))
# Look up each row's city in the frequency table.
df = df.assign(city_freq=df['city'].map(freq))
print(df)

## 11. Feature Scaling (StandardScaler and MinMaxScaler)


In [None]:
# StandardScaler was used without ever being imported, which raises
# NameError on a fresh kernel; import it here like the other scaler cell does.
from sklearn.preprocessing import StandardScaler

# Select the numeric columns of the current frame.
num_cols = df.select_dtypes(include='number').columns

# Standardise the numeric columns on a copy so `df` itself is left unchanged.
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[num_cols] = scaler.fit_transform(df[num_cols])

df_scaled.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the numeric columns, then write the rescaled
# values into a copy so `df` itself is left unchanged.
scaler = MinMaxScaler().fit(df[num_cols])
df_scaled = df.copy()
df_scaled[num_cols] = scaler.transform(df[num_cols])

df_scaled.head(n=5)