# In [None]:
import pandas as pd

# Read the employee records from disk (point this at your own file if it lives elsewhere).
df = pd.read_csv('employees.csv')

# -------------------------------
# 1. Dropping Missing Data
# -------------------------------
# Show the raw table first, then a per-column tally of missing cells,
# so the effect of dropping rows can be compared against it.
print("Original DataFrame:\n", df)
missing_per_column = df.isna().sum()
print("\nMissing values per column:\n", missing_per_column)

# Keep only the rows in which every column has a value.
# dropna returns a new frame; `df` itself is left untouched.
df_dropped = df.dropna(axis=0, how='any')
print("\nDataFrame after dropping rows with missing data:\n", df_dropped)

# Optional: persist the cleaned table (without the index column).
df_dropped.to_csv('employees_dropped.csv', index=False)

# -------------------------------
# 2. Imputation using Mean
# -------------------------------
# Replace missing numeric values with the mean of their own column.
# Non-numeric columns are left exactly as they are.
numeric_cols = df.select_dtypes(include='number').columns

# One mean per numeric column, indexed by column name.
column_means = df[numeric_cols].mean()

# DataFrame.fillna with a Series fills each column using the entry whose label
# matches the column name and returns a NEW frame, so `df` is not modified.
# This replaces the former `df[col].fillna(..., inplace=True)` pattern: that is a
# chained in-place call on a column selection, which pandas 2.x flags with a
# FutureWarning and which does not update the parent frame under Copy-on-Write.
df_mean_imputed = df.fillna(column_means)

print("\nDataFrame after mean imputation:\n", df_mean_imputed)

# Optional: Save to CSV
df_mean_imputed.to_csv('employees_mean_imputed.csv', index=False)

# -------------------------------
# 3. Imputation using Median and Mode
# -------------------------------
# Work on a copy so the original `df` stays available for the other strategies.
df_median_mode_imputed = df.copy()

# Fill numeric columns with median.
# The filled column is assigned back explicitly instead of calling
# `fillna(..., inplace=True)` on `frame[col]`: the chained in-place form triggers
# a FutureWarning in pandas 2.x and has no effect on the frame under Copy-on-Write.
for col in df_median_mode_imputed.select_dtypes(include='number').columns:
    column_median = df_median_mode_imputed[col].median()
    df_median_mode_imputed[col] = df_median_mode_imputed[col].fillna(column_median)

# Fill categorical (object-dtype) columns with mode.
for col in df_median_mode_imputed.select_dtypes(include='object').columns:
    # Compute the mode once; it is empty when the column has no non-missing
    # values, in which case there is nothing sensible to fill with.
    column_modes = df_median_mode_imputed[col].mode()
    if not column_modes.empty:
        # mode() may return several tied values; use the first one.
        df_median_mode_imputed[col] = df_median_mode_imputed[col].fillna(column_modes.iloc[0])

print("\nDataFrame after median (numeric) and mode (categorical) imputation:\n", df_median_mode_imputed)

# Optional: Save to CSV
df_median_mode_imputed.to_csv('employees_median_mode_imputed.csv', index=False)


# In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Re-read the employee records so this cell starts from the data on disk
# (change the path if your file lives elsewhere).
df = pd.read_csv('employees.csv')

# -------------------------------
# 4. ML-based Imputation with SimpleImputer
# -------------------------------
# Impute on a copy; `df` is kept as loaded for the sections below.
df_simple_impute = df.copy()

# Only numeric columns are handed to the imputer.
numeric_cols = df_simple_impute.select_dtypes(include='number').columns
numeric_block = df_simple_impute[numeric_cols]

# Each missing cell is replaced by the mean of its column.
# NOTE(review): with default settings SimpleImputer discards columns that are
# entirely missing, which would make the assignment below fail on a shape
# mismatch -- confirm the dataset has no all-NaN numeric column.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(numeric_block)
df_simple_impute[numeric_cols] = imputed_values

print("Data after SimpleImputer (mean) imputation:\n", df_simple_impute.head())

# -------------------------------
# 5. Imputation using a Regression Model
# -------------------------------
# All changes are made on this copy. Previously the predictions were written
# into `df` itself (and `df` was printed), which left `df_regression` unused and
# leaked regression-imputed values into every later section that copies `df`.
df_regression = df.copy()

# Example: predict missing 'Salary' values from 'Age' and 'Experience'.
target_col = 'Salary'
predictor_cols = ['Age', 'Experience']

# Require every column the model touches, not just the target, so a dataset
# lacking 'Age' or 'Experience' skips this step instead of raising KeyError.
has_required_columns = all(
    name in df_regression.columns for name in (*predictor_cols, target_col)
)

if has_required_columns and df_regression[target_col].isnull().any():
    # Rows are usable (for fitting or for prediction) only when both predictors are present.
    predictors_present = df_regression[predictor_cols].notnull().all(axis=1)
    target_missing = df_regression[target_col].isnull()

    # Fit on rows with a known target; predict for rows where it is missing.
    train_data = df_regression[predictors_present & ~target_missing]
    test_data = df_regression[predictors_present & target_missing]

    # Nothing to do if there is no data to learn from or nothing to fill.
    if not train_data.empty and not test_data.empty:
        model = LinearRegression()
        model.fit(train_data[predictor_cols], train_data[target_col])

        predicted_salaries = model.predict(test_data[predictor_cols])
        # Write predictions back by index label so they land on the matching rows.
        df_regression.loc[test_data.index, target_col] = predicted_salaries

print("Data after Regression-based imputation:\n", df_regression.head())

# -------------------------------
# 6. K-Nearest Neighbors Imputation
# -------------------------------
# Impute on a copy of `df` as it stands at this point.
df_knn = df.copy()

# KNNImputer is given the numeric columns only.
numeric_cols = df_knn.select_dtypes(include='number').columns
numeric_block = df_knn[numeric_cols]

# Imputer configured with three neighbours.
knn_imputer = KNNImputer(n_neighbors=3)
knn_filled = knn_imputer.fit_transform(numeric_block)
df_knn[numeric_cols] = knn_filled

print("Data after KNNImputer imputation:\n", df_knn.head())
