In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Sanity message: reaching this line means every import above completed without raising.
print("Libraries loaded successfully!")


Libraries loaded successfully!


In [4]:
# Build a small synthetic dataset containing a missing value and an extreme value
# in the numeric feature, plus a missing label in the categorical column.
np.random.seed(0)  # Fixed seed so every run produces the same values

# The three random draws must stay in this order: they share the global
# NumPy random state, so reordering them would change the generated data.
normal_draws = np.random.normal(100, 10, 100)    # 100 samples, mean 100, std 10
integer_draws = np.random.randint(0, 100, 102)   # 102 integers in [0, 100)
target_draws = np.random.choice([0, 1], 102)     # 102 binary labels

category_cycle = ['A', 'B', 'C', 'D'] * 25       # 100 labels cycling through A-D

dummy_data = {
    'Feature1': normal_draws.tolist() + [np.nan, 200],  # appended: one gap and one outlier
    'Feature2': integer_draws.tolist(),
    'Category': category_cycle + [np.nan, 'A'],         # appended: one gap and one extra 'A'
    'Target': target_draws.tolist(),
}

# Every column holds 102 entries, so the dict maps directly onto a DataFrame
df_dummy = pd.DataFrame(dummy_data)

# Preview the first 5 rows
print("First 5 rows of the dummy dataset:", df_dummy.head(), sep="\n")


First 5 rows of the dummy dataset:
     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [6]:
# Loading step of the pipeline; the data is already held in memory
def load_data(df):
    """Return the object that was passed in, unchanged.

    No copy is made: the caller receives the very same object back.

    Args:
        df: The in-memory dataset to hand to the next step.

    Returns:
        The same object as ``df``.
    """
    loaded = df
    return loaded

# Function to handle missing values (numeric -> column mean, other -> column mode)
def handle_missing_values(df):
    """Fill missing values: numeric columns with their mean, the rest with their mode.

    The means are computed from the numeric columns only. Calling ``df.mean()``
    on the whole frame raises ``TypeError`` in pandas >= 2.0 as soon as the
    frame contains a string column.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same rows. Numeric columns come first,
        followed by the non-numeric columns (each group keeps its own order).
        A non-numeric column with no observed value has no mode and is
        returned unfilled.
    """
    numeric = df.select_dtypes(include=[np.number])
    categorical = df.select_dtypes(exclude=[np.number])

    # Per-column mean of the numeric columns only (NaN entries are skipped)
    numeric = numeric.fillna(numeric.mean())

    # Most frequent value per non-numeric column; an all-missing column has
    # an empty mode, so it is left out of the fill mapping.
    fill_values = {}
    for name in categorical.columns:
        modes = categorical[name].mode()
        if not modes.empty:
            fill_values[name] = modes.iloc[0]
    if fill_values:
        categorical = categorical.fillna(fill_values)

    return pd.concat([numeric, categorical], axis=1)

# Function to remove outliers using Z-score
def remove_outliers(df, threshold=3):
    """Drop rows in which any numeric column has an absolute Z-score >= threshold.

    Z-scores use the population standard deviation (ddof=0) and skip missing
    values when computing each column's mean and standard deviation. A Z-score
    that is undefined (the cell is missing, or the column has zero spread) is
    treated as "not an outlier". Without that rule a single NaN, or a constant
    column, turns every comparison False and silently removes all rows.

    Args:
        df: Input DataFrame. It is not modified.
        threshold: Absolute Z-score at or above which a value counts as an
            outlier. Defaults to 3.

    Returns:
        The rows of ``df`` without outliers, all columns included, with the
        original index labels kept.
    """
    numeric = df.select_dtypes(include=[np.number])

    # pandas mean/std skip NaN; ddof=0 gives the population standard deviation
    centered = numeric - numeric.mean()
    z_scores = (centered / numeric.std(ddof=0)).abs()

    # NaN >= threshold evaluates to False, so undefined scores never flag a row
    is_outlier = (z_scores >= threshold).any(axis=1)
    return df[~is_outlier]

# Function to scale numerical data (Min-Max Scaling)
def scale_data(df):
    """Scale the numeric columns of ``df`` with scikit-learn's MinMaxScaler.

    Only the numeric columns are scaled and returned; non-numeric columns are
    not part of the result. The result keeps the row index of ``df``, so it
    still lines up with the input when rows were filtered out earlier and the
    index has gaps.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame holding the scaled numeric columns, with the same
        column names and the same index as the numeric part of ``df``.
    """
    numeric = df.select_dtypes(include=[np.number])  # select once, reuse below

    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(numeric)

    # fit_transform returns a bare array: re-attach the column names and row labels
    return pd.DataFrame(scaled_values, columns=numeric.columns, index=numeric.index)

# Function to encode categorical variables
def encode_categorical(df, columns=None):
    """One-hot encode the given columns with ``pd.get_dummies``.

    Args:
        df: Input DataFrame. It is not modified.
        columns: Column name, or list-like of column names, to encode.
            Defaults to ``['Category']``, the column this function always
            encoded before the parameter existed.

    Returns:
        A new DataFrame in which each encoded column is replaced by one
        indicator column per distinct value, named ``<column>_<value>``.
    """
    if columns is None:
        columns = ['Category']
    elif isinstance(columns, str):
        columns = [columns]  # allow a single column name
    else:
        columns = list(columns)
    return pd.get_dummies(df, columns=columns)  # One-hot encoding

# Final pipeline step: persist the preprocessed data to disk
def save_data(df, filename):
    """Write ``df`` to ``filename`` as CSV and print a confirmation line.

    The index is not written to the file.

    Args:
        df: The DataFrame to store.
        filename: Destination passed straight to ``DataFrame.to_csv``.

    Returns:
        None.
    """
    df.to_csv(path_or_buf=filename, index=False)
    confirmation = f"Data saved as {filename}"
    print(confirmation)


In [8]:
# Function to handle missing values properly
# (this redefinition replaces the earlier handle_missing_values in this notebook)
def handle_missing_values(df):
    """Fill missing values column by column.

    Numeric columns are filled with their own mean; all other columns are
    filled with their most frequent value (mode). A non-numeric column with no
    observed value has no mode and is returned unfilled rather than raising.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same rows. Numeric columns come first,
        followed by the non-numeric columns (each group keeps its own order).
    """
    def _fill_with_mean(column):
        # Series.mean skips NaN entries
        return column.fillna(column.mean())

    def _fill_with_mode(column):
        modes = column.mode()
        if modes.empty:
            # Nothing observed in this column, so there is no value to fill with
            return column
        return column.fillna(modes.iloc[0])

    # Select only numeric columns and fill missing values with their mean
    df_numeric = df.select_dtypes(include=[np.number]).apply(_fill_with_mean)

    # Select the remaining columns and fill missing values with the most frequent value
    df_categorical = df.select_dtypes(exclude=[np.number]).apply(_fill_with_mode)

    # Merge the cleaned numeric and categorical data back together
    df_cleaned = pd.concat([df_numeric, df_categorical], axis=1)

    return df_cleaned
