In [None]:
# Explore more methods for handling missing data, including imputation strategies. 
import pandas as pd
from sklearn.impute import SimpleImputer

# Create a DataFrame with a missing value in every column.
data = {'Name': ['Alice', 'Bob', None], 'Age': [25, None, 35], 'Income': [50000, None, 70000]}
df = pd.DataFrame(data)

# Strategy: fill missing values with the column mean for numerical columns.
# Both 'Age' and 'Income' contain a gap, so impute them together; assigning to
# a list of columns keeps the 2-D output of fit_transform aligned column-wise.
numeric_cols = ['Age', 'Income']
mean_imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = mean_imputer.fit_transform(df[numeric_cols])

# Fill missing categorical data with the most frequent value.
# SimpleImputer looks for np.nan by default and does not treat a Python None
# in an object column as missing, so normalise the missing marker to NaN first.
categorical_cols = ['Name']
names = df[categorical_cols].astype(object)
names = names.where(names.notna(), float('nan'))
# NOTE: when several values are equally frequent (as with 'Alice' and 'Bob'
# here), the 'most_frequent' strategy uses the smallest of them.
mode_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = mode_imputer.fit_transform(names)

print("Data after Imputation:\n", df)

In [None]:
# Detect outliers using the IQR method and handle them by removing or capping the values.

import numpy as np

# Sample incomes; the 2,000,000 entry is the deliberate outlier.
data = {'Income': [50000, 60000, 55000, 2000000, 58000, 59000, 54000]}
df = pd.DataFrame(data)

# First and third quartiles in a single call, then their spread (the IQR).
Q1, Q3 = df['Income'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# 1.5 * IQR rule: values further than that from a quartile count as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose income lies inside the bounds (inclusive on both ends).
within_bounds = df['Income'].between(lower_bound, upper_bound)
df_no_outliers = df[within_bounds]

print("Data without outliers:\n", df_no_outliers)

In [None]:
# Apply a transformation, such as a log transform or Box-Cox, to reduce skew in the data.
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Create a dataset with right-skewed data: one salary is far above the rest.
data = {'Salary': [1000, 1500, 2000, 2500, 10000]}  # Skewed data
df = pd.DataFrame(data)

# Log transformation to reduce skewness. np.log1p computes log(1 + x), so it
# stays defined for x == 0. validate=True makes the transformer check its input
# and pass np.log1p a 2-D NumPy array.
log_transformer = FunctionTransformer(np.log1p, validate=True)

# Fit before transforming: calling transform() on an unfitted transformer with
# DataFrame input triggers a feature-name warning. The result has shape
# (n_samples, 1), so flatten it before storing it as a single column.
df['Salary_log'] = log_transformer.fit_transform(df[['Salary']]).ravel()

print("Log Transformed Data:\n", df)

In [None]:
# Create new features from existing data to improve model performance.

def age_in_years(date_of_birth, as_of):
    """Return completed years of age for each date of birth.

    Parameters
    ----------
    date_of_birth : pandas.Series of datetime64
        Birth dates.
    as_of : pandas.Timestamp
        Reference date; only its year, month and day are used.

    Returns
    -------
    pandas.Series
        ``as_of.year - birth year``, minus one where the birthday has not yet
        been reached in ``as_of``'s year.
    """
    # True where the (month, day) of birth is still ahead of the reference date.
    birthday_pending = (date_of_birth.dt.month > as_of.month) | (
        (date_of_birth.dt.month == as_of.month) & (date_of_birth.dt.day > as_of.day)
    )
    return as_of.year - date_of_birth.dt.year - birthday_pending.astype(int)


# Example dataset with date column
data = {'Name': ['Alice', 'Bob', 'Charlie'], 'Date_of_Birth': ['1990-01-01', '1985-05-15', '2000-07-20']}
df = pd.DataFrame(data)

# Convert Date_of_Birth from strings to datetime so the .dt accessor is available
df['Date_of_Birth'] = pd.to_datetime(df['Date_of_Birth'])

# Create a new feature: Age in completed years as of today. Subtracting the
# years alone would overstate the age of anyone whose birthday is still to
# come this year.
df['Age'] = age_in_years(df['Date_of_Birth'], pd.Timestamp.now())

print("Data with New Feature (Age):\n", df)