# 🔄 Data Transformation with Pandas
This notebook covers a range of data transformation techniques including missing value handling, outlier detection, type conversion, datetime normalization, and feature scaling.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy dataset that deliberately contains the problems handled further down:
# missing entries (NaN / None), implausible values (age 1000, negative income)
# and a signup date that cannot be parsed.
data = dict(
    age=[25, 30, np.nan, 40, 1000, 35],
    income=[50000, 60000, 70000, np.nan, 90000, -100000],
    gender=['Male', 'Female', 'Female', 'Male', 'Other', None],
    signup_date=['2022-01-01', '2022-02-15', 'bad_date', '2022-04-10', '2022-05-05', '2022-06-01'],
    country=['UK', 'US', 'UK', 'US', 'FR', 'DE'],
)
df = pd.DataFrame(data=data)
df

## 🕳 Handling Missing Values

In [None]:
# Option 1 - discard every row that has at least one missing entry.
# The result lives in its own frame; `df` keeps all of its rows.
df_drop = df.dropna(how='any')

# Option 2 - impute instead of dropping: the numeric gap is filled with the
# column mean, the categorical gap with the most frequent label.
df['age_fill_mean'] = df['age'].fillna(value=df['age'].mean())
df['gender_fill_mode'] = df['gender'].fillna(value=df['gender'].mode().iloc[0])
df

## 🚨 Outlier Detection and Treatment

In [None]:
# --- Z-score approach ---
# Only rows where both age and income are present are scored;
# an absolute z-score above 3 marks the value as an outlier.
from scipy import stats
complete_rows = df[['age', 'income']].dropna()
z_scores = np.abs(stats.zscore(complete_rows))
outliers_z = z_scores > 3

# --- IQR approach on income ---
# Anything further than 1.5 * IQR outside the quartiles is flagged, and the
# capped column pulls those values back onto the nearest fence.
Q1 = df['income'].quantile(0.25)
Q3 = df['income'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers_iqr = df['income'].lt(lower_fence) | df['income'].gt(upper_fence)
df['income_capped'] = df['income'].clip(lower=lower_fence, upper=upper_fence)
df

## 🔄 Data Type Conversion

In [None]:
# Parse the signup strings into timestamps; errors='coerce' turns anything
# that cannot be parsed into NaT instead of raising.
parsed_signup = pd.to_datetime(df['signup_date'], errors='coerce')
df['signup_date'] = parsed_signup

# Store a categorical version of gender next to the original column
df['gender_cat'] = df['gender'].astype('category')
df.dtypes

## 🕓 Datetime Normalization

In [None]:
# Pull calendar parts out of the signup timestamp through the .dt accessor
# (this requires signup_date to already be datetime-like).
signup = df['signup_date']
df['signup_month'] = signup.dt.month
df['signup_weekday'] = signup.dt.day_name()
df.loc[:, ['signup_date', 'signup_month', 'signup_weekday']]

## 📏 Feature Scaling

In [None]:
# All three scalers work on the capped income column.
capped_income = df['income_capped']

# Standardization (Z-score): centre on the mean, divide by the standard deviation
df['income_z'] = capped_income.sub(capped_income.mean()).div(capped_income.std())

# Min-Max Normalization: shift by the minimum, divide by the value range
income_min = capped_income.min()
income_range = capped_income.max() - income_min
df['income_minmax'] = capped_income.sub(income_min).div(income_range)

# Robust Scaling: centre on the median, divide by the interquartile range
income_iqr = capped_income.quantile(0.75) - capped_income.quantile(0.25)
df['income_robust'] = capped_income.sub(capped_income.median()).div(income_iqr)

df[['income', 'income_capped', 'income_z', 'income_minmax', 'income_robust']]

## 🧪 Feature Engineering with Scikit-learn

In [None]:
# Prepare data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Sample DataFrame for sklearn
# Work on a copy so the imputations below never write back into `df`.
X = df[['age', 'income_capped', 'country']].copy()
X['age'] = X['age'].fillna(X['age'].mean())
X['income_capped'] = X['income_capped'].fillna(X['income_capped'].mean())
X['country'] = X['country'].fillna('Unknown')

# Synthetic binary target (one 0/1 label per row of X).
# Drawn from a seeded generator so that Restart & Run All always yields the
# same labels; the unseeded global np.random state gave a different target,
# and therefore a different fitted model, on every run.
rng = np.random.default_rng(42)
y = rng.integers(0, 2, size=X.shape[0])

In [None]:
# Column groups routed to each preprocessing branch
numeric_features = ['age', 'income_capped']
categorical_features = ['country']

# Numeric branch: standard scaling only
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: one-hot encoding; handle_unknown='ignore' means a
# category not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Send each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Chain the preprocessing step with the estimator into a single pipeline.
# NOTE(review): the step is labelled 'classifier' and the target is 0/1, yet
# the estimator is LinearRegression - confirm whether a classifier was intended.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),
])

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and estimator together on the training portion only
clf.fit(X_train, y_train)
print('Model training completed.')

In [None]:
# Transform data only (without model)
# `preprocessor` is the very object that sits inside `clf`: a Pipeline without
# caching does not copy its steps. Calling fit_transform on it directly would
# overwrite the state learned from X_train with state learned from all of X,
# which can change the one-hot width and leave `clf` inconsistent with its
# fitted estimator. Fit an unfitted clone instead so `clf` stays untouched.
from sklearn.base import clone

preprocessor_full = clone(preprocessor)
X_transformed = preprocessor_full.fit_transform(X)
print('Transformed feature matrix shape:', X_transformed.shape)

In [None]:
# Polynomial Features example
# degree=2 without the bias column: a single input column x expands to [x, x^2].
# Any value still missing in the imputed age column is replaced by 0 first.
age_input = df[['age_fill_mean']].fillna(0)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(age_input)
print('Polynomial features shape:', poly_features.shape)