In [3]:
#Extract Date & Time Features

import pandas as pd

# Sample dataset: two weekday transaction timestamps (a Wednesday and a Thursday).
df = pd.DataFrame({
    'TransactionDate': pd.to_datetime(['2025-02-05 14:30:00', '2025-02-06 18:45:00'])
})

# Extract date-related features through the vectorized .dt accessor.
df['DayOfWeek'] = df['TransactionDate'].dt.dayofweek  # Monday = 0 ... Sunday = 6
df['Hour'] = df['TransactionDate'].dt.hour            # hour component of the timestamp
# Weekend flag: 1 for Saturday (5) or Sunday (6), otherwise 0. A vectorized
# comparison is used instead of a per-row .apply(lambda ...).
df['Weekend'] = (df['DayOfWeek'] >= 5).astype(int)

df

Unnamed: 0,TransactionDate,DayOfWeek,Hour,Weekend
0,2025-02-05 14:30:00,2,14,0
1,2025-02-06 18:45:00,3,18,0


In [5]:
# Aggregated features
# Compute the average transaction amount per user.
df_transactions = pd.DataFrame(
    {
        'UserID': [101, 102, 101, 103, 102],
        'TransactionAmount': [500, 300, 700, 1000, 400],
    }
)

# as_index=False keeps UserID as a regular column, so no reset_index() is needed.
df_user_avg = df_transactions.groupby('UserID', as_index=False)['TransactionAmount'].mean()

In [11]:
# Rename the aggregated column so its name states what it holds. The result is
# reassigned rather than using inplace=True, which keeps the step chainable.
df_user_avg = df_user_avg.rename(columns={'TransactionAmount': 'AvgTransactionAmount'})
df_user_avg

Unnamed: 0,UserID,AvgTransactionAmount
0,101,600.0
1,102,350.0
2,103,1000.0


In [23]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Toy frame with a single categorical column to encode.
df = pd.DataFrame(
    {'ProductCategory': ['Electronics', 'Clothing', 'Electronics', 'Electronics', 'Clothing']}
)

# sparse_output=False requests a dense array (use it instead of the older sparse=False).
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(df[['ProductCategory']])

# Wrap the encoded matrix in a DataFrame, with one named column per category.
df_encoded = pd.DataFrame(
    data=encoded_features,
    columns=encoder.get_feature_names_out(),
)
df_encoded


Unnamed: 0,ProductCategory_Clothing,ProductCategory_Electronics
0,0.0,1.0
1,1.0,0.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0


In [27]:
#Log Transformation for Skewed Data
#If TransactionAmount has outliers, apply log transformation:

import numpy as np
# Sample amounts spanning several orders of magnitude.
df = pd.DataFrame({'TransactionAmount': [100, 200, 5000, 10000, 20000]})
# log1p computes log(1 + x), which avoids log(0) issues for zero amounts.
df = df.assign(LogTransactionAmount=np.log1p(df['TransactionAmount']))
df

# Why? The log compresses large values, reducing skewness and the impact of outliers.

Unnamed: 0,TransactionAmount,LogTransactionAmount
0,100,4.615121
1,200,5.303305
2,5000,8.517393
3,10000,9.21044
4,20000,9.903538


In [43]:
#Feature Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Min-max normalization: rescales the amounts linearly onto the [0, 1] range.
# fit_transform returns a 2-D single-column array; ravel() flattens it before assignment.
scaler = MinMaxScaler()
df['NormalizedTransactionAmount'] = scaler.fit_transform(df[['TransactionAmount']]).ravel()

# Standardization: centers the amounts on their mean and scales to unit variance.
standard_scaler = StandardScaler()
df['StandardizedTransactionAmount'] = standard_scaler.fit_transform(df[['TransactionAmount']]).ravel()

df

# Why? Puts all features on a comparable scale, preventing bias in ML models.

Unnamed: 0,TransactionAmount,LogTransactionAmount,NormalizedTransactionAmount,StandardizedTransactionAmount
0,100,4.615121,0.0,-0.93707
1,200,5.303305,0.005025,-0.923606
2,5000,8.517393,0.246231,-0.277351
3,10000,9.21044,0.497487,0.395831
4,20000,9.903538,1.0,1.742196


**Final Summary of Feature Engineering & Imbalanced Data Handling**
 
Feature Extraction : Extract new insights from raw data (e.g., Hour, DayOfWeek)
 
Aggregated Features : Calculate meaningful statistics (e.g., AvgTransactionAmountPerUser)
 
Encoding : Convert categorical variables into numerical features (One-Hot Encoding)
 
Log Transformation : Reduce skewness in data distribution
 
Feature Scaling : Normalize or standardize numerical features so they share a comparable scale, for better model performance
 
Downsampling: Reduce the size of the majority class
 
Upsampling : Increase the size of the minority class
 
SMOTE (Synthetic Minority Over-sampling Technique) : Generate synthetic samples for the minority class