## Importing Required Libraries


In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np


## Loading the Dataset


In [2]:
# Read the cleaned transaction CSV into the working frame used by every later cell.
cleaned_data_path = '../data/cleaned_xente_data.csv'
df = pd.read_csv(cleaned_data_path)

## Creating Aggregate Features


In [3]:
# Per-customer aggregates, broadcast back onto every transaction row via
# `transform` so the frame keeps its original length and row order.
# Group once and reuse the grouper instead of re-grouping for each feature.
customer_groups = df.groupby('CustomerId')
customer_amounts = customer_groups['Amount']

df['Total_Transaction_Amount'] = customer_amounts.transform('sum')
df['Average_Transaction_Amount'] = customer_amounts.transform('mean')
# 'count' tallies the non-null TransactionId values of each customer.
df['Transaction_Count'] = customer_groups['TransactionId'].transform('count')

# The sample standard deviation (pandas default ddof=1) is NaN when a customer
# has exactly one amount. Such a customer has no observed spread, so store 0
# instead of leaving a NaN that a later mean imputation would replace with the
# average volatility of *other* customers. NaNs from any other cause are kept.
amount_std = customer_amounts.transform('std')
has_single_amount = customer_amounts.transform('count') == 1
df['Transaction_Std'] = amount_std.mask(has_single_amount, 0.0)

## Extracting Date-Time Features


In [4]:
# Parse the timestamp column a single time and derive every calendar part
# from that one parsed Series (previously the column was re-parsed per feature).
transaction_start = pd.to_datetime(df['TransactionStartTime'])

df['Transaction_Hour'] = transaction_start.dt.hour
df['Transaction_Day'] = transaction_start.dt.day
df['Transaction_Month'] = transaction_start.dt.month
df['Transaction_Year'] = transaction_start.dt.year

## Encoding Categorical Variables using One-Hot Encoding


In [5]:
# Columns to expand into one indicator column per distinct value.
categorical_vars = [
    'CurrencyCode',
    'CountryCode',
    'ProviderId',
    'ProductCategory',
]

# get_dummies returns a new frame: the listed columns are replaced by their
# indicator columns, all other columns are carried over unchanged.
df_encoded = pd.get_dummies(data=df, columns=categorical_vars)

## Handling Missing Values


In [None]:
# Select numeric columns the same way the scaling step does (np.number), so a
# numeric column stored in a width other than float64/int64 is not skipped.
numeric_cols = df_encoded.select_dtypes(include=[np.number]).columns

# Only columns that actually have gaps need imputing. Columns with no observed
# value at all are left out as well: a mean imputer has nothing to compute for
# them and SimpleImputer would drop them from its output, which would break
# the column-wise assignment below with a shape mismatch.
missing_per_column = df_encoded[numeric_cols].isna().sum()
is_imputable = (missing_per_column > 0) & (missing_per_column < len(df_encoded))
cols_to_impute = numeric_cols[is_imputable.to_numpy()]

# Create the SimpleImputer (mean of the observed values of each column)
imputer = SimpleImputer(strategy='mean')
# Fit and apply it only when there is something to fill; this also keeps the
# cell from failing on an empty frame.
if len(cols_to_impute) > 0:
    df_encoded[cols_to_impute] = imputer.fit_transform(df_encoded[cols_to_impute])

# Report the remaining missing-value count per column. Non-numeric columns and
# fully-empty numeric columns are not imputed above, so they can still be non-zero.
print(df_encoded.isnull().sum())

## Normalizing/Standardizing Numerical Features


In [12]:
# Standardize every numeric column in place, using statistics fitted on this
# same frame.
# NOTE(review): the selection is purely dtype-based, so any numeric label or
# identifier column present in df_encoded is standardized too -- confirm that
# this is intended before modelling.
scaler = StandardScaler()
numeric_view = df_encoded.select_dtypes(include=[np.number])
df_encoded[numeric_view.columns] = scaler.fit_transform(numeric_view)


In [13]:
# Write the engineered feature table to CSV (row index omitted), then report
# completion. The path is relative to the notebook's working directory.
engineered_output_path = 'feature_engineered_data.csv'
df_encoded.to_csv(engineered_output_path, index=False)

print("Feature Engineering Completed.")

Feature Engineering Completed.
