# Pre-processing and Feature Engineeing

In [1]:
import sys
from pathlib import Path

project_root = Path().resolve().parent

sys.path.append(str(project_root))

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from sklearn.decomposition import PCA

from config.features import get_drop_features
from src.preprocess import preprocess_data

In [3]:
train_identity = pd.read_csv('../data/train_identity.csv')
identity_columns = train_identity.columns.difference(['TransactionID'])

print("Identity dataset load sucessfully!")

Identity dataset load sucessfully!


In [4]:
merged_chunks = []
transaction_chunks = pd.read_csv('../data/train_transaction.csv', chunksize=10 ** 5)

for chunk in transaction_chunks:
    new_chunk = chunk.merge(train_identity, on='TransactionID', how='left')
    merged_chunks.append(new_chunk)

train = pd.concat(merged_chunks)
train.shape

(590540, 434)

In [5]:
train['hasIdentity'] = (~train[identity_columns].isna().all(axis=1)).astype(int)

In [6]:
train = train.drop(columns=get_drop_features(train.columns))
train.head(5)

Previous features: 435
Relevant features: 291
Final training set features: 283


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_30,id_31,id_32,id_33,id_36,id_37,id_38,DeviceType,DeviceInfo,hasIdentity
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,1


Make sure to delete the unnecessary variables from memory to relieve some of it, so it will not bother us throughout the script. Also, let's save this merged dataset to optimize future reading operations.

In [7]:
del train_identity, transaction_chunks

### Feature Selection

In [8]:
many_null_cols = [col for col in train.columns if (train[col].isnull().sum() / train.shape[0]) >= 0.8]
len(many_null_cols)

45

In [9]:
train = train.drop(columns=many_null_cols)
train.shape

(590540, 238)

### Feature Engineering

In [10]:
amount = 'TransactionAmt'
selected_cols = ['card1', 'card4', 'P_emaildomain', 'R_emaildomain', 'addr1']

for col in selected_cols:
    train[f'amount_mean_{col}'] = train[amount] / train.groupby([col])[amount].transform('mean')
    train[f'amount_std_{col}'] = train[amount] / train.groupby([col])[amount].transform('std')

train['TransactionAmtLog'] = np.log(train[amount])
train['TransactionAmtCents'] = (train[amount] - np.floor(train[amount])).astype(np.float64)

#### Time Delta Feature

TransactionDT: timedelta from a given reference datetime (not an actual timestamp)

In [11]:
start_date = datetime.strptime('2022-01-01', '%Y-%m-%d')

train['Date'] = train['TransactionDT'].apply(lambda dt: start_date + timedelta(seconds=dt))

train['Weekday'] = train['Date'].dt.dayofweek
train['Day'] = train['Date'].dt.day
train['Hour'] = train['Date'].dt.hour

train = train.drop(columns=['Date'])

In [12]:
train[['TransactionDT', 'Weekday', 'Day', 'Hour']].sample(10)

Unnamed: 0,TransactionDT,Weekday,Day,Hour
42374,5756908,1,8,15
70739,12155804,5,21,16
26661,8038898,0,4,1
73692,9312856,0,18,18
90285,4247433,5,19,3
65828,11993876,3,19,19
88204,15736432,5,2,3
49595,5948669,3,10,20
5390,10248503,4,29,14
67883,1539316,1,18,19


### PCA

In [33]:
X_train = train.drop(columns=['isFraud'])
X_train = X_train.replace([np.inf, -np.inf], -999)

X_train_transformed = preprocess_data(X_train)

pca = PCA()
pca.fit(X_train_transformed)

variance_ratio = pca.explained_variance_ratio_
variance_ratio

: 

In [24]:
print(train.shape)
print(X_train.shape)
print(X_train_transformed.shape)

(590540, 253)
(590540, 252)
(590540, 252)


In [28]:
cumsum = np.cumulative_sum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

print(cumsum)
print(cumsum >= 0.95)
print(d)

[0.9997053]
[ True]
1


In [32]:
pca.explained_variance_ratio_

array([0.9997053])

In [23]:
pca = PCA(n_components=d)

X_reduced = pca.fit_transform(X_train_transformed)

X_reduced.shape

(590540, 1)

In [18]:
X_reduced

array([[-7291835.58141392],
       [-7291834.58155166],
       [-7291766.5950606 ],
       ...,
       [ 8443908.28786489],
       [ 8443917.66462394],
       [ 8443960.39502615]])

In [17]:
train.to_parquet('../data/train.parquet', index=False)