# Pre-processing and Feature Engineeing

In [1]:
import sys
from pathlib import Path

project_root = Path().resolve().parent

sys.path.append(str(project_root))

In [2]:
import gc
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from src.preprocess import remove_inf, preprocess_data

In [None]:
train_identity = pd.read_csv('../data/train_identity.csv')
identity_columns = train_identity.columns.difference(['TransactionID'])

Identity dataset load sucessfully!


In [4]:
merged_chunks = []
transaction_chunks = pd.read_csv('../data/train_transaction.csv', chunksize=10 ** 5)

for chunk in transaction_chunks:
    new_chunk = chunk.merge(train_identity, on='TransactionID', how='left')
    merged_chunks.append(new_chunk)

train = pd.concat(merged_chunks)
train.shape

(590540, 434)

In [5]:
train.head(5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [6]:
del train_identity, transaction_chunks

gc.collect()

0

### Feature Engineering

In [9]:
train['hasIdentity'] = (train[identity_columns].isna().all(axis=1) == False).astype(int)

In [10]:
amount = 'TransactionAmt'
selected_cols = ['card1', 'card4', 'P_emaildomain', 'R_emaildomain', 'addr1']

for col in selected_cols:
    train[f'amount_mean_{col}'] = train[amount] / train.groupby([col])[amount].transform('mean')
    train[f'amount_std_{col}'] = train[amount] / train.groupby([col])[amount].transform('std')

train['TransactionAmtLog'] = np.log(train[amount])
train['TransactionAmtCents'] = (train[amount] - np.floor(train[amount])).astype(np.float64)

In [12]:
start_date = datetime.strptime('2022-01-01', '%Y-%m-%d')

train['Date'] = train['TransactionDT'].apply(lambda dt: start_date + timedelta(seconds=dt))

train['Weekday'] = train['Date'].dt.dayofweek
train['Day'] = train['Date'].dt.day
train['Hour'] = train['Date'].dt.hour

train = train.drop(columns=['Date'])

In [13]:
train[['TransactionDT', 'Weekday', 'Day', 'Hour']].head()

Unnamed: 0,TransactionDT,Weekday,Day,Hour
0,86400,6,2,0
1,86401,6,2,0
2,86469,6,2,0
3,86499,6,2,0
4,86506,6,2,0


In [14]:
train[['TransactionDT', 'Weekday', 'Day', 'Hour']].tail()

Unnamed: 0,TransactionDT,Weekday,Day,Hour
90535,15811047,5,2,23
90536,15811049,5,2,23
90537,15811079,5,2,23
90538,15811088,5,2,23
90539,15811131,5,2,23


In [16]:
train.shape

(590540, 450)

In [18]:
train.to_parquet('../data/train.parquet')