__<h1 style="text-align: center;font-size: 3rem">Feature Engineering</h1><h2 style="text-align: center;font-size: 1.3rem">(Notebook III)</h2>__

In [93]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from scipy import stats
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from os import getenv

from itertools import chain
from collections import defaultdict

In [80]:
load_dotenv()
RANDOM_STATE = int(getenv("RANDOM_STATE", 0))

In [81]:
transactions = pd.read_parquet("../data/processed/creditcard.parquet")

## Transforming Existing Features

### Cyclic Time Features

Cyclic time features provide potential insight in routinely rhythm. This can be helpful if fraudulent transactions happen during a certain time of day.

In [82]:
print(f"Days: {transactions['time'].max() / (60 * 60 * 24)}")

Days: 1.9999074074074075


The transactions are recorded over a time period of 2 days. Cyclic features would be influenced by the hour of the day, seeing if the time of day may influence whether a transaction is likely to be fraudulent.

In [83]:
cyclic_features = pd.DataFrame(
    {
        "hour_sin": np.sin(2 * np.pi * transactions["time"] / (60 * 60 * 24)),
        "hour_cos": np.cos(2 * np.pi * transactions["time"] / (60 * 60 * 24)),
    }
)
cyclic_features.head(10)

Unnamed: 0,hour_sin,hour_cos
0,0.0,1.0
1,0.0,1.0
2,7.3e-05,1.0
3,7.3e-05,1.0
4,0.000145,1.0
5,0.000145,1.0
6,0.000291,1.0
7,0.000509,1.0
8,0.000509,1.0
9,0.000654,1.0


In [84]:
modified_transactions = pd.concat([transactions, cyclic_features], axis=1)
modified_transactions.head(10)

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v23,v24,v25,v26,v27,v28,amount,is_fraud,hour_sin,hour_cos
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,0.0,1.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0,0.0,1.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,7.3e-05,1.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0,7.3e-05,1.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0,0.000145,1.0
5,2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0,0.000145,1.0
6,4,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0,0.000291,1.0
7,7,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0,0.000509,1.0
8,7,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0,0.000509,1.0
9,9,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0,0.000654,1.0


In [85]:
X = modified_transactions.drop(columns=["is_fraud", "time"])
y = modified_transactions["is_fraud"]

In [86]:
X

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v22,v23,v24,v25,v26,v27,v28,amount,hour_sin,hour_cos
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.000000,1.000000
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.000000,1.000000
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.000073,1.000000
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.000073,1.000000
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.000145,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,-0.001018,0.999999
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,-0.000945,1.000000
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,-0.000873,1.000000
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,-0.000873,1.000000


In [87]:
X = (X - X.mean()) / X.std()
X

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v22,v23,v24,v25,v26,v27,v28,amount,hour_sin,hour_cos
0,-0.694241,-0.044075,1.672771,0.973364,-0.245116,0.347067,0.193679,0.082637,0.331127,0.083385,...,0.382854,-0.176911,0.110507,0.246585,-0.392170,0.330891,-0.063781,0.244964,0.424730,1.590387
1,0.608495,0.161176,0.109797,0.316522,0.043483,-0.061820,-0.063700,0.071253,-0.232494,-0.153349,...,-0.880075,0.162201,-0.561130,0.320693,0.261069,-0.022256,0.044607,-0.342474,0.424730,1.590387
2,-0.693499,-0.811576,1.169466,0.268231,-0.364571,1.351451,0.639775,0.207372,-1.378673,0.190699,...,1.063356,1.456317,-1.138090,-0.628536,-0.288446,-0.137137,-0.181021,1.160684,0.424846,1.590387
3,-0.493324,-0.112169,1.182514,-0.609726,-0.007469,0.936148,0.192070,0.316017,-1.262501,-0.050468,...,0.007267,-0.304776,-1.941024,1.241902,-0.460217,0.155396,0.186188,0.140534,0.424846,1.590387
4,-0.591329,0.531540,1.021410,0.284655,-0.295015,0.071998,0.479301,-0.226510,0.744325,0.691624,...,1.100009,-0.220123,0.233250,-0.395201,1.041609,0.543619,0.651815,-0.073403,0.424962,1.590387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-6.065831,6.099275,-6.486233,-1.459638,-3.886604,-1.956687,-3.975621,6.116562,1.742556,4.000708,...,0.154146,1.624571,-0.840999,2.756316,0.518499,2.337897,2.495525,-0.350150,0.423109,1.590386
284803,-0.374121,-0.033356,1.342142,-0.521651,0.629039,0.794444,0.019667,0.246886,0.532298,-0.896291,...,1.273779,0.019958,-1.677917,-1.163724,-0.819645,0.169641,-0.162163,-0.254116,0.423225,1.590386
284804,0.980022,-0.182433,-2.143201,-0.393983,1.905830,2.275258,-0.239939,0.593139,0.393630,-0.445224,...,0.796786,-0.060053,1.056942,0.509796,-0.181181,0.011037,-0.080467,-0.081839,0.423341,1.590386
284805,-0.122755,0.321250,0.463319,0.487192,-0.273836,0.468154,-0.554671,0.568630,0.356886,-0.366557,...,1.102449,-0.261503,0.203427,-1.091853,1.133633,0.269604,0.316686,-0.313248,0.423341,1.590386


In [88]:
X.shape, y.shape

((284807, 31), (284807,))

Unfortunately, since variables v1 to v28 are unknown in their subject and nature, only that they are continuous values that appear to potentially have normalization performed before ingestion. Additional feature transformations cannot be made.

In [94]:
skb = SelectKBest(k=10)
X_new = skb.fit_transform(X, y)

In [95]:
chosen_features = defaultdict(int)

In [96]:
selected_mask = skb.get_support()
selected_features = X.columns[selected_mask]
for feature in selected_features:
    chosen_features[feature] += 1

In [97]:
rfe = RFE(
    estimator=LogisticRegression(
        class_weight={0: 1, 1: 5},
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    n_features_to_select=10,
)

rfe.fit(X, y)
rfe_mask = rfe.get_support()
rfe_features = X.columns[rfe_mask]
for feature in rfe_features:
    chosen_features[feature] += 1

In [98]:
rfe = RFE(
    estimator=SVC(
        kernel="linear",
        class_weight={0: 1, 1: 5},
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    n_features_to_select=10,
)

rfe.fit(X, y)
rfe_mask = rfe.get_support()
rfe_features = X.columns[rfe_mask]
for feature in selected_features:
    chosen_features[feature] += 1



In [None]:
print(
    sorted(chosen_features.keys(), key=lambda x: chosen_features[x], reverse=True)[:10]
)

['v4', 'v10', 'v11', 'v12', 'v14', 'v16', 'v3', 'v7', 'v17', 'v18']


: 