In [1]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

In [34]:
trns = pd.read_csv('../ieee-fraud-detection/transaction_ksy.csv', engine = 'python')

In [5]:
trns.columns

Index(['Unnamed: 0', 'TransactionID', 'isFraud', 'TransactionDT',
       'TransactionAmt', 'card1', 'TransactionAmt_residue', 'repeated',
       'ProductCD_C', 'ProductCD_H', 'ProductCD_R', 'ProductCD_S',
       'ProductCD_W', 'card2_na', 'card3_ran', 'card3_fraud_ratio',
       'card4_ran_american express', 'card4_ran_discover',
       'card4_ran_mastercard', 'card4_ran_visa', 'card5_ran',
       'card5_fraud_ratio', 'card6_fraud_ratio', 'addr1_na', 'addr2_ran',
       'addr2_fraud_ratio', 'dist1_na', 'dist2_na', 'C_pca1', 'C_pca2'],
      dtype='object')

In [35]:
trns.drop('Unnamed: 0', axis = 1, inplace = True)

In [7]:
trns.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,TransactionAmt_residue,repeated,ProductCD_C,ProductCD_H,ProductCD_R,...,card5_ran,card5_fraud_ratio,card6_fraud_ratio,addr1_na,addr2_ran,addr2_fraud_ratio,dist1_na,dist2_na,C_pca1,C_pca2
0,2987000,0,86400,68.5,13926,0,1,0,0,0,...,142.0,0.043011,0.066701,0,87.0,0.034394,0,1,-0.032662,-0.006091
1,2987001,0,86401,29.0,2755,0,8,0,0,0,...,102.0,0.06276,0.066701,0,87.0,0.034394,1,1,-0.031442,-0.018915
2,2987002,0,86469,59.0,4663,0,87,0,0,0,...,166.0,0.011362,0.024253,0,87.0,0.034394,0,1,-0.034455,-0.012628
3,2987003,0,86499,50.0,18132,0,17,0,0,0,...,117.0,0.01405,0.024253,0,87.0,0.034394,1,1,-0.026108,0.010789
4,2987004,0,86506,50.0,4497,0,2,0,1,0,...,102.0,0.06276,0.066701,0,87.0,0.034394,1,1,-0.012795,-0.025037


In [9]:
list(trns.columns)

['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'card1',
 'TransactionAmt_residue',
 'repeated',
 'ProductCD_C',
 'ProductCD_H',
 'ProductCD_R',
 'ProductCD_S',
 'ProductCD_W',
 'card2_na',
 'card3_ran',
 'card3_fraud_ratio',
 'card4_ran_american express',
 'card4_ran_discover',
 'card4_ran_mastercard',
 'card4_ran_visa',
 'card5_ran',
 'card5_fraud_ratio',
 'card6_fraud_ratio',
 'addr1_na',
 'addr2_ran',
 'addr2_fraud_ratio',
 'dist1_na',
 'dist2_na',
 'C_pca1',
 'C_pca2']

# card columns

In [36]:
cards = ['card1', 'card2_na',
 'card3_ran',
 'card3_fraud_ratio',
 'card4_ran_american express',
 'card4_ran_discover',
 'card4_ran_mastercard',
 'card4_ran_visa',
 'card5_ran',
 'card5_fraud_ratio',
 'card6_fraud_ratio']
c = trns[cards]

In [20]:
c.head()

Unnamed: 0,card1,card2_na,card3_ran,card3_fraud_ratio,card4_ran_american express,card4_ran_discover,card4_ran_mastercard,card4_ran_visa,card5_ran,card5_fraud_ratio,card6_fraud_ratio
0,13926,1,150.0,0.02457,0,1,0,0,142.0,0.043011,0.066701
1,2755,0,150.0,0.02457,0,0,1,0,102.0,0.06276,0.066701
2,4663,0,150.0,0.02457,0,0,0,1,166.0,0.011362,0.024253
3,18132,0,150.0,0.02457,0,0,1,0,117.0,0.01405,0.024253
4,4497,0,150.0,0.02457,0,0,1,0,102.0,0.06276,0.066701


In [37]:
pca = PCA(n_components = 2)
pca.fit(c)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [22]:
pca.components_

array([[-9.99999694e-01,  8.33730350e-07, -6.65636934e-06,
        -2.49699467e-11,  1.48115628e-07,  2.42582214e-06,
         5.79118640e-07, -3.15305640e-06,  7.82809803e-04,
        -3.51099510e-07,  7.77959834e-08],
       [-7.82227308e-04,  1.11533627e-04,  4.69802648e-02,
         1.47481311e-04,  3.08384966e-04,  4.31785482e-04,
         3.73376404e-03, -4.47393448e-03, -9.98878342e-01,
         1.48149777e-04,  7.56445982e-05]])

In [38]:
c_pca = pca.transform(c)
print("original shape:   ", c.shape)
print("transformed shape:", c_pca.shape)

original shape:    (590540, 11)
transformed shape: (590540, 2)


In [39]:
c_pca_df = pd.DataFrame(data = c_pca, columns = ['card_pca1', 'card_pca2'])

c_pca_df.head(10)

Unnamed: 0,card_pca1,card_pca2
0,-4027.308921,53.917676
1,7143.656341,102.614264
2,5235.707022,37.185342
3,-8233.327205,75.602769
4,5401.656875,101.251624
5,3961.754381,-23.743913
6,-2409.290636,31.205214
7,-2796.243548,-29.030205
8,7095.75342,-21.292413
9,-7500.243669,-30.703837


## Merge card PCA columns to dataframe

In [40]:
trns.drop(columns = cards, inplace = True)

In [41]:
trns['tmp'] = range(trns.shape[0])
c_pca_df['tmp'] = range(c_pca_df.shape[0])

In [42]:
trns1 = trns.merge(c_pca_df, on=['tmp'])
trns1.drop('tmp', axis=1, inplace = True)

In [43]:
trns1.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,TransactionAmt_residue,repeated,ProductCD_C,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,addr1_na,addr2_ran,addr2_fraud_ratio,dist1_na,dist2_na,C_pca1,C_pca2,card_pca1,card_pca2
0,2987000,0,86400,68.5,0,1,0,0,0,0,1,0,87.0,0.034394,0,1,-0.032662,-0.006091,-4027.308921,53.917676
1,2987001,0,86401,29.0,0,8,0,0,0,0,1,0,87.0,0.034394,1,1,-0.031442,-0.018915,7143.656341,102.614264
2,2987002,0,86469,59.0,0,87,0,0,0,0,1,0,87.0,0.034394,0,1,-0.034455,-0.012628,5235.707022,37.185342
3,2987003,0,86499,50.0,0,17,0,0,0,0,1,0,87.0,0.034394,1,1,-0.026108,0.010789,-8233.327205,75.602769
4,2987004,0,86506,50.0,0,2,0,1,0,0,0,0,87.0,0.034394,1,1,-0.012795,-0.025037,5401.656875,101.251624


# Product columns

In [44]:
products = ['ProductCD_C',
 'ProductCD_H',
 'ProductCD_R',
 'ProductCD_S',
 'ProductCD_W',]

p = trns[products]

In [45]:
pca = PCA(n_components = 1)
pca.fit(p)

PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [46]:
p_pca = pca.transform(p)
print("original shape:   ", p.shape)
print("transformed shape:", p_pca.shape)

original shape:    (590540, 5)
transformed shape: (590540, 1)


In [47]:
p_pca_df = pd.DataFrame(data = p_pca, columns = ['Product_pca1'])

p_pca_df.head(10)

Unnamed: 0,Product_pca1
0,-0.292694
1,-0.292694
2,-0.292694
3,-0.292694
4,0.714681
5,-0.292694
6,-0.292694
7,-0.292694
8,0.714681
9,-0.292694


## Merge product PCA columns to dataframe

In [48]:
trns1.drop(columns = products, inplace = True)

In [49]:
trns1['tmp'] = range(trns1.shape[0])
p_pca_df['tmp'] = range(p_pca_df.shape[0])

In [50]:
trns2 = trns1.merge(p_pca_df, on=['tmp'])
trns2.drop('tmp', axis=1, inplace = True)

In [51]:
trns2.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,TransactionAmt_residue,repeated,addr1_na,addr2_ran,addr2_fraud_ratio,dist1_na,dist2_na,C_pca1,C_pca2,card_pca1,card_pca2,Product_pca1
0,2987000,0,86400,68.5,0,1,0,87.0,0.034394,0,1,-0.032662,-0.006091,-4027.308921,53.917676,-0.292694
1,2987001,0,86401,29.0,0,8,0,87.0,0.034394,1,1,-0.031442,-0.018915,7143.656341,102.614264,-0.292694
2,2987002,0,86469,59.0,0,87,0,87.0,0.034394,0,1,-0.034455,-0.012628,5235.707022,37.185342,-0.292694
3,2987003,0,86499,50.0,0,17,0,87.0,0.034394,1,1,-0.026108,0.010789,-8233.327205,75.602769,-0.292694
4,2987004,0,86506,50.0,0,2,0,87.0,0.034394,1,1,-0.012795,-0.025037,5401.656875,101.251624,0.714681


In [52]:
list(trns2.columns)

['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'TransactionAmt_residue',
 'repeated',
 'addr1_na',
 'addr2_ran',
 'addr2_fraud_ratio',
 'dist1_na',
 'dist2_na',
 'C_pca1',
 'C_pca2',
 'card_pca1',
 'card_pca2',
 'Product_pca1']

# addr columns

In [53]:
addr = ['addr1_na',
 'addr2_ran',
 'addr2_fraud_ratio']

a = trns[addr]

In [54]:
pca = PCA(n_components = 1)
pca.fit(a)

PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [55]:
a_pca = pca.transform(a)
print("original shape:   ", a.shape)
print("transformed shape:", a_pca.shape)

original shape:    (590540, 3)
transformed shape: (590540, 1)


In [56]:
a_pca_df = pd.DataFrame(data = a_pca, columns = ['addr_pca1'])

a_pca_df.head(10)

Unnamed: 0,addr_pca1
0,-0.200391
1,-0.200391
2,-0.200391
3,-0.200391
4,-0.200391
5,-0.200391
6,-0.200391
7,-0.200391
8,-0.200391
9,-0.200391


## Merge addr PCA columns to dataframe

In [58]:
trns2.drop(columns = addr, inplace = True)

In [59]:
trns2['tmp'] = range(trns2.shape[0])
a_pca_df['tmp'] = range(a_pca_df.shape[0])

In [60]:
trns3 = trns2.merge(a_pca_df, on=['tmp'])
trns3.drop('tmp', axis=1, inplace = True)

In [61]:
trns3.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,TransactionAmt_residue,repeated,dist1_na,dist2_na,C_pca1,C_pca2,card_pca1,card_pca2,Product_pca1,addr_pca1
0,2987000,0,86400,68.5,0,1,0,1,-0.032662,-0.006091,-4027.308921,53.917676,-0.292694,-0.200391
1,2987001,0,86401,29.0,0,8,1,1,-0.031442,-0.018915,7143.656341,102.614264,-0.292694,-0.200391
2,2987002,0,86469,59.0,0,87,0,1,-0.034455,-0.012628,5235.707022,37.185342,-0.292694,-0.200391
3,2987003,0,86499,50.0,0,17,1,1,-0.026108,0.010789,-8233.327205,75.602769,-0.292694,-0.200391
4,2987004,0,86506,50.0,0,2,1,1,-0.012795,-0.025037,5401.656875,101.251624,0.714681,-0.200391


# dist columns

In [62]:
dist = ['dist1_na', 'dist2_na']

d = trns[dist]

In [63]:
pca = PCA(n_components = 1)
pca.fit(d)

PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [64]:
d_pca = pca.transform(d)
print("original shape:   ", d.shape)
print("transformed shape:", d_pca.shape)

original shape:    (590540, 2)
transformed shape: (590540, 1)


In [65]:
d_pca_df = pd.DataFrame(data = d_pca, columns = ['dist_pca1'])

d_pca_df.head(10)

Unnamed: 0,dist_pca1
0,0.59961
1,-0.390832
2,0.59961
3,-0.390832
4,-0.390832
5,0.59961
6,0.59961
7,-0.390832
8,-0.390832
9,0.59961


## Merge product PCA columns to dataframe

In [66]:
trns3.drop(columns = dist, inplace = True)

In [67]:
trns3['tmp'] = range(trns3.shape[0])
d_pca_df['tmp'] = range(d_pca_df.shape[0])

In [68]:
trns4 = trns3.merge(d_pca_df, on=['tmp'])
trns4.drop('tmp', axis=1, inplace = True)

In [69]:
trns4.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,TransactionAmt_residue,repeated,C_pca1,C_pca2,card_pca1,card_pca2,Product_pca1,addr_pca1,dist_pca1
0,2987000,0,86400,68.5,0,1,-0.032662,-0.006091,-4027.308921,53.917676,-0.292694,-0.200391,0.59961
1,2987001,0,86401,29.0,0,8,-0.031442,-0.018915,7143.656341,102.614264,-0.292694,-0.200391,-0.390832
2,2987002,0,86469,59.0,0,87,-0.034455,-0.012628,5235.707022,37.185342,-0.292694,-0.200391,0.59961
3,2987003,0,86499,50.0,0,17,-0.026108,0.010789,-8233.327205,75.602769,-0.292694,-0.200391,-0.390832
4,2987004,0,86506,50.0,0,2,-0.012795,-0.025037,5401.656875,101.251624,0.714681,-0.200391,-0.390832


# Export

In [70]:
trns4.to_csv('transaction_ksy_pca.csv', encoding = 'utf8')