This notebook applies dataengineering techniques to the dataset in order to scale down on features but still keep information as we currently overfit on the training data

In [1]:
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
train_df = pd.read_csv('../data/train_data_prepared.csv', index_col=None)
test_df = pd.read_csv('../data/test_data_prepared.csv', index_col=None)

Exclude the target column for now as we are currently only interested in relations between the features themself. We will use this column later on again to determine which features may only confuse the model with unneccessary low correlation to isFraud

In [3]:
# exclude isFraud
if 'isFraud' in train_df.columns:
    targetCol = train_df['isFraud']
    del train_df['isFraud']
print(targetCol)

0         0
1         0
2         0
3         0
4         0
         ..
590535    0
590536    0
590537    0
590538    0
590539    0
Name: isFraud, Length: 590540, dtype: int64


In [4]:
# check for matching columns
train_cols = list(train_df.columns)
test_cols = list(test_df.columns)

for col in train_cols:
    if col not in test_cols:
        print(f'column {col} in train_cols but not in test cols')

for col in test_cols:
    if col not in train_cols:
        print(f'column {col} in test_cols but not in train cols')

print(len(train_cols))
print(len(test_cols))

421
421


In [5]:
# itterate over all features and calculate the correlation
corr_features: dict[str, list[str]] = {}

train_columns = list(train_df.columns)
# print(train_df['V1','V2'].corr())
# NOTE: corr(A,B) = corr(B,A)
count = 0
# takes about 2 minutes to calculated
for feature in train_columns:
    for other_feature in train_columns:
        # skip when features are equal as they always have corr 1
        if feature != other_feature:
            corr = train_df[feature].corr(train_df[other_feature])
            count += 1
            if corr >= 0.5 or corr <= -0.5:
                print(f'corr between {feature} and {other_feature} is: {corr}')
                train_columns.remove(other_feature)
                if feature not in corr_features:
                    # as soon as we write the feature to the dict once we
                    # don't need it for the other features anymore
                    train_columns.remove(feature)
                    corr_features[feature] = []
                try:
                    corr_features[feature].append(other_feature)
                except KeyError:
                    print('KeyError')
print(f'calculated {count} correlations')
print(corr_features)

corr between TransactionID and TransactionDT is: 0.9982802236526589


corr between ProductCD and addr1 is: 0.7693750620050251
corr between ProductCD and R_emaildomain is: 0.5155265743714881
corr between ProductCD and D6 is: -0.7554728171553914
corr between ProductCD and D9 is: -0.5676034415402995
corr between ProductCD and D11 is: 0.5622558630725973
corr between ProductCD and D13 is: -0.7305736201826665
corr between ProductCD and M1 is: -0.5939891543256874
corr between ProductCD and M3 is: -0.53181231358868
corr between ProductCD and M6 is: -0.7387667230798127
corr between ProductCD and V1 is: 0.5777195244787136
corr between ProductCD and V3 is: 0.5777194616297984
corr between ProductCD and V5 is: 0.5777193861468608
corr between ProductCD and V7 is: 0.5777194677849946
corr between ProductCD and V9 is: 0.5777194932249331
corr between ProductCD and V11 is: 0.5777193379203246
corr between ProductCD and V167 is: -0.8659401922005097
corr between ProductCD and V169 is: -0.870930288275673
corr between ProductCD and V171 is: -0.8708613879442703
corr between Prod

Now a new dataset is constructed of the following shape:
- the remaining columns from the original dataset, that are not correlated to anything from dataset
- for each set of correlated columns we run pca as follows:
    - we run PCA on min(len(corr_features), 3)
    - we then select only those columns from the three that push us over 90% explained variance, so if the first pc already has 97% explained variance we only take this one, if it has 85 and the second has 5 we take the first two, if we have 85, 4, 2 we take all (if this case even exists)

In [6]:
# construct a set of new columns, consisting of the remaining columns from before
# set to prevent olumn doubling
new_columns = set(train_columns)

train_df_new = pd.DataFrame()
test_df_new = pd.DataFrame()

# now run pca for each of the correlated columns sets
for column in corr_features:
    # join the key + all its corr features
    corr_cols = [column] + corr_features[column]
    pca_train = PCA(n_components=min(len(corr_cols), 3))
    pca_test = PCA(n_components=min(len(corr_cols), 3))

    print(f'calculating principal components for {corr_cols}')
    # calculate the principal components
    principal_components_train = pca_train.fit_transform(train_df[corr_cols])
    principal_components_test = pca_test.fit_transform(test_df[corr_cols])
    # count the explained variance ratio and the indices for the pcs (range 1,2,3)
    ratio_cnt = 0
    idx = []
    for i in range(len(pca_train.explained_variance_ratio_)):
        # by appending first we allow to stop on the step that pushed us above 90%
        # we require at least 90% cumulated variance ratio
        if ratio_cnt < 0.9:
            idx.append(i)
        else:
            # if we are above 90% we break
            break
        ratio_cnt += pca_train.explained_variance_ratio_[i]

    print(f'princial components to explain {corr_cols} have a cumulated {ratio_cnt} explained\
 variance ratio, to get above {ratio_cnt} we need pcs {idx}')

    if ratio_cnt < 0.9:
        # for those columns that have less than 90 % explained variance ratio in
        # pca with 3 components we just add the original columns
        new_columns.union(set(corr_cols)) # set to prevent doubling
        # do not continue for those column sets
        continue

    print(principal_components_train.T)
    # build the new columns and add to the dataframe
    for i in idx:
        # transpose the pcs to add the entire column
        identifier = f'{column}_set_pc{i+1}'
        print(f'adding col {identifier} to new training set')
        train_df_new[identifier] = principal_components_train.T[i]
        print(f'adding col {identifier} to new test set')
        test_df_new[identifier] = principal_components_test.T[i]

# add the remaining original columns to the new datasets
train_df_new[list(new_columns)] = train_df[list(new_columns)]
test_df_new[list(new_columns)] = test_df[list(new_columns)]

# also add the target value to the train dataframe
train_df_new['isFraud'] = targetCol


calculating principal components for ['TransactionID', 'TransactionDT']
princial components to explain ['TransactionID', 'TransactionDT'] have a cumulated 0.9999953279933507 explained variance ratio, to get above 0.9999953279933507 we need pcs [0]
[[-7291843.00769379 -7291841.97153549 -7291773.98084503 ...
   8443917.02286219  8443926.05358815  8443969.06124281]
 [   26706.69787834    26705.73539012    26707.24071708 ...
     15758.5342535     15757.86642965    15758.45092999]]
adding col TransactionID_set_pc1 to new training set
adding col TransactionID_set_pc1 to new test set
calculating principal components for ['ProductCD', 'addr1', 'R_emaildomain', 'D6', 'D9', 'D11', 'D13', 'M1', 'M3', 'M6', 'V1', 'V3', 'V5', 'V7', 'V9', 'V11', 'V167', 'V169', 'V171', 'V173', 'V175', 'V177', 'V179', 'V181', 'V183', 'V185', 'V187', 'V189', 'V191', 'V193', 'V195', 'V197', 'V199', 'V201', 'V205', 'V207', 'V209', 'V214', 'V216', 'V218', 'V220', 'V222', 'V224', 'V226', 'V228', 'V230', 'V232', 'V234', '

Successfully lowered the complexity of the dataset from 423 features to 76

In [7]:
print(train_df_new.shape)
print(test_df_new.shape)

(590540, 77)
(506691, 76)


In [8]:
# write to new csv_files
train_df_new.to_csv('../data/train_data_downscaled.csv', index=False)
test_df_new.to_csv('../data/test_data_downscaled.csv', index=False)