# Imbalance task - Rank Swapping

#### Author: Michał Okoń

In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date # datetime works too
from currency_converter import CurrencyConverter

In [2]:
# Count the words in the markdown cells of this notebook.
# If this cell does not work, make sure the `nbformat` package is installed.

import io
import nbformat

nb = nbformat.read("Lab1_Michal_Okon.ipynb", nbformat.NO_CONVERT)
word_count = 0
for cell in nb.cells:
    if cell.cell_type == "markdown":
        # Remove heading markers, then split on any whitespace: words separated
        # by newlines are counted individually and an empty cell counts as zero.
        word_count += len(cell['source'].replace('#', '').split())

print("Word count:", word_count)

Word count: 288


### Rank-swapping algorithm implementation
In the implementation of the rank-swapping algorithm, I swap the values of the numerical and ordinal features in our data. Moreover, I have added several new aggregated features to better utilize the rank-swapping algorithm. That is, I swap the amount of euros transferred in the transaction and the variables that relate to the number of times certain IPs/cards/emails have been used.

In [3]:
def rank_swap_custom(df, features_to_swap, swap_range=0.02):
    """
    Rank-swaps the values of the selected features in a copy of the dataframe.

    For every feature the values are put in ascending order; then, walking
    through the ranks, the value at rank i is exchanged with the value at a
    rank drawn uniformly from [i - r, i + r] (clipped to the valid ranks), where
    r = int(swap_range * len(df)). A value can take part in several exchanges,
    so it may end up further than r ranks away from where it started.
    Only the listed features change; all other columns, the index and the
    input dataframe itself are left untouched. Random draws come from the
    ``random`` module, so ``random.seed`` makes the result reproducible.

    :param df: dataframe to be swapped
    :param features_to_swap: names of the columns whose values are swapped
    :param swap_range: swap range as a fraction of the dataframe length (0.02 = 2%)
    :return: swapped dataframe, sorted by its index
    """
    n_rows = len(df)
    max_offset = int(swap_range * n_rows)

    # Work positionally on a copy, so the index never has to be turned into a
    # column (and back) and its name does not matter.
    df_swapped = df.copy()
    for feature in features_to_swap:
        values = df_swapped[feature].to_numpy()

        # order[k] is the row position holding the k-th smallest value; a
        # stable sort keeps the handling of ties deterministic.
        order = np.argsort(values, kind="stable")
        ranked = values[order]  # fancy indexing returns a fresh array

        for i in range(n_rows):
            swap_index = random.randint(max(0, i - max_offset), min(i + max_offset, n_rows - 1))
            ranked[i], ranked[swap_index] = ranked[swap_index], ranked[i]

        # Send every (possibly exchanged) value back to the row that held
        # that rank before the swapping.
        swapped = np.empty_like(ranked)
        swapped[order] = ranked
        df_swapped[feature] = swapped

    return df_swapped.sort_index()

In [4]:
def advanced_feature_eng(df):
    """
    Add aggregated features describing how cards, emails, ips and bins are used.

    The dataframe is modified in place and also returned. Added columns:
    two 0/1 equality flags, six "distinct partner" counts (e.g. with how many
    different cards an email has been used) and four plain usage counts.

    :param df: dataframe with the columns shoppercountrycode, issuercountrycode,
        currencycode, mail_id, card_id, ip_id and bin
    :return: modified dataframe
    """
    # 1 if the shopper's country is the same as the issuer country, else 0
    df['countries_equal'] = (df['shoppercountrycode'] == df['issuercountrycode']).astype(int)

    # 1 if the shopper's country code is the same as the currency code, else 0
    # NOTE(review): this compares a country code with a currency code directly;
    # confirm that the two columns really share a code set.
    df['currency_equal'] = (df['shoppercountrycode'] == df['currencycode']).astype(int)

    # (new column, grouping key, column whose distinct values are counted).
    # 'nunique' is required here: 'count' would only count the rows of each
    # group and so repeat the plain usage counts computed further down.
    distinct_partner_features = (
        ('card_email_count', 'mail_id', 'card_id'),  # cards per email
        ('card_ip_count', 'ip_id', 'card_id'),       # cards per ip
        ('email_card_count', 'card_id', 'mail_id'),  # emails per card
        ('ip_card_count', 'card_id', 'ip_id'),       # ips per card
        ('email_ip_count', 'ip_id', 'mail_id'),      # emails per ip
        ('ip_email_count', 'mail_id', 'ip_id'),      # ips per email
    )
    for new_column, key, counted in distinct_partner_features:
        df[new_column] = df.groupby(key)[counted].transform('nunique')

    # How many times each card / email / ip / bin occurs in the data
    usage_features = (
        ('card_count', 'card_id'),
        ('email_count', 'mail_id'),
        ('ip_count', 'ip_id'),
        ('bin_count', 'bin'),
    )
    for new_column, key in usage_features:
        df[new_column] = df.groupby(key)[key].transform('count')

    return df

### Preprocessing

In [5]:
import re
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin, BaseEstimator


def country_equal_feature_eng(df):
    """
    Add a 0/1 column telling whether shopper and issuer country are the same.

    The dataframe is modified in place and also returned.

    :param df: dataframe with the columns shoppercountrycode and issuercountrycode
    :return: the same dataframe with the extra column 'countries_equal'
    """
    # Convert the boolean comparison in one step; writing 0/1 into a boolean
    # column through .loc is a dtype-incompatible assignment.
    df['countries_equal'] = (df['shoppercountrycode'] == df['issuercountrycode']).astype(int)

    return df


class CustomDataTransformer(BaseEstimator, TransformerMixin):
    """
    Cleans the transaction dataframe, label-encodes its categorical columns and
    optionally scales and one-hot encodes selected columns.

    ``fit`` only fits the label encoders. The scaler and the one-hot encoder
    are (re)fitted on whatever data is passed to ``transform``.
    """

    # Marks "scaler argument not supplied", so that an explicit scaler=None
    # keeps its meaning of "do not scale".
    _DEFAULT_SCALER = object()

    def __init__(
        self, currency_conv_func,
        feature_engineering_func=country_equal_feature_eng,
        card_enc=None,
        ip_enc=None,
        country_code_enc=None,
        tx_variant_code_enc=None,
        currency_code_enc=None,
        shopper_interaction_enc=None,
        account_code_enc=None,
        card_verif_code_supplied=None,
        mail_id_enc=None,
        bin_encoder=None,
        scaler=_DEFAULT_SCALER,

        drop_private_columns=False,
        columns_to_scale=None,
        one_hot_columns=None

    ):
        '''
        Every encoder argument defaults to a fresh LabelEncoder and the scaler
        to a fresh StandardScaler. They are created per instance, so two
        transformers never share fitted state.

        :param currency_conv_func: function called with one row, returning its amount in EUR
        :param feature_engineering_func: function called with the cleaned dataframe, or None to skip
        :param card_enc: card encoder
        :param ip_enc: ip encoder
        :param country_code_enc:  country code encoder (issuer and shopper country)
        :param tx_variant_code_enc: tx variant code encoder
        :param currency_code_enc:  currency code encoder
        :param shopper_interaction_enc:  shopper interaction encoder
        :param account_code_enc: account code encoder
        :param card_verif_code_supplied: card verification code supplied encoder
        :param mail_id_enc: mail id encoder
        :param bin_encoder: bin encoder
        :param scaler: scaler used; pass None to disable scaling
        :param drop_private_columns: drop columns that might be considered sensitive data
            (card_id, ip_id, mail_id, bin) instead of encoding them
        :param columns_to_scale: columns that will be scaled using the scaler
        :param one_hot_columns: columns that will be one-hot encoded
        '''
        self.card_enc = self._encoder_or_new(card_enc)
        self.ip_enc = self._encoder_or_new(ip_enc)
        self.country_code_enc = self._encoder_or_new(country_code_enc)
        self.tx_variant_code_enc = self._encoder_or_new(tx_variant_code_enc)
        self.currency_code_enc = self._encoder_or_new(currency_code_enc)
        self.shopper_interaction_enc = self._encoder_or_new(shopper_interaction_enc)
        self.account_code_enc = self._encoder_or_new(account_code_enc)
        self.bin_enc = self._encoder_or_new(bin_encoder)
        self.card_verif_code_supplied = self._encoder_or_new(card_verif_code_supplied)
        self.mail_id_enc = self._encoder_or_new(mail_id_enc)

        self.currency_conv_func = currency_conv_func
        self.feature_eng_func = feature_engineering_func

        # Filled in by _clean_data with every country code seen in the data
        self.country_codes = None
        self.cleaned_data = None

        self.scaler = StandardScaler() if scaler is self._DEFAULT_SCALER else scaler
        self.columns_to_scale = [] if columns_to_scale is None else columns_to_scale
        self.one_hot_columns = [] if one_hot_columns is None else one_hot_columns

        self.drop_private_columns = drop_private_columns

    @staticmethod
    def _encoder_or_new(encoder):
        # Return the given encoder, or a new LabelEncoder when none was given.
        return LabelEncoder() if encoder is None else encoder

    def _clean_data(self, df):
        """
        Clean up data inconsistencies in place and convert the amount to EUR.

        Also stores all country codes occurring in the issuer or shopper
        country column in ``self.country_codes``.

        :param df: dataframe to clean (modified in place)
        :return: the cleaned dataframe
        """
        df.loc[df['cardverificationcodesupplied'].isna(), 'cardverificationcodesupplied'] = False
        # Collapse every cvc response code above 2 into the single code 3
        df.loc[df['cvcresponsecode'] > 2, 'cvcresponsecode'] = 3

        # Missing country codes get the placeholder '--'
        df.loc[df['issuercountrycode'].isna(), 'issuercountrycode'] = '--'
        df.loc[df['shoppercountrycode'].isna(), 'shoppercountrycode'] = '--'

        # Both country columns share one encoder, so collect their joint set of codes
        self.country_codes = pd.unique(
            np.append(df['issuercountrycode'].unique(), df['shoppercountrycode'].unique())
        )

        # Replace the original amount by its value in EUR
        df['amount_eur'] = df.apply(self.currency_conv_func, axis=1)
        df.drop("amount", axis=1, inplace=True)

        # Strip the 'Account' part and map the country names to their two-letter codes
        df['accountcode'] = df['accountcode'].apply(lambda x: re.sub('Account', '', x))
        df.loc[(df['accountcode'] == 'UK'), 'accountcode'] = 'GB'
        df.loc[(df['accountcode'] == 'Mexico'), 'accountcode'] = 'MX'
        df.loc[(df['accountcode'] == 'Sweden'), 'accountcode'] = 'SE'

        return df

    def fit(self, X, y=None):
        """
        Fit all label encoders on a cleaned copy of X.

        :param X: raw dataframe
        :param y: ignored
        :return: self
        """
        df = X.copy(deep=True)
        df = self._clean_data(df)

        # 1. Card ID
        self.card_enc.fit(df['card_id'])

        # 2. IP ID
        self.ip_enc.fit(df['ip_id'])

        # 3. Country code (issuer and shopper codes together)
        self.country_code_enc.fit(self.country_codes)

        # 4. TX variant code
        self.tx_variant_code_enc.fit(df['txvariantcode'])

        # 5. Currency code
        self.currency_code_enc.fit(df['currencycode'])

        # 6. Shopper Interaction
        self.shopper_interaction_enc.fit(df['shopperinteraction'])

        # 7. Account code
        self.account_code_enc.fit(df['accountcode'])

        # 8. Card Verification Code Supplied
        self.card_verif_code_supplied.fit(df['cardverificationcodesupplied'])

        # 9. Email ID
        self.mail_id_enc.fit(df['mail_id'])

        # 10. Bin
        self.bin_enc.fit(df['bin'])

        return self

    def _encode_or_drop(self, df, column, encoder):
        # Sensitive column: drop it when drop_private_columns is set, otherwise label-encode it.
        if self.drop_private_columns:
            df.drop(column, axis=1, inplace=True)
        else:
            df[column] = encoder.transform(df[column])

    def _encode_unless_one_hot(self, df, column, encoder):
        # Label-encode the column, unless it is going to be one-hot encoded later on.
        if column not in self.one_hot_columns:
            df[column] = encoder.transform(df[column])

    def transform(self, X):
        """
        Clean, feature-engineer, encode, scale and one-hot encode a copy of X.

        :param X: raw dataframe
        :return: transformed dataframe
        """
        df = X.copy(deep=True)
        df = self._clean_data(df)

        # Feature Engineering
        if self.feature_eng_func is not None:
            df = self.feature_eng_func(df)

        # 1. Card ID and 2. IP ID
        self._encode_or_drop(df, 'card_id', self.card_enc)
        self._encode_or_drop(df, 'ip_id', self.ip_enc)

        # 3. Country code
        self._encode_unless_one_hot(df, 'issuercountrycode', self.country_code_enc)
        self._encode_unless_one_hot(df, 'shoppercountrycode', self.country_code_enc)

        # 4. TX variant code
        self._encode_unless_one_hot(df, 'txvariantcode', self.tx_variant_code_enc)

        # 5. Currency code
        self._encode_unless_one_hot(df, 'currencycode', self.currency_code_enc)

        # 6. Shopper Interaction
        self._encode_unless_one_hot(df, 'shopperinteraction', self.shopper_interaction_enc)

        # 7. Account code
        self._encode_unless_one_hot(df, 'accountcode', self.account_code_enc)

        # 8. Card Verification Code Supplied
        self._encode_unless_one_hot(
            df, 'cardverificationcodesupplied', self.card_verif_code_supplied
        )

        # 9. Email ID and 10. Bin
        self._encode_or_drop(df, 'mail_id', self.mail_id_enc)
        self._encode_or_drop(df, 'bin', self.bin_enc)

        # Scale the data
        # NOTE(review): the scaler is refitted on the data given to transform,
        # so separately transformed datasets are each scaled by their own
        # statistics; fit it in fit() if train-set statistics are wanted.
        if self.scaler is not None and len(self.columns_to_scale) > 0:
            values = np.array(df[self.columns_to_scale])
            if values.ndim == 1:
                # A single column name was given: the scaler needs a 2-D input
                scaled = self.scaler.fit_transform(values.reshape(-1, 1)).reshape(-1)
            else:
                scaled = self.scaler.fit_transform(values)
            df[self.columns_to_scale] = scaled

        if len(self.one_hot_columns) > 0:
            df = self.one_hot_encode(df)
        return df

    def one_hot_encode(self, df):
        """
        Replace the columns listed in ``one_hot_columns`` by their one-hot encoding.

        :param df: dataframe containing the columns to encode (they are dropped in place)
        :return: dataframe with the encoded columns appended
        """
        cols_to_encode = df[self.one_hot_columns]

        # scikit-learn renamed `sparse` to `sparse_output` and
        # `get_feature_names` to `get_feature_names_out`; support both spellings.
        try:
            encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            encoder = OneHotEncoder(sparse=False)
        encoded_cols = encoder.fit_transform(cols_to_encode)
        if hasattr(encoder, 'get_feature_names_out'):
            encoded_names = encoder.get_feature_names_out(self.one_hot_columns)
        else:
            encoded_names = encoder.get_feature_names(self.one_hot_columns)

        encoded_cols_df = pd.DataFrame(encoded_cols, columns=encoded_names, index=df.index)
        df.drop(self.one_hot_columns, axis=1, inplace=True)
        df = pd.concat([df, encoded_cols_df], axis=1)

        return df


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc


def train_classifier_and_plot(classifier, data_quad, cv_func=None):
    """
    Train a classifier, print its test metrics and show its confusion matrix.
    :param classifier: estimator offering fit, predict and predict_proba
    :param data_quad: (X_train, X_test, y_train, y_test), the order returned by train_test_split
    :param cv_func: optional function called as
        cv_func(X_train, y_train, classifier, scoring='accuracy', k=10);
        its return value is used as the trained classifier
    :return: (fpr, tpr, roc_auc, classifier) with the ROC curve computed on the test data
    """
    X_train_data, X_test_data, y_train_data, y_test_data = data_quad
    if cv_func is None:
        classifier.fit(X_train_data, y_train_data)
    else:
        # Use the training split that was passed in, not module-level X and y,
        # so that the held-out test data never takes part in training.
        classifier = cv_func(X_train_data, y_train_data, classifier,
                             scoring='accuracy',
                             k=10
        )

    y_predictions = classifier.predict(X_test_data)
    # Second probability column is used as the score for the ROC curve
    y_predictions_proba = classifier.predict_proba(X_test_data)[:, 1]

    print(classification_report(y_test_data, y_predictions))
    print(f"Accuracy: {accuracy_score(y_test_data, y_predictions) * 100:.2f}")

    fpr, tpr, _ = roc_curve(y_test_data, y_predictions_proba)
    roc_auc = auc(fpr, tpr)

    ConfusionMatrixDisplay.from_predictions(y_test_data, y_predictions)
    plt.show()
    return fpr, tpr, roc_auc, classifier


def plot_roc_curves(plot_title, *plot_tuples):
    """
    Draw several ROC curves on one figure and show it.
    :param plot_title: title of the figure
    :param plot_tuples: one tuple per curve, read positionally as
        (x values, y values, legend label)
    """
    plt.clf()  # start from an empty figure
    for curve in plot_tuples:
        x_values, y_values, legend_label = curve[0], curve[1], curve[2]
        plt.plot(x_values, y_values, label=legend_label)
    plt.title(plot_title)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

In [7]:
# Read the train and test CSV files from ./data (relative to the working
# directory) into dataframes indexed by their 'Id' column
import os

train_data_path = os.path.join(os.getcwd(), "data", "train_data.csv")
test_data_path = os.path.join(os.getcwd(), "data", "test_data.csv")

train_data_df = pd.read_csv(train_data_path).set_index('Id')
test_data_df = pd.read_csv(test_data_path).set_index('Id')

# Tag every row with its origin, then stack both sets into one dataframe so
# they can be preprocessed together and separated again afterwards
train_data_df['is_train'] = 1
test_data_df['is_train'] = 0
data_df = pd.concat((train_data_df, test_data_df), sort=False)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\micha\\Documents\\Przegrane\\Delft\\masters\\CDA\\deliverable\\data\\train_data.csv'

### The code below may take a while to run. You can reduce the number of features to swap to speed up the algorithm.

In [None]:
from sklearn.model_selection import train_test_split

# One shared converter: conv is called once per row, and building a new
# CurrencyConverter for every row repeats its set-up work each time.
_currency_converter = CurrencyConverter()


def conv(row, data_conv=date(2023, 1, 2)):
    """
    Convert the amount of a transaction to EUR.
    :param row: row with the fields 'amount' and 'currencycode'
    :param data_conv: date whose exchange rate is used
    :return: amount in EUR
    """
    return _currency_converter.convert(row['amount'], row['currencycode'], 'EUR', date=data_conv)


# Aggregated usage counts added by advanced_feature_eng that get rank-swapped
count_features = ['card_email_count', 'card_ip_count', 'email_card_count',
                  'ip_card_count', 'email_ip_count', 'ip_email_count', 'card_count',
                  'email_count', 'ip_count']

# 'label' is deliberately not scaled: it is the classification target.
scaled_columns = ['issuercountrycode', 'txvariantcode', 'bin', 'currencycode',
                  'shoppercountrycode', 'shopperinteraction',
                  'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode',
                  'amount_eur', 'countries_equal', 'currency_equal',
                  *count_features, 'bin_count']

# Regular pipeline. advanced_feature_eng has to run inside the transformer,
# because the scaled and swapped columns include the features it creates.
pipeline = Pipeline([
    ('custom_transform', CustomDataTransformer(conv, feature_engineering_func=advanced_feature_eng,
                                               columns_to_scale=scaled_columns)),  # apply the custom function
])
# Fit the pipeline on train and test together so the encoders know every category
pipeline = pipeline.fit(data_df)
transformed_data_df = pipeline.transform(data_df)

# Split the combined dataframe back into its train and test parts
transformed_train_data_df = transformed_data_df[transformed_data_df['is_train'] == 1].copy()
transformed_train_data_df.drop(['is_train'], axis=1, inplace=True)
transformed_test_data_df = transformed_data_df[transformed_data_df['is_train'] == 0].copy()
transformed_test_data_df.drop(['is_train'], axis=1, inplace=True)
transformed_test_data_df.drop(['label'], axis=1, inplace=True)


# Perform swapping
df_swapped = rank_swap_custom(transformed_train_data_df.copy(),
                              features_to_swap=["amount_eur", *count_features],
                              swap_range=0.05)

X = transformed_train_data_df.drop(['label'], axis=1)
y = transformed_train_data_df['label']

train_test_pairs = train_test_split(X, y, test_size=0.2, random_state=42)

X_swapped = df_swapped.drop(['label'], axis=1)
y_swapped = df_swapped['label']

# Same test_size and random_state as for the unswapped split
train_test_pairs_swapped = train_test_split(X_swapped, y_swapped, test_size=0.2, random_state=42)



### 3b. Analyse the performance of the classifiers. Explain which method performs best.
From the code below, it is clear that the classifiers perform similarly on the swapped and unswapped data. Classifiers that fail at the classification task (KNN and logistic regression) perform similarly badly on both datasets. However, surprisingly, Random Forest works better with unswapped data.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Distance-weighted 10-nearest-neighbours classifier, trained and scored on
# the unswapped split
knn_classifier = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    n_jobs=-1,
)
knn_fpr, knn_tpr, knn_roc_auc, _ = train_classifier_and_plot(
    classifier=knn_classifier,
    data_quad=train_test_pairs,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, trained and scored on the unswapped split
logistic_regression_classifier = LogisticRegression(random_state=0)
(
    logistic_regression_fpr,
    logistic_regression_tpr,
    logistic_regression_roc_auc,
    _,
) = train_classifier_and_plot(
    classifier=logistic_regression_classifier,
    data_quad=train_test_pairs,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Class-weight-balanced random forest with a fixed seed, trained and scored
# on the unswapped split
random_forest_classifier = RandomForestClassifier(
    random_state=0,
    class_weight='balanced',
)
(
    random_forest_fpr,
    random_forest_tpr,
    random_forest_roc_auc,
    _,
) = train_classifier_and_plot(
    classifier=random_forest_classifier,
    data_quad=train_test_pairs,
)

In [None]:
# Plot roc curves; every legend label names the classifier whose curve it describes
plot_roc_curves("ROC curves for different blackbox classifiers",
                (knn_fpr, knn_tpr, f"KNN (AUC = {knn_roc_auc:.4f})"),
                (logistic_regression_fpr, logistic_regression_tpr, f"Logistic regression (AUC = {logistic_regression_roc_auc:.4f})"),
                (random_forest_fpr, random_forest_tpr, f"Random forest (AUC = {random_forest_roc_auc:.4f})")
)

### Analyzing the performance of different classifiers on the swapped data

In [8]:
# KNN: refit the same classifier object, this time on the rank-swapped split
knn_fpr, knn_tpr, knn_roc_auc, _ = train_classifier_and_plot(
    classifier=knn_classifier,
    data_quad=train_test_pairs_swapped,
)

NameError: name 'knn_classifier' is not defined

In [9]:
# Logistic regression: refit the same classifier object on the rank-swapped split
(
    logistic_regression_fpr,
    logistic_regression_tpr,
    logistic_regression_roc_auc,
    _,
) = train_classifier_and_plot(
    classifier=logistic_regression_classifier,
    data_quad=train_test_pairs_swapped,
)


NameError: name 'logistic_regression_classifier' is not defined

In [None]:

# Random forest: refit the same classifier object on the rank-swapped split
(
    random_forest_fpr,
    random_forest_tpr,
    random_forest_roc_auc,
    _,
) = train_classifier_and_plot(
    classifier=random_forest_classifier,
    data_quad=train_test_pairs_swapped,
)

In [None]:
# Plot roc curves
plot_roc_curves("ROC curves for different blackbox classifiers",
                (knn_fpr, knn_tpr, f"Gradient Boosted Trees (AUC = {knn_roc_auc:.4f})"),
                (logistic_regression_fpr, logistic_regression_tpr, f"AdaBoost (AUC = {logistic_regression_roc_auc:.4f})"),
                (random_forest_fpr, random_forest_tpr, f"Random forest (AUC = {random_forest_roc_auc:.4f})")
)

### Can you explain the performance difference for the different classifiers? Is it advisable to protect people’s privacy using rank-swapping? Why (not)?
As you can see, after performing rank swapping, a difference in performance is only visible for the random forest classifier, which surprisingly performs better with the swapped features. This might be due to less overfitting taking place. For the other classifiers, the results stay the same. Given these insignificant changes to the performance of the classifiers, it is safe to say that rank swapping is a good way to protect people's privacy. However, if the data contains categorical features that cannot be ordered, rank swapping cannot be performed on them. In such cases, other methods, such as adding noise to the data, can be used to protect people's privacy.