In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns

def header(text):
    """Print ``text`` as a bold, bright-blue heading using ANSI escape codes.

    The start sequence, ``text`` and the reset sequence are passed to
    ``print`` as three separate arguments, so they come out separated by
    single spaces.
    """
    bold_blue = "\033[94m\033[1m"  # bright-blue foreground + bold
    reset = "\033[0m"              # restore default terminal attributes
    print(bold_blue, text, reset)

def num_nulls(df, perc=False) -> dict:
    """Count the missing values in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    perc : bool, default False
        When truthy, report the share of missing values per column as a
        percentage (0-100) instead of an absolute count.

    Returns
    -------
    dict
        Maps each column label to an ``int`` count, or to a ``float``
        percentage when ``perc`` is set. For a frame with zero rows every
        percentage is reported as ``0.0`` rather than dividing by zero.
    """
    n_rows = len(df)  # identical for every column, so compute it once
    nulls_dict = {}
    for col in df.columns:
        null_count = int(df[col].isnull().sum())
        if perc:
            # guard the empty-frame case: no rows means nothing is missing
            nulls_dict[col] = (null_count / n_rows) * 100 if n_rows else 0.0
        else:
            nulls_dict[col] = null_count
    return nulls_dict

def split_features(df, split_by):
    """Separate the ``split_by`` column(s) from the rest of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    split_by : label or list of labels
        Column(s) to pull out. A single label yields a Series, a list of
        labels yields a DataFrame.

    Returns
    -------
    tuple
        ``(selected, remainder)`` - the requested column(s) first, then
        ``df`` without them.
    """
    remainder = df.drop(columns=split_by)
    selected = df.loc[:, split_by]
    return selected, remainder

def extract_df(df):
    """Split ``df`` into its numeric columns and all remaining columns.

    Numeric columns are those matched by ``df.select_dtypes('number')``.

    Returns
    -------
    tuple
        ``(numeric, cat)`` - a frame holding the numeric columns and a
        frame holding every other column of ``df``.
    """
    num_cols = list(df.select_dtypes('number').columns)
    # split_features returns (selected columns, remaining columns)
    numeric, cat = split_features(df, split_by=num_cols)
    return numeric, cat

def preprocess_data(
        df,
        target:str,
        remove_nulls_threshold:int,
        remove_nulls=False,
        interpolate=False,
        standardise=False,
        exclude_from_scaling=('TransactionID',),
        ) -> pd.DataFrame:
    """Clean the feature columns of ``df`` and re-attach the target column.

    Steps, in order:

    1. Split ``target`` from the remaining (feature) columns.
    2. If the target has missing values, interpolate them; otherwise report
       when it is not 0/1 encoded, or convert it to a numeric dtype when it is.
    3. Drop feature columns that do not have more than one distinct value
       (as counted by ``DataFrame.nunique``).
    4. Replace +/-inf, ``None`` and the strings ``'NULL'``/``'Null'``/``'null'``
       with NaN.
    5. Optionally interpolate the features.
    6. Optionally drop features whose NaN percentage is at least
       ``remove_nulls_threshold``.
    7. Optionally standardise numeric features with ``StandardScaler``.
    8. Encode bool columns and ``'T'``/``'F'`` labels as 1/0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing the features and the ``target`` column.
        It is not modified.
    target : str
        Name of the target column.
    remove_nulls_threshold : int
        NaN percentage (0-100) at or above which a feature column is dropped.
        Only used when ``remove_nulls`` is truthy.
    remove_nulls : bool, default False
        Drop feature columns that reach ``remove_nulls_threshold``.
    interpolate : bool, default False
        Interpolate missing feature values (``limit_area='inside'``).
    standardise : bool, default False
        Standardise each numeric feature column independently.
    exclude_from_scaling : str or iterable of str, default ('TransactionID',)
        Numeric columns that are left unscaled when ``standardise`` is set.

    Returns
    -------
    pandas.DataFrame
        The processed feature columns with the target column appended last.
    """
    # split independent & target:
    y, Xs = split_features(df, target)

    # number of independent features before any are dropped:
    k = len(Xs.columns)

    # work on Y feature:
    if y.isnull().any():
        # interpolate() returns a new Series; re-bind it so the result is kept
        y = y.interpolate(limit_area='inside')
        print('Target Feature has missing values being interpolated.')
    elif not set(y.unique()).issubset({0, 1}):
        # True/False and 0.0/1.0 compare equal to 0/1 and are accepted as well
        print('Target Feature is not binary.')
    else:
        y = pd.to_numeric(y)

    # remove any columns with constant values
    Xs = Xs[Xs.columns[Xs.nunique() > 1]]
    header('Number of Columns with constant values dropped:')
    print(f'{k - len(Xs.columns)}')

    # replace any non-NaN missing values with NaN
    Xs = Xs.replace([float('inf'), float('-inf'), None, 'NULL', 'Null', 'null'], float('nan'))

    if interpolate:
        Xs = Xs.interpolate(limit_area='inside')
        header('Features with missing values interpolated.')

    # remove any columns whose NaN percentage reaches the threshold
    if remove_nulls:
        null_dict = num_nulls(Xs, perc=True)
        nullcols = [col for col, pct in null_dict.items() if pct >= remove_nulls_threshold]
        Xs = Xs.drop(columns=nullcols)
        header(f'Number of Columns with >= {remove_nulls_threshold}% NaN dropped:')
        print(f'{len(nullcols)}')

    # standardise numerical columns
    if standardise:
        z = StandardScaler()
        if isinstance(exclude_from_scaling, str):
            skip_cols = {exclude_from_scaling}
        else:
            skip_cols = set(exclude_from_scaling)
        # scale one column at a time so only a single column is copied at once
        for j in Xs.select_dtypes('number').columns:
            if j in skip_cols:
                continue
            Xs[j] = z.fit_transform(Xs[j].to_numpy().reshape(-1, 1)).ravel()

    # encode any bool columns to 1/0
    bool_cols = Xs.columns[Xs.dtypes == 'bool']
    if len(bool_cols) > 0:
        Xs[bool_cols] = Xs[bool_cols].astype('int64')

    # encode any strings labelled T/F to 1/0; numeric columns cannot hold
    # these labels, so only the non-numeric columns are scanned
    for x in Xs.select_dtypes(exclude='number').columns:
        if Xs[x].isin(['T', 'F']).any():
            Xs[x] = Xs[x].replace({'T': 1, 'F': 0})

    # rejoin Xs and Y features
    newdf = pd.concat([Xs, y], axis=1)

    return newdf



In [None]:
# Load the raw training transactions (path is relative to the notebook's
# working directory).
transaction_train = pd.read_csv('train_transaction.csv')

# Clean the table with 'isFraud' as the target: null-heavy columns are removed
# using a threshold of 50 (percent), numeric features are standardised, and
# feature interpolation is left switched off.
cleaned_transaction_train = preprocess_data(
    df=transaction_train,
    target='isFraud',
    remove_nulls_threshold=50,
    remove_nulls=True,
    standardise=True,
    interpolate=False,
)

display(cleaned_transaction_train)