In [1]:
import pandas as pd
# Lift the column cap so wide DataFrames are displayed with every column.
pd.set_option('display.max_columns', None)
from pycaret.classification import *
import mlflow

In [2]:
# Load the raw transactions.
df = pd.read_csv("data/fraudTrain.csv")

# Rows without a label cannot be used for supervised training.
df = df.dropna(subset=['is_fraud'])

# Cast the label straight to a small integer dtype.
# Going through `.astype('category').cat.codes` renumbers whatever values are
# present in sorted order, so a file that happens to contain only label 1
# would silently come out as all 0s. A direct cast keeps the stored 0/1
# values exactly as they are (and fails loudly on non-numeric labels).
df['is_fraud'] = df['is_fraud'].astype('int8')

print(f"data shape : {df.shape}")

df.head()

data shape : (344517, 22)


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0


In [3]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import pygeohash as gh


def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Derive date, time and distance features from raw transaction rows.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'trans_date_trans_time' (strings made of a
        date and a time separated by a single space), 'dob', 'lat', 'long',
        'merch_lat', 'merch_long' and 'is_fraud'.

    Returns
    -------
    pd.DataFrame
        A new frame in which
        * 'trans_date' (datetime) and 'trans_time' (string) have been split
          out of 'trans_date_trans_time',
        * 'dob' has been parsed to datetime,
        * 'customer_merchant_distance' holds the distance in miles between
          the ('lat', 'long') and ('merch_lat', 'merch_long') points,
        * the four coordinate columns and 'trans_date_trans_time' are removed,
        * 'is_fraud' is the last column.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Split 'trans_date_trans_time' on the space into its date and time parts.
    # Assigning both columns at once raises if a value does not split into
    # exactly two parts.
    df[['trans_date', 'trans_time']] = df['trans_date_trans_time'].str.split(' ', expand=True)

    # Parse the date columns once; 'trans_time' intentionally stays a string.
    df['trans_date'] = pd.to_datetime(df['trans_date'])
    df['dob'] = pd.to_datetime(df['dob'])

    # Distance in miles between customer and merchant, computed with geopy's
    # `geodesic` (this is not the haversine formula). Walking the four
    # coordinate columns in lockstep avoids materialising a Series per row,
    # which is what `df.apply(..., axis=1)` would do.
    df['customer_merchant_distance'] = [
        geodesic((lat, lon), (merch_lat, merch_lon)).miles
        for lat, lon, merch_lat, merch_lon in zip(
            df['lat'], df['long'], df['merch_lat'], df['merch_long']
        )
    ]

    # The raw coordinates and the combined timestamp are now redundant.
    df = df.drop(columns=['lat', 'long', 'merch_lat', 'merch_long', 'trans_date_trans_time'])

    # Keep the target as the right-most column.
    return df[df.columns.drop('is_fraud').tolist() + ['is_fraud']]

# Build the engineered feature frame and rebind it to `df`.
# NOTE: feature_engineering() reads and then drops 'trans_date_trans_time',
# so running this cell again on the already-transformed `df` raises a
# KeyError; re-run the data-loading cell first.
df = feature_engineering(df)

df.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,city_pop,job,dob,trans_num,unix_time,trans_date,trans_time,customer_merchant_distance,is_fraud
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,2019-01-01,00:00:18,48.947783,0
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,2019-01-01,00:00:44,18.775736,0
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,2019-01-01,00:00:51,67.172035,0
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,2019-01-01,00:01:16,59.455974,0
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,2019-01-01,00:03:06,48.28203,0


In [10]:
# Make sure both date columns carry a datetime dtype.
date_columns = ['trans_date', 'dob']
for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column])

In [11]:
# Print the per-column non-null counts and dtypes of the engineered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 344517 entries, 0 to 344516
Data columns (total 20 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   cc_num                      344517 non-null  int64         
 1   merchant                    344517 non-null  object        
 2   category                    344517 non-null  object        
 3   amt                         344517 non-null  float64       
 4   first                       344517 non-null  object        
 5   last                        344517 non-null  object        
 6   gender                      344517 non-null  object        
 7   street                      344517 non-null  object        
 8   city                        344517 non-null  object        
 9   state                       344517 non-null  object        
 10  zip                         344517 non-null  int64         
 11  city_pop                    344517 non-null 

In [4]:
# Number of distinct values in each object/category column, keyed by column name.
unique_values = {
    name: df[name].nunique()
    for name in df.select_dtypes(include=['object', 'category'])
}

print(unique_values)

{'merchant': 693, 'category': 14, 'first': 341, 'last': 472, 'gender': 2, 'street': 936, 'city': 859, 'state': 51, 'job': 481, 'dob': 922, 'trans_num': 344517, 'trans_date': 161, 'trans_time': 84170}


In [5]:
# Columns of `df` that were not counted in `unique_values`, in frame order.
[column_name for column_name in df.columns if column_name not in unique_values]

['cc_num',
 'amt',
 'zip',
 'city_pop',
 'unix_time',
 'customer_merchant_distance',
 'is_fraud']

In [6]:
# Point MLflow at the "mlops-pipeline" experiment; the same name is passed as
# `experiment_name` to the PyCaret setup() call further down.
mlflow.set_experiment("mlops-pipeline")

<Experiment: artifact_location='file:///home/ubuntu/projects/credit-card-fraud/mlruns/318242459975068357', creation_time=1719917263668, experiment_id='318242459975068357', last_update_time=1719917263668, lifecycle_stage='active', name='mlops-pipeline', tags={}>

In [12]:
# Configure the PyCaret classification experiment on the engineered frame.
data = setup(
    data = df,
    target = 'is_fraud',
    # Fixed seed: without `session_id` PyCaret draws a new random seed on
    # every call, so the split and everything downstream differs per run.
    session_id = 42,
    # Identifiers, free-text personal fields and raw time columns.
    # 'merch_lat' / 'merch_long' are not listed because feature_engineering()
    # has already removed them from `df`.
    ignore_features = ['first', 'last', 'street', 'unix_time', 'cc_num', 'trans_num', 'zip', 'trans_time'],
    date_features = ['trans_date', 'dob'],
    categorical_features = ['merchant', 'city', 'state', 'job'],
    numeric_features = ['amt', 'customer_merchant_distance'],
    # NOTE(review): 'category' is encoded as ordinal using the order listed
    # here; confirm that an ordering is really intended, otherwise it belongs
    # in `categorical_features`.
    ordinal_features = {
        'category' : ['misc_net', 'grocery_pos', 'entertainment', 'gas_transport', 'misc_pos', 'grocery_net', 'shopping_net', 'shopping_pos', 'food_dining', 'personal_care', 'health_fitness', 'travel', 'kids_pets', 'home'],
        'gender' : ['F', 'M'],
    },
    train_size = 0.9,
    preprocess = True,
    imputation_type = "iterative",
    iterative_imputation_iters = 10,
    numeric_iterative_imputer = "lightgbm",
    categorical_iterative_imputer = "lightgbm",
    remove_outliers = True,
    feature_selection = True,
    normalize = True,
    fix_imbalance = True,
    remove_multicollinearity = True,
    verbose = False,
    # Log the run (and the data, via `log_data`) under the experiment name
    # that was also given to mlflow.set_experiment().
    log_experiment = True,
    experiment_name = 'mlops-pipeline',
    log_data = True,
)

[LightGBM] [Info] Number of positive: 293751, number of negative: 293751
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.086334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3567
[LightGBM] [Info] Number of data points in the train set: 587502, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [14]:
# Display the object returned by setup() (its repr is shown below).
data

<pycaret.classification.oop.ClassificationExperiment at 0x7afa3b4cb460>