In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Remove the cap on the number of displayed columns and set the display
# width to 1000 characters.
for option_name, option_value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

In [3]:
# Read fraudTrain.csv (path relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='fraudTrain.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
# Parse the transaction timestamp column into datetime64 values
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Take the birth year from `dob`; pop() removes the column from df in place,
# so no temporary helper columns need to be created and dropped afterwards
birth_year = pd.to_datetime(df.pop('dob')).dt.year

# Age is the difference between the transaction year and the birth year
# (calendar-year difference only; month and day are not considered)
df['age'] = df['trans_date_trans_time'].dt.year - birth_year

# Preview the updated dataset
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,trans_num,unix_time,merch_lat,merch_long,is_fraud,age
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,31
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,41
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,57
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,52
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,33


In [6]:
# Columns excluded from modelling. 'Unnamed: 0' presumably duplicates the row
# index (it matches the index in the preview) - TODO confirm on the full file.
irrelevant_columns = ['Unnamed: 0', 'cc_num', 'trans_num', 'street']

# drop() returns a new frame, so `df` itself keeps these columns.
df_cleaned = df.drop(labels=irrelevant_columns, axis='columns')

In [7]:
# Preview the first five rows after removing the irrelevant columns.
df_cleaned.head(n=5)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,age
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0,31
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0,41
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0,57
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1325376076,47.034331,-112.561071,0,52
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,38.674999,-78.632459,0,33


In [8]:
# Count the null entries in every column and print the per-column totals
null_counts = df_cleaned.isna().sum()
print(null_counts)

# Replace any null entry with 0, modifying df_cleaned in place
df_cleaned.fillna(value=0, inplace=True)

trans_date_trans_time    0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
age                      0
dtype: int64


In [9]:
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points on a sphere.

    The computation uses NumPy ufuncs, so every coordinate may be a scalar
    or an array-like (for example a whole DataFrame column); array inputs
    are broadcast against each other and the result has the broadcast shape.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, default 6371
        Radius of the sphere. The result is expressed in the same unit:
        6371 is the Earth's radius in kilometers, use 3956 for miles.

    Returns
    -------
    float or array-like
        Distance along the surface of the sphere between the two points.
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which is outside the domain of arcsin.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius

# Compute the haversine distance between (lat, long) and
# (merch_lat, merch_long) for every row. The four columns are iterated in
# lockstep, which produces the same per-row haversine() result as
# DataFrame.apply(axis=1) without constructing a Series object for each row.
coordinate_rows = zip(
    df_cleaned['lat'],
    df_cleaned['long'],
    df_cleaned['merch_lat'],
    df_cleaned['merch_long'],
)
df_cleaned['distance'] = [
    haversine(lat, lon, merch_lat, merch_lon)
    for lat, lon, merch_lat, merch_lon in coordinate_rows
]

# Preview the updated data with the distance column
df_cleaned[['lat', 'long', 'merch_lat', 'merch_long', 'distance']].head()

Unnamed: 0,lat,long,merch_lat,merch_long,distance
0,36.0788,-81.1781,36.011293,-82.048315,78.597568
1,48.8878,-118.2105,49.159047,-118.186462,30.212176
2,42.1808,-112.262,43.150704,-112.154481,108.206083
3,46.2306,-112.1138,47.034331,-112.561071,95.673231
4,38.4207,-79.4629,38.674999,-78.632459,77.556744


In [10]:
# How many equal-width intervals each coordinate column is split into
n_bins = 10

# For every coordinate column add a '<column>_bucket' column holding the
# integer index of the bin each value falls into (labels=False)
for coord_col in ('lat', 'long', 'merch_lat', 'merch_long'):
    df_cleaned[coord_col + '_bucket'] = pd.cut(df_cleaned[coord_col], bins=n_bins, labels=False)

# Show each coordinate next to its bucket index
preview_cols = [
    'lat', 'lat_bucket',
    'long', 'long_bucket',
    'merch_lat', 'merch_lat_bucket',
    'merch_long', 'merch_long_bucket',
]
df_cleaned[preview_cols].head()

Unnamed: 0,lat,lat_bucket,long,long_bucket,merch_lat,merch_lat_bucket,merch_long,merch_long_bucket
0,36.0788,3,-81.1781,8,36.011293,3,-82.048315,8
1,48.8878,6,-118.2105,4,49.159047,6,-118.186462,4
2,42.1808,4,-112.262,5,43.150704,4,-112.154481,5
3,46.2306,5,-112.1138,5,47.034331,5,-112.561071,5
4,38.4207,3,-79.4629,8,38.674999,4,-78.632459,8


In [11]:
# Preview the first five rows, now including the distance and bucket columns.
df_cleaned.head(n=5)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,age,distance,lat_bucket,long_bucket,merch_lat_bucket,merch_long_bucket
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0,31,78.597568,3,8,3,8
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0,41,30.212176,6,4,6,4
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0,57,108.206083,4,5,4,5
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1325376076,47.034331,-112.561071,0,52,95.673231,5,5,5,5
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,38.674999,-78.632459,0,33,77.556744,3,8,4,8


In [12]:
# Text columns that are replaced by integer codes
categorical_columns = ['merchant', 'category', 'gender', 'job']

# Fit one LabelEncoder per column and keep the fitted encoders, keyed by
# column name, in `label_encoders`
label_encoders = {
    column_name: LabelEncoder().fit(df_cleaned[column_name])
    for column_name in categorical_columns
}

# Overwrite each column with the integer codes from its fitted encoder
for col, le in label_encoders.items():
    df_cleaned[col] = le.transform(df_cleaned[col])

In [13]:
# Preview the first five rows after label encoding the categorical columns.
df_cleaned.head(n=5)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,age,distance,lat_bucket,long_bucket,merch_lat_bucket,merch_long_bucket
0,2019-01-01 00:00:18,514,8,4.97,Jennifer,Banks,0,Moravian Falls,NC,28654,36.0788,-81.1781,3495,370,1325376018,36.011293,-82.048315,0,31,78.597568,3,8,3,8
1,2019-01-01 00:00:44,241,4,107.23,Stephanie,Gill,0,Orient,WA,99160,48.8878,-118.2105,149,428,1325376044,49.159047,-118.186462,0,41,30.212176,6,4,6,4
2,2019-01-01 00:00:51,390,0,220.11,Edward,Sanchez,1,Malad City,ID,83252,42.1808,-112.262,4154,307,1325376051,43.150704,-112.154481,0,57,108.206083,4,5,4,5
3,2019-01-01 00:01:16,360,2,45.0,Jeremy,White,1,Boulder,MT,59632,46.2306,-112.1138,1939,328,1325376076,47.034331,-112.561071,0,52,95.673231,5,5,5,5
4,2019-01-01 00:03:06,297,9,41.96,Tyler,Garcia,1,Doe Hill,VA,24433,38.4207,-79.4629,99,116,1325376186,38.674999,-78.632459,0,33,77.556744,3,8,4,8


In [14]:
# Same five-row preview as the previous cell; df_cleaned has not changed since.
df_cleaned.head(n=5)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,age,distance,lat_bucket,long_bucket,merch_lat_bucket,merch_long_bucket
0,2019-01-01 00:00:18,514,8,4.97,Jennifer,Banks,0,Moravian Falls,NC,28654,36.0788,-81.1781,3495,370,1325376018,36.011293,-82.048315,0,31,78.597568,3,8,3,8
1,2019-01-01 00:00:44,241,4,107.23,Stephanie,Gill,0,Orient,WA,99160,48.8878,-118.2105,149,428,1325376044,49.159047,-118.186462,0,41,30.212176,6,4,6,4
2,2019-01-01 00:00:51,390,0,220.11,Edward,Sanchez,1,Malad City,ID,83252,42.1808,-112.262,4154,307,1325376051,43.150704,-112.154481,0,57,108.206083,4,5,4,5
3,2019-01-01 00:01:16,360,2,45.0,Jeremy,White,1,Boulder,MT,59632,46.2306,-112.1138,1939,328,1325376076,47.034331,-112.561071,0,52,95.673231,5,5,5,5
4,2019-01-01 00:03:06,297,9,41.96,Tyler,Garcia,1,Doe Hill,VA,24433,38.4207,-79.4629,99,116,1325376186,38.674999,-78.632459,0,33,77.556744,3,8,4,8


In [15]:
# Remove the timestamp, name and location text columns together with the raw
# coordinates; the coordinates are already summarised by `distance` and the
# *_bucket columns.
columns_to_drop = [
    'trans_date_trans_time',
    'first', 'last',
    'city', 'state', 'zip',
    'lat', 'long',
    'merch_lat', 'merch_long',
]
df_cleaned = df_cleaned.drop(labels=columns_to_drop, axis='columns')

In [16]:
# Preview the first five rows of the remaining feature columns.
df_cleaned.head(n=5)

Unnamed: 0,merchant,category,amt,gender,city_pop,job,unix_time,is_fraud,age,distance,lat_bucket,long_bucket,merch_lat_bucket,merch_long_bucket
0,514,8,4.97,0,3495,370,1325376018,0,31,78.597568,3,8,3,8
1,241,4,107.23,0,149,428,1325376044,0,41,30.212176,6,4,6,4
2,390,0,220.11,1,4154,307,1325376051,0,57,108.206083,4,5,4,5
3,360,2,45.0,1,1939,328,1325376076,0,52,95.673231,5,5,5,5
4,297,9,41.96,1,99,116,1325376186,0,33,77.556744,3,8,4,8
