In [18]:
import numpy as np
import pandas as pd
import plotly.express as px
from pathlib import Path
from datetime import datetime as dt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn import tree
import pydotplus
from IPython.display import Image


In [19]:
## function to find the distance in kilometers between two geo-spatial co-ordinates.  Based on excample from Scikit Learn documentation
def get_distance(from_lat, from_lng, to_lat, to_lng):
    from sklearn.metrics.pairwise import haversine_distances
    from math import radians
    start =[]
    end =[]
    start.append(from_lat)
    start.append(to_lat)
    end.append(to_lat)
    end.append(to_lng)

    # bsas = [-34.83333, -58.5166646]
    # paris = [49.0083899664, 2.53844117956]
    start_in_radians = [radians(_) for _ in start]
    end_in_radians = [radians(_) for _ in end]
    # bsas_in_radians = [radians(_) for _ in bsas]
    # paris_in_radians = [radians(_) for _ in paris]
    result = haversine_distances([start_in_radians, end_in_radians])
    result * 6371000/1000  # multiply by Earth radius to get kilometers
    # array([[    0.        , 11099.54035582],
    # [11099.54035582,     0.        ]])
    return result
    

In [20]:
## function to assign the U.S. Bureau of Economic Analysis region to a state.
def get_region(state):
    
    states_to_bea_regions = {
        "AL": "Southeast",
        "AK": "Far West",
        "AZ": "Southwest",
        "AR": "Southeast",
        "CA": "Far West",
        "CO": "Rocky Mountain",
        "CT": "New England",
        "DE": "Mideast",
        "DC": "Mideast",
        "FL": "Southeast",
        "GA": "Southeast",
        "HI": "Far West",
        "ID": "Rocky Mountain",
        "IL": "Great Lakes",
        "IN": "Great Lakes",
        "IA": "Plains",
        "KS": "Plains",
        "KY": "Southeast",
        "LA": "Southeast",
        "ME": "New England",
        "MD": "Mideast",
        "MA": "New England",
        "MI": "Great Lakes",
        "MN": "Great Lakes",
        "MS": "Southeast",
        "MO": "Great Lakes",
        "MT": "Rocky Mountain",
        "NE": "Plains",
        "NV": "Southwest",
        "NH": "New England",
        "NJ": "Mideast",
        "NM": "Southwest",
        "NY": "Mideast",
        "NC": "Southeast",
        "ND": "Plains",
        "OH": "Great Lakes",
        "OK": "Southwest",
        "OR": "Far West",
        "PA": "Mideast",
        "RI": "New England",
        "SC": "Southeast",
        "SD": "Plains",
        "TN": "Southeast",
        "TX": "Southwest",
        "UT": "Rocky Mountain",
        "VT": "New England",
        "VA": "Mideast",
        "WA": "Far West",
        "WV": "Southeast",
        "WI": "Great Lakes",
        "WY": "Rocky Mountain"
    }
    region = states_to_bea_regions.get(state)
    return region



This dataset offers a variety of attributes valuable for comprehensive analysis. It contains 555,719 instances and 22 attributes, a mix of categorical and numerical data types. Importantly, the dataset is complete with no null values. Here's a breakdown of the attributes:
1. 
Trans_date_trans_time: Timestamp of the transaction (date and time  
2. .
Cc_n um:Unique customer identification numb  
3. r.
Merc: hantThe merchant involved in the transact  
4. on.
Cat:egory Transaction type (e.g., personal, childc  
5. re). 
Amt:Transaction   
6. ount.
First: Cardholder's firs  
7.  nam:e.
Last Cardholder's la  
8. t name.
Gender: Cardholder'  
9.  gender.
Street: Cardholder's stree  
10.  address.
City: Cardholder's city of  
11. residence.
State: Cardholder's state o  
12.  residence.
Zip: Cardholde  
13. 's z ip code.
Lat:Latitude of cardhold  
14. r's location.
Long: Longitude of cardhol  
15. er's loca tion.
City_pop:Population of the ca  
16. dhol der's city.
Job:Cardh.  
17. der's job title.
Dob: Cardholde  
18. 's date of birth.
Trans_num: Unique tran  
19. action identifier.
Unix_time: Transaction time  
20. tamp (Unix  format).
Merch_lat:Merchant's  
21. location (latitude).
Merch_long: Merchant's  
22. location  (longitude).
Is_fraud:Fraudulent transaction indicator (1 = f**raud, 0 = legitimate). This is the target variable for c**lassification purposes.

In [21]:
fraud_test_df = pd.read_csv('resources/fraud test.csv')

In [22]:
fraud_test_df.describe(include='all')

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719,555719.0,555719,555719,555719.0,555719,555719,555719,555719,...,555719.0,555719.0,555719.0,555719,555719,555719,555719.0,555719.0,555719.0,555719.0
unique,,226976,,693,14,,341,471,2,924,...,,,,478,910,555719,,,,
top,,15/12/2020 21:26,,fraud_Kilback LLC,gas_transport,,Christopher,Smith,F,444 Robert Mews,...,,,,Film/video editor,23/03/1977,2da90c7d74bd46a0caf3777415b3ebd3,,,,
freq,,16,,1859,56370,,11443,12146,304886,1474,...,,,,4119,2408,1,,,,
mean,277859.0,,4.178387e+17,,,69.39281,,,,,...,38.543253,-90.231325,88221.89,,,,1380679000.0,38.542798,-90.23138,0.00386
std,160422.401459,,1.309837e+18,,,156.745941,,,,,...,5.061336,13.72178,300390.9,,,,5201104.0,5.095829,13.733071,0.062008
min,0.0,,60416210000.0,,,1.0,,,,,...,20.0271,-165.6723,23.0,,,,1371817000.0,19.027422,-166.671575,0.0
25%,138929.5,,180043000000000.0,,,9.63,,,,,...,34.6689,-96.798,741.0,,,,1376029000.0,34.755302,-96.905129,0.0
50%,277859.0,,3521420000000000.0,,,47.29,,,,,...,39.3716,-87.4769,2408.0,,,,1380762000.0,39.376593,-87.445204,0.0
75%,416788.5,,4635330000000000.0,,,83.01,,,,,...,41.8948,-80.1752,19685.0,,,,1385867000.0,41.954163,-80.264637,0.0


In [23]:
fraud_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  float64
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

# Preprocessing and Feature Engineering

In [24]:
fraud_test_df.rename(columns={'Unnamed: 0':'ID'}, inplace=True)
fraud_test_df.set_index('ID', inplace=True, drop=True, verify_integrity ='True')

In [25]:
#creating cardholder age feature
fraud_test_df['date_dob'] = pd.to_datetime(fraud_test_df['dob'], format='%d/%m/%Y')
fraud_test_df['dt_trans_date_time'] = pd.to_datetime(fraud_test_df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
fraud_test_df['age_years'] = (dt.today()- fraud_test_df['date_dob'])/pd.Timedelta('365 days')
fraud_test_df.head()

Unnamed: 0_level_0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,date_dob,dt_trans_date_time,age_years
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,21/06/2020 12:14,2291160000000000.0,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,Mechanical engineer,19/03/1968,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0,1968-03-19,2020-06-21 12:14:00,56.114713
1,21/06/2020 12:14,3573030000000000.0,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,"Sales professional, IT",17/01/1990,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0,1990-01-17,2020-06-21 12:14:00,34.268137
2,21/06/2020 12:14,3598220000000000.0,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,"Librarian, public",21/10/1970,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0,1970-10-21,2020-06-21 12:14:00,53.522932
3,21/06/2020 12:15,3591920000000000.0,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,Set designer,25/07/1987,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0,1987-07-25,2020-06-21 12:15:00,36.753069
4,21/06/2020 12:15,3526830000000000.0,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,Furniture designer,06/07/1955,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0,1955-07-06,2020-06-21 12:15:00,68.827042


In [30]:
fraud_test_df[''].value_counts()

904

In [10]:
#creating distance between cardholder and merchant feature and economic region feature from the state abbreviation.
for index, row in fraud_test_df.iterrows():
    # print(row["firstname"])
    fraud_test_df.at[index, 'distance_km'] = get_distance(row['lat'], row['long'], row['merch_lat'], row['merch_long'])[0][1]
    fraud_test_df.at[index, 'region'] = get_region(row['state'])

In [11]:
# creating state regions based on Bureau of Economic Regions:


In [12]:
fraud_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 555719 entries, 0 to 555718
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   trans_date_trans_time  555719 non-null  object        
 1   cc_num                 555719 non-null  float64       
 2   merchant               555719 non-null  object        
 3   category               555719 non-null  object        
 4   amt                    555719 non-null  float64       
 5   first                  555719 non-null  object        
 6   last                   555719 non-null  object        
 7   gender                 555719 non-null  object        
 8   street                 555719 non-null  object        
 9   city                   555719 non-null  object        
 10  state                  555719 non-null  object        
 11  zip                    555719 non-null  int64         
 12  lat                    555719 non-null  float64  

In [None]:
fraud

## Feature Encoding
1. get_dummies: Gender  
2. one_hot_encoder: Categories
3. Target_encoder:  city, jobs
4. binning: states



In [13]:
fraud_test_df.to_csv('G:\My Drive\Boot Camp\Project_4\\fraud_test_extended.csv', sep =',')

In [None]:
fraud_extended = fraud_test_df.copy()

In [14]:
fraud_test_df['region'].value_counts()

region
Southeast         134330
Great Lakes       113327
Mideast           105947
Southwest          65835
Plains             43666
Far West           41995
Rocky Mountain     26540
New England        24079
Name: count, dtype: int64

## Splitting Training and Testing data
y = is_fraud  

Dropped from X features:
1. is_fraud
2. cc_num
3. first
4. last
5. street
6. dob
7. trans_date_trans_time
8. lat
9. long
10. merch_lat
11. merch_long
12. city
13. trans_num

In [None]:
y = fraud_extended['is_fraud']
X = fraud_extended.copy()
X.drop(['is_fraud', 'cc_num', 'first', 'last', 'street', 'dob', 'trans_date_trans_time', 'lat', 'long', 'merch_lat', 'merch_long', 'city', 'trans_num', 'unix_time'], axis=1, inplace=True)
X.info()

In [None]:
fraud_extended['state'].value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)