In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Dataset reference: https://www.kaggle.com/datasets/kartik2112/fraud-detection
# The CSV at `file_url` is read in full into the working frame `df`.
file_url = 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/fraud.csv'
df = pd.read_csv(filepath_or_buffer=file_url)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [None]:
# List the column labels of the loaded frame.
df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

* trans_date_trans_time : 거래 시간
* cc_num : 카드 번호. 고윳값이기 때문에 여기서는 id처럼 활용할 수 있음
* merchant : 거래 상점
* category : 거래 상점의 범주(애완용품, 여행, 엔터테인먼트 등)
* amt: 거래금액 (amount)
* first / last : 이름
* gender : 성별
* street / city / state / zip : 고객 거주지 정보
* lat / long : 고객주소에 대한 위도 및 경도
* city_pop : 고객의 zipcode에 속하는 인구 수
* job : 직업
* dob : 생년월일
* trans_num : 거래번호
* unix_time : 거래시간 (유닉스 타임스탬프)
* merch_lat / merch_long : 상점의 위경도
---
* is_fraud : 사기거래 여부 (이상거래 여부) -> 종속변수

In [None]:
# Summarize the frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  object 
 1   cc_num                 int64  
 2   merchant               object 
 3   category               object 
 4   amt                    float64
 5   first                  object 
 6   last                   object 
 7   gender                 object 
 8   street                 object 
 9   city                   object 
 10  state                  object 
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  job                    object 
 16  dob                    object 
 17  trans_num              object 
 18  unix_time              int64  
 19  merch_lat              float64
 20  merch_long             float64
 21  is_fraud               int64  
dtypes: float64(5), int

In [None]:
# Same summary, with the per-column non-null counts explicitly requested.
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1852394 non-null  object 
 1   cc_num                 1852394 non-null  int64  
 2   merchant               1852394 non-null  object 
 3   category               1852394 non-null  object 
 4   amt                    1852394 non-null  float64
 5   first                  1852394 non-null  object 
 6   last                   1852394 non-null  object 
 7   gender                 1852394 non-null  object 
 8   street                 1852394 non-null  object 
 9   city                   1852394 non-null  object 
 10  state                  1852394 non-null  object 
 11  zip                    1852394 non-null  int64  
 12  lat                    1852394 non-null  float64
 13  long                   1852394 non-null  float64
 14  city_pop          

In [None]:
# Render floats with two decimals from here on, then show summary statistics
# for the numeric columns.
pd.set_option('display.float_format', '{:.2f}'.format)
df.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0
mean,4.173860383937104e+17,70.06,48813.26,38.54,-90.23,88643.67,1358674218.83,38.54,-90.23,0.01
std,1.3091152653187348e+18,159.25,26881.85,5.07,13.75,301487.62,18195081.39,5.11,13.76,0.07
min,60416207185.0,1.0,1257.0,20.03,-165.67,23.0,1325376018.0,19.03,-166.67,0.0
25%,180042946491150.0,9.64,26237.0,34.67,-96.8,741.0,1343016823.75,34.74,-96.9,0.0
50%,3521417320836166.0,47.45,48174.0,39.35,-87.48,2443.0,1357089331.0,39.37,-87.44,0.0
75%,4642255475285942.0,83.1,72042.0,41.94,-80.16,20328.0,1374581485.25,41.96,-80.25,0.0
max,4.992346398065154e+18,28948.9,99921.0,66.69,-67.95,2906700.0,1388534374.0,67.51,-66.95,1.0


## 전처리

In [None]:
# Remove the columns that are not used in the rest of the notebook.
df.drop(
    columns=[
        'first', 'last',                    # customer name
        'street', 'city', 'state', 'zip',   # address
        'trans_num', 'unix_time',           # transaction number / Unix time (duplicate)
        'job', 'merchant',                  # occupation, shop
    ],
    inplace=True,
)

In [None]:
# The transaction timestamp is stored as an object (string) column;
# parse it into a real datetime column and re-check the dtypes.
timestamp_col = 'trans_date_trans_time'
df[timestamp_col] = pd.to_datetime(df[timestamp_col])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 12 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   category               object        
 3   amt                    float64       
 4   gender                 object        
 5   lat                    float64       
 6   long                   float64       
 7   city_pop               int64         
 8   dob                    object        
 9   merch_lat              float64       
 10  merch_long             float64       
 11  is_fraud               int64         
dtypes: datetime64[ns](1), float64(5), int64(3), object(3)
memory usage: 169.6+ MB


### 피처 엔지니어링
* 원래 고객의 거래패턴에서 벗어나는 거래

In [None]:
#@title Transaction amount
# Per-card statistics of the transaction amount, later used for a z-score:
#   z = (x - mean) / std
# `amt` is selected BEFORE aggregating so only that column is reduced.
# Aggregating the whole frame and picking `amt` afterwards also applies
# mean/std to the string (object) columns still present in `df`; that is
# wasted work and raises TypeError on pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
# Result: one row per card number with the columns cc_num, mean, std.
# NOTE(review): the statistics are computed over all rows, including the rows
# that later end up in the test split (index >= '2020-07-01').
amt_info = df.groupby('cc_num')['amt'].agg(['mean', 'std']).reset_index()
amt_info.head()

Unnamed: 0,cc_num,mean,std
0,60416207185,59.26,142.87
1,60422928733,65.48,92.04
2,60423098130,96.38,1000.69
3,60427851591,107.49,131.01
4,60487002085,64.1,153.21


In [None]:
# Pattern: A.merge(B, on=KEY, how=WAY) joins frame B onto frame A.
# Here the per-card statistics are joined onto every transaction using the
# card number as the key; a LEFT join keeps every row of `df`.
df = df.merge(right=amt_info, how='left', on='cc_num')

In [None]:
# Check the column labels after the merge (`mean` and `std` are now present).
df.columns

Index(['trans_date_trans_time', 'cc_num', 'category', 'amt', 'gender', 'lat',
       'long', 'city_pop', 'dob', 'merch_lat', 'merch_long', 'is_fraud',
       'mean', 'std'],
      dtype='object')

In [None]:
# Show each transaction amount next to the merged per-card mean and std.
df[['cc_num', 'amt', 'mean', 'std']].head()

Unnamed: 0,cc_num,amt,mean,std
0,2703186189652095,4.97,89.41,127.53
1,630423337322,107.23,56.08,159.2
2,38859492057661,220.11,69.92,116.69
3,3534093764340240,45.0,80.09,280.08
4,375534208663984,41.96,95.34,94.32


In [None]:
# z-score of the amount: (x - mean) / standard deviation,
# using the `mean` / `std` columns merged in from amt_info.
df['amt_z_score'] = df['amt'].sub(df['mean']).div(df['std'])

In [None]:
# Inspect the first values of the amount z-score.
df['amt_z_score'].head()

0   -0.66
1    0.32
2    1.29
3   -0.13
4   -0.57
Name: amt_z_score, dtype: float64

In [None]:
# The helper statistics are no longer needed once the z-score exists; removing
# them also frees the names `mean` / `std` for the next merge.
df.drop(columns=['mean', 'std'], inplace=True)

In [None]:
#@title Transaction amount per category
# Mean and standard deviation of the amount for every
# (card number, merchant category) pair.
# `amt` is selected BEFORE aggregating so only that column is reduced.
# Aggregating the whole frame first also applies mean/std to the remaining
# string (object) columns, which is wasted work and raises TypeError on
# pandas >= 2.0.
# Result columns: cc_num, category, mean, std.
category_info = df.groupby(['cc_num', 'category'])['amt'].agg(['mean', 'std']).reset_index()

In [None]:
# Preview the per-card, per-category amount statistics.
category_info.head()

Unnamed: 0,cc_num,category,mean,std
0,60416207185,entertainment,51.84,65.49
1,60416207185,food_dining,26.74,46.38
2,60416207185,gas_transport,59.78,15.76
3,60416207185,grocery_net,52.15,17.69
4,60416207185,grocery_pos,101.56,21.89


In [None]:
# Plan: merge on (cc_num, category) -> compute the z-score from mean/std -> drop mean/std.
# This cell performs the merge and shows the joined statistics.
df = df.merge(right=category_info, how='left', on=['cc_num', 'category'])
df.loc[:, ['cc_num', 'category', 'amt', 'mean', 'std']].head()

Unnamed: 0,cc_num,category,amt,mean,std
0,2703186189652095,misc_net,4.97,84.86,116.07
1,630423337322,grocery_pos,107.23,99.64,23.9
2,38859492057661,entertainment,220.11,46.65,60.39
3,3534093764340240,gas_transport,45.0,61.54,15.75
4,375534208663984,misc_pos,41.96,35.48,4.93


In [None]:
# z-score of the amount relative to the same card's history in the same category.
# NOTE(review): a (card, category) group with a single row has an undefined
# sample std, which would make this value NaN - confirm whether such groups exist.
df['cat_z_score'] = df['amt'].sub(df['mean']).div(df['std'])

In [None]:
# Inspect the first values of the category-level z-score.
df['cat_z_score'].head()

0   -0.69
1    0.32
2    2.87
3   -1.05
4    1.31
Name: cat_z_score, dtype: float64

In [None]:
# Remove the temporary per-category statistics now that cat_z_score is stored.
df.drop(columns=['mean', 'std'], inplace=True)

In [None]:
import geopy.distance

In [None]:
# Distance between two points given as (latitude, longitude) pairs:
# geopy.distance.distance((lat1, lng1), (lat2, lng2))

In [None]:
# Coordinates are stored as (latitude, longitude) tuples:
#   1. merch_coord - location of the merchant
#   2. cust_coord  - location of the customer
df['merch_coord'] = pd.Series(list(zip(df['merch_lat'], df['merch_long'])))
df['cust_coord'] = pd.Series(list(zip(df['lat'], df['long'])))

In [None]:
# Distance in kilometres between the merchant and the customer of each row.
# The two coordinate columns are iterated directly instead of using
# DataFrame.apply(axis=1): row-wise apply builds a full Series object for
# every row, which is very slow on a frame of this size, while the computed
# values are the same. `.km` extracts a plain float so the column is float64.
df['distance'] = [
    geopy.distance.distance(merch, cust).km
    for merch, cust in zip(df['merch_coord'], df['cust_coord'])
]

In [None]:
# Keep an independent snapshot of the frame after the distance computation.
df2 = df.copy(deep=True)

In [None]:
# Show the computed distance column (read from the df2 snapshot).
df2.distance

0          78.77
1          30.22
2         108.10
3          95.69
4          77.70
           ...  
1852389    77.03
1852390   100.02
1852391    80.89
1852392    53.06
1852393    72.38
Name: distance, Length: 1852394, dtype: float64

In [None]:
# If `.km` is left out, the values include the unit and the column becomes object dtype;
# in that case convert object -> float like this:
# df['distance'] = df['distance'].str.split(expand=True)[0].astype('float64')

In [None]:
# Per-card mean and standard deviation of the customer-merchant distance.
# `distance` is selected BEFORE aggregating so only that column is reduced.
# Aggregating the whole frame first also applies mean/std to the string and
# tuple-valued (object) columns, which is wasted work and raises TypeError on
# pandas >= 2.0.
# Result columns: cc_num, mean, std.
distance_info = df.groupby('cc_num')['distance'].agg(['mean', 'std']).reset_index()

In [None]:
# Display the per-card distance statistics.
distance_info

Unnamed: 0,cc_num,mean,std
0,60416207185,73.53,28.70
1,60422928733,78.99,29.30
2,60423098130,77.83,28.19
3,60427851591,75.71,28.98
4,60487002085,79.44,28.77
...,...,...,...
994,4958589671582726883,76.74,29.12
995,4973530368125489546,78.75,29.45
996,4980323467523543940,72.27,28.28
997,4989847570577635369,76.97,29.39


In [None]:
# Join the per-card distance statistics onto every transaction, then express
# each distance as a z-score relative to that card's own mean and std.
df = df.merge(right=distance_info, how='left', on='cc_num')
df['distance_z_score'] = df['distance'].sub(df['mean']).div(df['std'])

In [None]:
# Inspect the distance z-score column.
df['distance_z_score']

0          0.03
1         -1.48
2          1.16
3          0.82
4          0.06
           ... 
1852389    0.05
1852390    0.69
1852391    0.29
1852392   -0.73
1852393   -0.16
Name: distance_z_score, Length: 1852394, dtype: float64

In [None]:
# Remove the temporary per-card distance statistics.
df.drop(columns=['mean', 'std'], inplace=True)

In [None]:
# Preview the frame with the engineered z-score and distance columns.
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud,amt_z_score,cat_z_score,merch_coord,cust_coord,distance,distance_z_score
0,2019-01-01 00:00:18,2703186189652095,misc_net,4.97,F,36.08,-81.18,3495,1988-03-09,36.01,-82.05,0,-0.66,-0.69,"(36.011293, -82.048315)","(36.0788, -81.1781)",78.77,0.03
1,2019-01-01 00:00:44,630423337322,grocery_pos,107.23,F,48.89,-118.21,149,1978-06-21,49.16,-118.19,0,0.32,0.32,"(49.159047, -118.186462)","(48.8878, -118.2105)",30.22,-1.48
2,2019-01-01 00:00:51,38859492057661,entertainment,220.11,M,42.18,-112.26,4154,1962-01-19,43.15,-112.15,0,1.29,2.87,"(43.150704, -112.154481)","(42.1808, -112.262)",108.1,1.16
3,2019-01-01 00:01:16,3534093764340240,gas_transport,45.0,M,46.23,-112.11,1939,1967-01-12,47.03,-112.56,0,-0.13,-1.05,"(47.034331, -112.561071)","(46.2306, -112.1138)",95.69,0.82
4,2019-01-01 00:03:06,375534208663984,misc_pos,41.96,M,38.42,-79.46,99,1986-03-28,38.67,-78.63,0,-0.57,1.31,"(38.674999, -78.632459)","(38.4207, -79.4629)",77.7,0.06


In [None]:
# Fraction of missing values in each column.
df.isna().mean()

trans_date_trans_time   0.00
cc_num                  0.00
category                0.00
amt                     0.00
gender                  0.00
lat                     0.00
long                    0.00
city_pop                0.00
dob                     0.00
merch_lat               0.00
merch_long              0.00
is_fraud                0.00
amt_z_score             0.00
cat_z_score             0.00
merch_coord             0.00
cust_coord              0.00
distance                0.00
distance_z_score        0.00
dtype: float64

In [None]:
# Age as the reference year 2021 minus the birth year. Month and day are
# ignored, so this is a year difference rather than an exact age.
birth_year = pd.to_datetime(df['dob']).dt.year
df['age'] = 2021 - birth_year

In [None]:
# Inspect the derived age column.
df.age

0          33
1          43
2          59
3          54
4          35
           ..
1852389    55
1852390    22
1852391    40
1852392    56
1852393    28
Name: age, Length: 1852394, dtype: int64

In [None]:
# Drop the card number, the raw coordinates, the birth date and the helper
# coordinate tuples; the features derived from them stay in the frame.
df.drop(
    columns=[
        'cc_num',
        'lat', 'long',
        'merch_lat', 'merch_long',
        'dob',
        'merch_coord', 'cust_coord',
    ],
    inplace=True,
)

In [None]:
# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each so the indicator columns are not redundant.
df = pd.get_dummies(data=df, columns=['category', 'gender'], drop_first=True)

In [None]:
# Display the frame after one-hot encoding.
df

Unnamed: 0,trans_date_trans_time,amt,city_pop,is_fraud,amt_z_score,cat_z_score,distance,distance_z_score,age,category_food_dining,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
0,2019-01-01 00:00:18,4.97,3495,0,-0.66,-0.69,78.77,0.03,33,0,...,0,0,0,1,0,0,0,0,0,0
1,2019-01-01 00:00:44,107.23,149,0,0.32,0.32,30.22,-1.48,43,0,...,0,0,0,0,0,0,0,0,0,0
2,2019-01-01 00:00:51,220.11,4154,0,1.29,2.87,108.10,1.16,59,0,...,0,0,0,0,0,0,0,0,0,1
3,2019-01-01 00:01:16,45.00,1939,0,-0.13,-1.05,95.69,0.82,54,0,...,0,0,0,0,0,0,0,0,0,1
4,2019-01-01 00:03:06,41.96,99,0,-0.57,1.31,77.70,0.06,35,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,2020-12-31 23:59:07,43.77,519,0,-0.17,-0.05,77.03,0.05,55,0,...,1,0,0,0,0,0,0,0,0,1
1852390,2020-12-31 23:59:09,111.84,28739,0,0.36,1.16,100.02,0.69,22,0,...,0,0,1,0,0,0,0,0,0,1
1852391,2020-12-31 23:59:15,86.88,3684,0,-0.02,0.44,80.89,0.29,40,0,...,0,0,1,0,0,0,0,0,0,0
1852392,2020-12-31 23:59:24,7.99,129,0,-0.59,-0.01,53.06,-0.73,56,0,...,0,0,0,0,0,0,0,0,1,1


In [None]:
# Use the transaction timestamp as the index so rows can be selected by date.
df.set_index(keys='trans_date_trans_time', inplace=True)

In [None]:
# Inspect the time range covered by the index.
df.index # 2020-07-01 is the cutoff date used for the train/test split

DatetimeIndex(['2019-01-01 00:00:18', '2019-01-01 00:00:44',
               '2019-01-01 00:00:51', '2019-01-01 00:01:16',
               '2019-01-01 00:03:06', '2019-01-01 00:04:08',
               '2019-01-01 00:04:42', '2019-01-01 00:05:08',
               '2019-01-01 00:05:18', '2019-01-01 00:06:01',
               ...
               '2020-12-31 23:57:18', '2020-12-31 23:57:50',
               '2020-12-31 23:57:56', '2020-12-31 23:58:04',
               '2020-12-31 23:58:34', '2020-12-31 23:59:07',
               '2020-12-31 23:59:09', '2020-12-31 23:59:15',
               '2020-12-31 23:59:24', '2020-12-31 23:59:34'],
              dtype='datetime64[ns]', name='trans_date_trans_time', length=1852394, freq=None)

In [None]:
# Chronological split: rows before the cutoff date form the training set,
# rows on or after it form the test set.
split_date = '2020-07-01'
train = df.loc[df.index < split_date]
test = df.loc[df.index >= split_date]

In [None]:
# Separate the target column `is_fraud` from the feature columns in both splits.
X_train, y_train = train.drop(columns='is_fraud'), train['is_fraud']
X_test, y_test = test.drop(columns='is_fraud'), test['is_fraud']

In [None]:
import lightgbm as lgb

In [None]:
# Train a LightGBM classifier (fixed seed, otherwise default settings) on the
# training split, then predict class labels for the test split.
# fit() returns the estimator itself, so construction and training are chained.
model = lgb.LGBMClassifier(random_state=100).fit(X_train, y_train)
pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# The target is heavily imbalanced (df.describe() reports a mean of about 0.01
# for is_fraud), so a high accuracy can be reached without detecting any fraud.
# Print per-class precision / recall / F1 first so the quality on the fraud
# class is visible.
print(classification_report(y_test, pred, digits=4))

# Overall accuracy remains the value displayed by the cell.
accuracy_score(y_test, pred)

0.9964749144410561