# 사기탐지 신용카드 데이터분석

- 신원 도용, 계정 인수, 거래 사기를 포함하되 이에 국한되지 않는 다양한 유형의 사기에 대한 데이터를 수집하는 것이 포함됨

In [70]:
import warnings

# Silence every warning so the red warning boxes do not show up in the
# output of the cells executed below.
warnings.filterwarnings('ignore')

In [2]:
# !pip install koreanize-matplotlib chart-studio pmdarima

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib   # 한국어 출력되게 
import os
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
import plotly.graph_objs as go
import pmdarima as pm
from pmdarima.arima import ndiffs

### 칼럼 설명

- trans_date_trans_time (object): 거래가 발생한 날짜 및 시간
- cc_num (int64): 사용된 신용카드 번호를 나타내는 숫자
- merchant (object): 상점명 또는 거래가 발생한 가맹점의 이름
- category (object): 거래가 발생한 상품 또는 서비스의 범주
- amt (float64): 해당 거래의 거래 금액
- first (object): 카드 소유자의 이름
- last (object): 카드 소유자의 성
- gender (object): 카드 소유자의 성별
- street (object): 카드 소유자의 거주지 주소의 거리
- city (object): 카드 소유자의 거주지 도시
- state (object): 카드 소유자의 거주지 주(State)
- zip (int64): 카드 소유자의 거주지 우편번호
- lat (float64): 카드 소유자의 거주지 위도(latitude)
- long (float64): 카드 소유자의 거주지 경도(longitude)
- city_pop (int64): 카드 소유자가 거주하는 도시의 인구 수
- job (object): 카드 소유자의 직업
- dob (object): 카드 소유자의 출생일(Date of Birth)
- trans_num (object): 각 거래에 할당된 고유 거래 번호
- unix_time (int64): 거래가 발생한 시간을 유닉스 타임스탬프 형식으로 나타낸 값
- merch_lat (float64): 가맹점의 위도(latitude), 즉 거래가 발생한 상점의 위치 정보
- merch_long (float64): 가맹점의 경도(longitude)
- is_fraud (int64): 거래의 사기(fraud) 여부를 나타내는 열. 1은 사기, 0은 정상 거래


In [73]:
# Load the raw transaction table from the local data directory and display it.
data = pd.read_csv(filepath_or_buffer="./data/creaditcard_fraud_data.csv")
data

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.970000,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.078800,-81.178100,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.230000,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.887800,-118.210500,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.110000,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.180800,-112.262000,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.000000,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.230600,-112.113800,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.960000,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.420700,-79.462900,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,2020-12-31 23:59:07,30560609640617,fraud_Reilly and Sons,health_fitness,43.770000,Michael,Olson,M,558 Michael Estates,Luray,...,40.493100,-91.891200,519,Town planner,1966-02-13,9b1f753c79894c9f4b71f04581835ada,1388534347,39.946837,-91.333331,0
1852390,2020-12-31 23:59:09,3556613125071656,fraud_Hoppe-Parisian,kids_pets,111.840000,Jose,Vasquez,M,572 Davis Mountains,Lake Jackson,...,29.039300,-95.440100,28739,Futures trader,1999-12-27,2090647dac2c89a1d86c514c427f5b91,1388534349,29.661049,-96.186633,0
1852391,2020-12-31 23:59:15,6011724471098086,fraud_Rau-Robel,kids_pets,86.880000,Ann,Lawson,F,144 Evans Islands Apt. 683,Burbank,...,46.196600,-118.901700,3684,Musician,1981-11-29,6c5b7c8add471975aa0fec023b2e8408,1388534355,46.658340,-119.715054,0
1852392,2020-12-31 23:59:24,4079773899158,fraud_Breitenberg LLC,travel,7.990000,Eric,Preston,M,7020 Doyle Stream Apt. 951,Mesa,...,44.625500,-116.449300,129,Cartographer,1965-12-15,14392d723bb7737606b2700ac791b7aa,1388534364,44.470525,-117.080888,0


1. 데이터 전처리(결측값, 이상값)
2. EDA
3. Feature Engineering
4. Feature Selection
5. 데이터 분할
6. k-fold 교차검증, gridsearch, randomsearch, 데이터 증폭 pipeline 이용해서 모델 3가지로 비교 분석


In [5]:
# Print the frame summary: index range, column names and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  object 
 1   cc_num                 int64  
 2   merchant               object 
 3   category               object 
 4   amt                    float64
 5   first                  object 
 6   last                   object 
 7   gender                 object 
 8   street                 object 
 9   city                   object 
 10  state                  object 
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  job                    object 
 16  dob                    object 
 17  trans_num              object 
 18  unix_time              int64  
 19  merch_lat              float64
 20  merch_long             float64
 21  is_fraud               int64  
dtypes: float64(5), int

In [6]:
# Same summary, but force the non-null count per column to be shown.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1852394 non-null  object 
 1   cc_num                 1852394 non-null  int64  
 2   merchant               1852394 non-null  object 
 3   category               1852394 non-null  object 
 4   amt                    1852394 non-null  float64
 5   first                  1852394 non-null  object 
 6   last                   1852394 non-null  object 
 7   gender                 1852394 non-null  object 
 8   street                 1852394 non-null  object 
 9   city                   1852394 non-null  object 
 10  state                  1852394 non-null  object 
 11  zip                    1852394 non-null  int64  
 12  lat                    1852394 non-null  float64
 13  long                   1852394 non-null  float64
 14  city_pop          

In [7]:
# List the column labels of the loaded frame.
data.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [8]:
# Render floats with six digits after the decimal point in displayed frames.
pd.set_option('display.float_format', '{:.6f}'.format)

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0
mean,4.173860383937104e+17,70.063567,48813.258191,38.539311,-90.227832,88643.674509,1358674218.834364,38.538976,-90.22794,0.00521
std,1.3091152653187348e+18,159.253975,26881.845966,5.07147,13.747895,301487.618344,18195081.38756,5.105604,13.759692,0.071992
min,60416207185.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376018.0,19.027422,-166.671575,0.0
25%,180042946491150.0,9.64,26237.0,34.6689,-96.798,741.0,1343016823.75,34.740122,-96.89944,0.0
50%,3521417320836166.0,47.45,48174.0,39.3543,-87.4769,2443.0,1357089331.0,39.3689,-87.440694,0.0
75%,4642255475285942.0,83.1,72042.0,41.9404,-80.158,20328.0,1374581485.25,41.956263,-80.245108,0.0
max,4.992346398065154e+18,28948.9,99921.0,66.6933,-67.9503,2906700.0,1388534374.0,67.510267,-66.950902,1.0


### 데이터가 많기 때문에 train 과 test 미리 쪼개기

In [10]:
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(data, stratify=data['is_fraud'], test_size=0.4, random_state=10)

In [None]:
train['is_fraud'].value_counts()

In [None]:
test['is_fraud'].value_counts()

In [None]:
train

In [None]:
train['cc_num'].nunique()

# 

In [None]:
data['name'] = data['first'] + " " + data['last']
data['name'].nunique()
# 중복되는 값 빼고 이름 총 개수

In [None]:
data.head()

In [None]:
data = data.drop(['first', 'last', 'street','city','zip','trans_num','unix_time', 'name'],axis=1)
data.head(3)

### 카드 번호에 따른 평균 지출액, 표준편차 구하기 

In [None]:
train.groupby('cc_num')['amt'].agg(['mean', 'std']).reset_index()

In [None]:
amt_by_cc_num = train.groupby('cc_num')['amt'].agg(['mean', 'std']).reset_index()
amt_by_cc_num 

In [None]:
train = pd.merge(train, amt_by_cc_num, how='left', on='cc_num')
train.head()

# 맨 뒤에 mean, std 칼럼 합쳐짐

### 이상지출 탐지하기 1
- 평소 사용자가 지출하던 금액보다 과도하게 높은 경우
- z 스코어를 이용해 탐지
- z-score = (원래있던값 - 평균) / 표준편차 

$$
\text{z-score} = \frac{\text{amt} - \text{mean}}{\text{std}}
$$


In [None]:
train['amt_z_score'] = (train['amt'] - train['mean']) / train['std']

In [None]:
train[['amt', 'mean', 'std', 'amt_z_score']]

In [None]:
amt_z_over_3 = train[train['amt_z_score'] > 3]

In [None]:
amt_z_over_3['is_fraud'].mean()

In [None]:
train['is_fraud'].mean()

### 이상지출 탐지하기 2
- 사람별로 지출하는 카테고리가 다르기 때문에 평소 지출하는 카테고리가 아닌 경우를 필터링
- 평소 많이 지출하는 카테고리가 아닌 경우 의심

In [None]:
train[train['cc_num'] == 377895991033232 ]['category'].value_counts()

In [None]:
train['category'].nunique()

In [None]:
amtbycategory = train.groupby(['cc_num', 'category'])['amt'].agg(['mean', 'std'])
amtbycategory

In [None]:
train = pd.merge(train, amtbycategory, how='left', on=['cc_num', 'category'])
train

In [None]:
train

In [None]:
train['category_z_score'] = (train['amt'] - train['mean_y']) / train['std_y']
train.head()

In [None]:
train = train.drop(['mean_x','std_x','mean_y','std_y'], axis=1)
train

In [None]:
train.groupby('is_fraud')['merchant'].value_counts()

In [None]:
train.head(2)

In [None]:
train = train.drop(['first', 'last', 'street','city','zip','trans_num','unix_time'],axis=1)
train.head(3)

In [None]:
happen_fraud = train.groupby(['merchant', 'category'])[['is_fraud']].mean().sort_values(by='is_fraud', ascending=False).reset_index()
happen_fraud 

In [None]:
happen_fraud.columns = ['merchant', 'category', 'is_fraud_rate']
happen_fraud

In [None]:
train = pd.merge(train, happen_fraud, how='left', on=['merchant', 'category'])
train

In [None]:
# 사기 거래 당한 수가 많은 상위 10개 직업군

train[train['is_fraud'] == 1]['job'].value_counts().head(10)

In [None]:
train.groupby('job')['is_fraud'].value_counts()

In [None]:
# 각 city_pop에서 is_fraud가 1인 비율을 계산
fraudbyjob = train.groupby(['job', 'is_fraud'])['is_fraud'].count().unstack()

# is_fraud가 1인 비율 계산 (사기 거래의 비율)
fraudbyjob['fraud_ratio'] = fraudbyjob[1] / fraudbyjob.sum(axis=1)


In [None]:
print(fraudbyjob[['fraud_ratio']].sort_values(by='fraud_ratio', ascending=False))

In [None]:
train['trans_date_trans_time'] = pd.to_datetime(train['trans_date_trans_time'])

In [None]:
train['trans_date_trans_time'].dtype

In [None]:
train['trans_date_trans_time'].apply(lambda x: x.year)

In [None]:
train['age'] = train['dob'].apply(lambda x: 2020-int(x[:4]))

In [None]:
train

### 위도 경도 정보로 거리 계산하기
- 카드 사용자의 주거지 위치와 카드 사용처의 거리를 계산해서 너무 멀면 사기일 가능성이 높다.


In [None]:
#!pip install geopy

In [None]:
import geopy.distance

In [None]:
# Store the customer and merchant positions as (latitude, longitude) tuples.
# Assigning a plain list is positional; wrapping the tuples in pd.Series would
# align on a fresh 0..n-1 index and silently produce NaN whenever `train`
# carries any other index.
train['customer_coord'] = list(zip(train['lat'], train['long']))
train['merchant_coord'] = list(zip(train['merch_lat'], train['merch_long']))
train

In [None]:
train['distance'] = train.apply(lambda x: geopy.distance.distance(x['customer_coord'], x['merchant_coord']).km, axis=1)

In [None]:
train

In [None]:
train['distance'].describe()

In [None]:
train.columns

In [None]:
spendbydistance = train.groupby('cc_num')['distance'].agg(['mean','std']).reset_index()
spendbydistance

In [None]:
train = pd.merge(train, spendbydistance, how='left', on='cc_num')
train

In [None]:
train['distance_z_score'] = (train['distance'] - train['mean']) / train['std']
train

In [None]:
train = train.drop(['mean', 'std'], axis=1)
train

In [None]:
train.columns

In [None]:
train = train[['gender', 'age', 'amt', 'amt_z_score', 'category_z_score',
       'is_fraud_rate', 'distance_z_score', 'is_fraud']]
train

In [None]:
train = train.dropna()

In [None]:
X = train.drop('is_fraud', axis=1)
y = train['is_fraud']

In [None]:
X = pd.get_dummies(X, drop_first=True)
X

In [None]:
y

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, stratify=y, random_state=10)

In [None]:
sns.boxplot(X_train)

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
rs = RobustScaler()
X_train_scaled = rs.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_train_scaled

In [None]:
X_valid = rs.transform(X_valid)
X_valid

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
xgb = XGBClassifier(max_depth=5, n_estimators=500, n_jobs=-1, random_state=10)
xgb.fit(X_train_scaled, y_train)
valid_pred = xgb.predict(X_valid)
print(classification_report(y_valid, valid_pred))
print("roc_auc_score: ", roc_auc_score(y_valid, valid_pred))

In [None]:
y.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
sm = SMOTE(random_state=10)
X_smt, y_smt = sm.fit_resample(X_train_scaled, y_train)

In [None]:
y_smt.value_counts()

In [None]:
len(X_valid)

In [None]:
y_valid.value_counts()

In [None]:
xgb = XGBClassifier(max_depth=5, n_estimators=500, n_jobs=-1, random_state=10)
xgb.fit(X_smt, y_smt)
valid_pred = xgb.predict(X_valid)
print(classification_report(y_valid, valid_pred))
print("roc_auc_score: ", roc_auc_score(y_valid, valid_pred))

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
#from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline

In [None]:
pipe = Pipeline([
    ('rs', RobustScaler()),
    ('smt', SMOTE(random_state=10)),
    ('rfc', RandomForestClassifier(n_jobs=-1, random_state=10))
])

In [None]:
# 랜덤서치를 위한 파라미터 정의

params_rv = {
    'rfc__n_estimators' : [100, 200, 300],
    'rfc__max_depth' : [3, 5, 7],
}

In [None]:
# 교차 검증을 위한 stratify-k-fold

scv = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)

In [None]:
# Randomized hyper-parameter search over the pipeline, scored by ROC AUC.
random_cv = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params_rv,
    n_iter=10,
    scoring='roc_auc',
    cv=scv,
    n_jobs=-1,
    random_state=10,
)
random_cv.fit(X_train, y_train)

print("Best params:", random_cv.best_params_)
print("Best score:", random_cv.best_score_)
print("Best model test score:", random_cv.score(X_valid, y_valid))

## 여러개의 모델을 파이프라인으로 분석

In [None]:
# Candidate classifiers, keyed by the name later used as pipeline step name.
models = dict(
    XGB=XGBClassifier(n_jobs=-1, random_state=10),
    RFC=RandomForestClassifier(n_jobs=-1, random_state=10),
    LGBM=LGBMClassifier(n_jobs=-1, random_state=10),
)
models

In [None]:
# Search space per model. Every key is prefixed with the model's name because
# that name is also used as the pipeline step name.
params = {
    'XGB': {
        'XGB__max_depth': [3, 5, 7],
        'XGB__n_estimators': [100, 300, 500],
        'XGB__subsample': [0.6, 0.8, 1.0],
    },
    'RFC': {
        'RFC__max_depth': [3, 5, 7],
        'RFC__n_estimators': [100, 300, 500],
    },
    'LGBM': {
        'LGBM__max_depth': [3, 5, 7],
        'LGBM__n_estimators': [100, 300, 500],
        'LGBM__subsample': [0.6, 0.8, 1.0],
        # LightGBM only applies row subsampling when subsample_freq is
        # non-zero (its default is 0), so without this entry the subsample
        # values above would have no effect.
        'LGBM__subsample_freq': [1],
    },
}

In [None]:
# 각 모델별로 최적의 파라미터 찾고 성능 평가
best_estimators = {}
roc_auc_scores = {}
classification_reports = {}

In [None]:
skf = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
pipe = Pipeline([
    ('rs', RobustScaler()),
    ('smt', SMOTE(random_state=10)),
    ('lr', LogisticRegression(n_jobs=3, random_state=10))
])

In [None]:
# This search takes a very long time; keep the cell commented out unless the
# comparison really has to be re-run.

# %%time
# For every candidate model: wrap it in a scaling -> SMOTE -> model pipeline,
# tune it with a randomized search (ROC AUC, stratified 5-fold), then evaluate
# the best pipeline on the validation part and keep the results.
# NOTE: the pipeline applies RobustScaler itself, so X_train and X_valid must
# hold raw (unscaled) features when this cell runs.
for model_name, model in models.items():
    print(model_name, model)
    pipe = Pipeline([('rs', RobustScaler()),
                     ('smt', SMOTE(random_state=10)),
                     (model_name, model)])

    # random_state makes the sampled parameter combinations reproducible.
    rand_cv = RandomizedSearchCV(pipe, param_distributions=params[model_name],
                                 cv=skf, scoring='roc_auc', n_jobs=3,
                                 random_state=10)

    rand_cv.fit(X_train, y_train)

    print('best_params: ', rand_cv.best_params_)
    best_model = rand_cv.best_estimator_
    # Evaluate with this model's own predictions (not a `valid_pred` left over
    # from an earlier cell); ROC AUC uses the fraud-class probability.
    best_pred = best_model.predict(X_valid)
    best_proba = best_model.predict_proba(X_valid)[:, 1]

    best_estimators[model_name] = best_model
    classification_reports[model_name] = classification_report(y_valid, best_pred)
    roc_auc_scores[model_name] = roc_auc_score(y_valid, best_proba)

    print(classification_reports[model_name])
    print('roc_auc_score: ', roc_auc_scores[model_name])


In [None]:
# 밑에서부턴 돌리지 말기

# __________________________________________

In [None]:
data['is_fraud'].value_counts()

# 0은 정상 거래
# 1은 사기

In [None]:
data['job'].value_counts()

In [None]:
job_counts = data['job'].value_counts()

for job, count in job_counts.items():
    print(f'{job}: {count}')

In [None]:
data.isnull().sum()

In [None]:
data.head(2)

In [None]:
data['trans_date_trans_time']

In [None]:
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

In [None]:
data.info()

In [None]:
data.head(7)

In [None]:
data = data.drop(['first', 'last', 'street','city','zip','trans_num','unix_time'],axis=1)
data.head(3)

In [None]:
data.info()

In [None]:
#df = df.reset_index(drop=True)

In [None]:
# 이름 + 성 합치기 

data['full_name'] = data['first'] + " " + data['last']
data.head()

In [None]:
data['full_name']

In [None]:
data = data.drop(['first', 'last'],axis=1)
data.head(3)

In [None]:
data

In [None]:
data.columns

In [None]:
# 성별에 따른 사기 (=1) 여부 

data.groupby('gender')['is_fraud'].sum()

# 여자가 사기 비율이 조금 더 높다

In [None]:
# 1은 사기, 0은 정상 거래

pd.crosstab(data['gender'], data['is_fraud'])

In [None]:
data.groupby('gender')['is_fraud'].count()

In [None]:
pd.crosstab(data['state'], data['is_fraud'])

In [None]:
# 주별 사기 거래 수 (is_fraud == 1)
fraud_transactions_by_state = data[data['is_fraud'] == 1].groupby('state')['is_fraud'].count()

# NaN을 0으로 변경 (사기 거래가 없는 주는 비율이 NaN이 되므로 0으로 처리)
fraud_transactions_by_state = fraud_transactions_by_state.fillna(0)

In [None]:
fraud_transactions_by_state.plot(kind='bar', figsize=(10, 6))
plt.title('Number of Fraud Transactions by State')
plt.xlabel('State')
plt.ylabel('Number of Fraud Transactions')
plt.show()

미국에서 NY(뉴욕)의 사기 거래 건수가 제일 많은 것을 확인할 수 있다.

In [None]:
# Ten states with the most fraud transactions, largest first.
fraud_rows = data.loc[data['is_fraud'] == 1]
fraud_transactions_by_state = fraud_rows.groupby('state').is_fraud.count()
top_10_fraud_states = fraud_transactions_by_state.sort_values(ascending=False).iloc[:10]
print(top_10_fraud_states)


* NY: New York (뉴욕)
* TX: Texas (텍사스)
* PA: Pennsylvania (펜실베이니아)
* CA: California (캘리포니아)
* OH: Ohio (오하이오)

In [None]:
# 거래 금액(amt)과 사기 여부의 관계
data['amt']

In [None]:
data.groupby('is_fraud')['amt'].describe()

In [None]:
data.groupby('is_fraud')['amt'].mean()

# 평균 사기 금액 : 530 불 

In [None]:
# Ten jobs with the largest number of fraud transactions in the full data.
data.loc[data['is_fraud'].eq(1), 'job'].value_counts().head(10)

- Quantity surveyor: 건설 원가 계산사
- Naval architect: 선박 건조 기술자
- Materials engineer: 재료 공학자
- Audiological scientist: 청각 과학자
- Senior tax professional/tax inspector: 고위 세무 전문가/세무 조사관
- Trading standards officer: 거래 기준 감독관
- Podiatrist: 족부 전문의
- Film/video editor: 영화/비디오 편집자
- Colour technologist: 색채 기술자
- Exhibition designer: 전시 디자이너