<a href="https://colab.research.google.com/github/kiyong21c/kaggle/blob/main/20220819_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LightGBM
 - XGBoost 이후 나온 최신 부스팅 모델(리프 중심 트리 분할 방식)
 - 장점 : XGBoost보다 빠르고, 높은 정확도
 - 단점 : 해석이 어려움, 하이퍼파라미터 튜닝 어려움

# 카드 거래 내역 데이터셋을 이용한 이상거래 예측
- 알고리즘 : LightGBM
- 문제유형 : 분류
- 사용모델 : LGBMClassifier, train
- 평가지표 : 정확도, 혼동행렬, 분류 리포트, ROC_AUC 점수

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Download the card-transaction fraud dataset (CSV) straight into a DataFrame.
# HTTPS is used so the file is not fetched over an unencrypted connection.
file_url = 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/fraud.csv'
data = pd.read_csv(file_url)

In [2]:
# Preview the first five rows of the raw transactions.
data.head(n=5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


 - 변수가 22개이므로 기본 출력 화면에 다 담지 못해 중간이 ...으로 표시됨(기본 20개 컬럼만 출력)

In [3]:
# Show up to 40 columns so wide frames are not elided with "...".
pd.set_option('display.max_columns', 40)

In [4]:
# Summarise the column names and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  object 
 1   cc_num                 int64  
 2   merchant               object 
 3   category               object 
 4   amt                    float64
 5   first                  object 
 6   last                   object 
 7   gender                 object 
 8   street                 object 
 9   city                   object 
 10  state                  object 
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  job                    object 
 16  dob                    object 
 17  trans_num              object 
 18  unix_time              int64  
 19  merch_lat              float64
 20  merch_long             float64
 21  is_fraud               int64  
dtypes: float64(5), int

 - 평소와 달리 Non-null count가 나오지 않음 : 데이터가 너무 큰 경우 생략됨 → show_counts 매개변수로 강제로 보이게 함

In [5]:
# Force the Non-Null Count column to be printed for every column.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1852394 non-null  object 
 1   cc_num                 1852394 non-null  int64  
 2   merchant               1852394 non-null  object 
 3   category               1852394 non-null  object 
 4   amt                    1852394 non-null  float64
 5   first                  1852394 non-null  object 
 6   last                   1852394 non-null  object 
 7   gender                 1852394 non-null  object 
 8   street                 1852394 non-null  object 
 9   city                   1852394 non-null  object 
 10  state                  1852394 non-null  object 
 11  zip                    1852394 non-null  int64  
 12  lat                    1852394 non-null  float64
 13  long                   1852394 non-null  float64
 14  city_pop          

 - 결측치 없음
 - trans_date_trans_time은 날짜/시간 형태의 데이터이지만 자료형이 object 형식
 - 날짜/시간 관련 함수를 적용하기 위해서는 datetime 형식으로 변환 필요

In [6]:
# Summary statistics of the numeric columns, rounded to two decimals.
data.describe().round(2)

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0
mean,4.17386e+17,70.06,48813.26,38.54,-90.23,88643.67,1358674000.0,38.54,-90.23,0.01
std,1.309115e+18,159.25,26881.85,5.07,13.75,301487.62,18195080.0,5.11,13.76,0.07
min,60416210000.0,1.0,1257.0,20.03,-165.67,23.0,1325376000.0,19.03,-166.67,0.0
25%,180042900000000.0,9.64,26237.0,34.67,-96.8,741.0,1343017000.0,34.74,-96.9,0.0
50%,3521417000000000.0,47.45,48174.0,39.35,-87.48,2443.0,1357089000.0,39.37,-87.44,0.0
75%,4642255000000000.0,83.1,72042.0,41.94,-80.16,20328.0,1374581000.0,41.96,-80.25,0.0
max,4.992346e+18,28948.9,99921.0,66.69,-67.95,2906700.0,1388534000.0,67.51,-66.95,1.0


 - amt, city_pop 변수의 max값이 75% 값에 비해 급격히 높음 → 아웃라이어 의심
 - is_fraud 사기여부 변수의 평균값이 0.01로 이상거래인 경우가 1%인 데이터 → 정확도가 99%인 모델을 만들더라도 좋은 모델이라는 보장이 없다
 - 한쪽으로 치우친 데이터 : 비대칭 데이터

## 전처리 : 데이터 클리닝
 - 불필요한 변수 제외

In [7]:
# Remove the columns this notebook treats as unnecessary.
unused_columns = [
    'first', 'last', 'street', 'city', 'state', 'zip',
    'trans_num', 'unix_time', 'job', 'merchant',
]
data.drop(columns=unused_columns, inplace=True)

 - trans_date_trans_time 변수를 datetime 형으로 수정

In [8]:
# Convert the transaction timestamp strings to datetime64[ns].
timestamp_col = 'trans_date_trans_time'
data[timestamp_col] = data[timestamp_col].astype('datetime64[ns]')
# Alternative: pd.to_datetime(data['trans_date_trans_time'])

In [9]:
# Re-check the dtypes after dropping columns and converting the timestamp.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 12 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   category               object        
 3   amt                    float64       
 4   gender                 object        
 5   lat                    float64       
 6   long                   float64       
 7   city_pop               int64         
 8   dob                    object        
 9   merch_lat              float64       
 10  merch_long             float64       
 11  is_fraud               int64         
dtypes: datetime64[ns](1), float64(5), int64(3), object(3)
memory usage: 169.6+ MB


## 전처리 : 피처 엔지니어링
 - 기존 거래 패턴에서 벗어나는 경우를 감지

  > 갑자기 다른 지역에서 고액의 물건을 구매

  > 한 번도 이용한적 없는 종류의 매장에서 고액의 물건을 구매

### 결제 금액
 - 평소와 다른 고액 결제
 - Z점수 사용(표준값, 표준점수)

  > 특정값이 정규분포 범위에서 어느 수준에 위치하는가

  > 평균과 표준편차 활용

In [10]:
# cc_num is the card number and is used as a per-customer id.
# amt is the transaction amount.
# 'amt' is selected before aggregating so mean/std are computed for that
# column only. Aggregating every column first would also attempt the
# statistics on non-numeric columns (category, gender, dob), which recent
# pandas versions reject with a TypeError.
data.groupby('cc_num')['amt'].agg(['mean', 'std'])

Unnamed: 0_level_0,mean,std
cc_num,Unnamed: 1_level_1,Unnamed: 2_level_1
60416207185,59.257796,142.869746
60422928733,65.483159,92.042844
60423098130,96.376084,1000.693872
60427851591,107.487550,131.014534
60487002085,64.096925,153.207660
...,...,...
4958589671582726883,67.205600,137.504101
4973530368125489546,75.789148,258.847061
4980323467523543940,70.709484,119.903167
4989847570577635369,93.008939,128.396760


In [11]:
# Per-card mean and standard deviation of the transaction amount, with cc_num
# restored as a regular column so it can serve as a merge key.
# 'amt' is selected before agg() so the statistics are not attempted on the
# non-numeric columns of the frame (a TypeError on recent pandas versions).
amt_info = data.groupby('cc_num')['amt'].agg(['mean', 'std']).reset_index()

In [12]:
# First five rows of the per-card amount statistics.
amt_info.head(n=5)

Unnamed: 0,cc_num,mean,std
0,60416207185,59.257796,142.869746
1,60422928733,65.483159,92.042844
2,60423098130,96.376084,1000.693872
3,60427851591,107.48755,131.014534
4,60487002085,64.096925,153.20766


 - mean, std 변수를 cc_num을 키값으로 기존 data에 merge

In [13]:
# Preview the join of the per-card statistics onto every transaction.
# cc_num is the only column the two frames share, so it is the join key.
pd.merge(data, amt_info, on='cc_num')

Unnamed: 0,trans_date_trans_time,cc_num,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud,mean,std
0,2019-01-01 00:00:18,2703186189652095,misc_net,4.97,F,36.0788,-81.1781,3495,1988-03-09,36.011293,-82.048315,0,89.408743,127.530101
1,2019-01-01 16:53:34,2703186189652095,kids_pets,114.79,F,36.0788,-81.1781,3495,1988-03-09,35.083586,-80.625951,0,89.408743,127.530101
2,2019-01-01 23:02:37,2703186189652095,kids_pets,16.55,F,36.0788,-81.1781,3495,1988-03-09,35.532409,-82.170017,0,89.408743,127.530101
3,2019-01-03 01:40:38,2703186189652095,misc_net,27.70,F,36.0788,-81.1781,3495,1988-03-09,35.315851,-80.666723,0,89.408743,127.530101
4,2019-01-03 15:24:11,2703186189652095,entertainment,43.72,F,36.0788,-81.1781,3495,1988-03-09,36.956094,-80.343520,0,89.408743,127.530101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,2020-12-22 22:05:48,2242176657877538,shopping_pos,1041.51,M,34.6323,-89.8855,14462,1959-03-03,34.573471,-89.911011,1,686.806000,419.152774
1852390,2020-12-22 22:18:07,2242176657877538,shopping_pos,868.09,M,34.6323,-89.8855,14462,1959-03-03,34.091227,-90.390612,1,686.806000,419.152774
1852391,2020-12-22 22:31:48,2242176657877538,shopping_net,1039.42,M,34.6323,-89.8855,14462,1959-03-03,34.628434,-90.284780,1,686.806000,419.152774
1852392,2020-12-22 23:06:03,2242176657877538,grocery_pos,289.27,M,34.6323,-89.8855,14462,1959-03-03,34.746063,-90.401093,1,686.806000,419.152774


In [14]:
# Attach each card's mean/std amount to its transactions (inner join on cc_num).
data = pd.merge(data, amt_info, how='inner', on='cc_num')

In [15]:
# Z-score of each amount relative to the card's own mean and std.
data['amt'].sub(data['mean']).div(data['std'])

0         -0.662108
1          0.199022
2         -0.571306
3         -0.483876
4         -0.358259
             ...   
1852389    0.846240
1852390    0.432501
1852391    0.841254
1852392   -0.948427
1852393    0.189845
Length: 1852394, dtype: float64

In [16]:
# Store the per-card z-score of the amount as its own feature column.
deviation = data['amt'] - data['mean']
data['amt_z_score'] = deviation / data['std']

In [17]:
# Check the new feature next to the values it was derived from.
data.loc[:, ['amt', 'mean', 'std', 'amt_z_score']].head()

Unnamed: 0,amt,mean,std,amt_z_score
0,4.97,89.408743,127.530101,-0.662108
1,114.79,89.408743,127.530101,0.199022
2,16.55,89.408743,127.530101,-0.571306
3,27.7,89.408743,127.530101,-0.483876
4,43.72,89.408743,127.530101,-0.358259


In [18]:
# amt_z_score has been created, so the helper mean/std columns can go.
data.drop(columns=['mean', 'std'], inplace=True)

### 범주
 - 각 개인마다(cc_num) 어떤 범주에(category) 얼마만큼의 금액을 쓰는지(amt)

In [19]:
# Mean/std of the amount per (card, category) pair.
# 'amt' is selected before agg() so the statistics are not attempted on the
# non-numeric columns of the frame (a TypeError on recent pandas versions).
data.groupby(['cc_num', 'category'])['amt'].agg(['mean', 'std']).reset_index()

Unnamed: 0,cc_num,category,mean,std
0,60416207185,entertainment,51.838855,65.485714
1,60416207185,food_dining,26.737097,46.382603
2,60416207185,gas_transport,59.779429,15.758267
3,60416207185,grocery_net,52.152973,17.694871
4,60416207185,grocery_pos,101.557761,21.894156
...,...,...,...,...
13166,4992346398065154184,misc_pos,60.003043,167.226191
13167,4992346398065154184,personal_care,48.777227,49.523818
13168,4992346398065154184,shopping_net,86.280136,217.047242
13169,4992346398065154184,shopping_pos,64.213333,169.239073


In [20]:
# Per-(card, category) mean and std of the amount, with both keys restored as
# regular columns for the merge that follows.
# 'amt' is selected before agg() so the statistics are not attempted on the
# non-numeric columns of the frame (a TypeError on recent pandas versions).
category_info = data.groupby(['cc_num', 'category'])['amt'].agg(['mean', 'std']).reset_index()

In [21]:
# Left-join the per-(card, category) statistics onto each transaction;
# the key columns are given in the order cc_num, category.
data = pd.merge(data, category_info, how='left', on=['cc_num', 'category'])

In [22]:
# Z-score of the amount relative to the card's spending in this category,
# then discard the helper statistics.
data['cat_z_score'] = data['amt'].sub(data['mean']).div(data['std'])
data.drop(columns=['mean', 'std'], inplace=True)

### 거리
 - 고객위치와 상점위치간의 거리 계산
 - 거리에 대한 Z점수 활용 → 기존 패턴에서 벗어난 거래 감지 가능

 - 두 지점 거리 계산에 geopy 라이브러리 사용
  
   > geopy 라이브러리의 distance 모듈

   >
    ```
    geopy.distance.distance((위도1, 경도1), (위도2, 경도2)).km
    # lat(위도), long(경도)
    ```



In [23]:
import geopy.distance

In [24]:
# zip() returns a lazy iterator object; the pairs are only produced when it is consumed.
zip(data['merch_lat'], data['merch_long'])

<zip at 0x7f20897fcd70>

In [25]:
# Pair latitude/longitude into (lat, long) tuples, the form geopy expects.
# A plain list is assigned instead of a new pd.Series: a fresh Series carries
# its own 0..n-1 index and is aligned to data's index on assignment, which
# would yield NaN or mismatched rows whenever data is not on a default index.
# A list is assigned by position, so it is correct for any index.
data['merch_coord'] = list(zip(data['merch_lat'], data['merch_long']))  # merchant location
data['cust_coord'] = list(zip(data['lat'], data['long']))  # customer location

 - geopy.distance.distance() 함수를 DataFrame에 apply()로 적용할 예정

In [26]:
# geopy.distance.distance(data['merch_coord'], data['cust_coord']) # 에러 발생
# ValueError: When creating a Point from sequence, it must not have more than 3 items.

In [27]:
# distance() works on one pair of points per call; it does not broadcast over Series.
geopy.distance.distance(data.loc[0, 'merch_coord'], data.loc[0, 'cust_coord'])

Distance(78.77382075373654)

 - apply()를 활용하여 복수의 변수를 활용한 func를 적용 + DataFrame의 여러 데이터를 한번에 처리할 수 있다

 - DataFrame.apply()에서 DataFrame을 인자로 받을때는 lambda x를 활용하면, 복수의 변수를 활용하는 func를 적용할 수 있다

In [28]:
import time

# Row-wise distance between merchant and customer, timed with wall-clock time.
# The resulting column holds geopy Distance objects; appending .km to the call
# inside the lambda would store plain floats instead.
start_time = time.time()
data['distance'] = data.apply(
    lambda row: geopy.distance.distance(row['merch_coord'], row['cust_coord']),
    axis=1,
)
end_time = time.time()
elapsed = end_time - start_time
print(elapsed)

1660892229.9118953 1660891738.9733617


In [56]:
# The distance column holds geopy Distance objects (object dtype), which the
# later groupby aggregation cannot use, so convert it to kilometres as float64.
# The .km attribute is read directly instead of slicing " km" off the string
# form: the slicing depended on the text representation and, if this cell was
# re-run on already-converted floats, silently chopped the last three digits.
# Values without a .km attribute (already numeric) pass through unchanged,
# which makes the cell safe to re-run.
data['distance'] = data['distance'].map(lambda d: getattr(d, 'km', d)).astype('float64')

In [58]:
# Per-card mean/std of the customer-merchant distance.
# 'distance' is selected before agg() so the statistics are not attempted on
# the non-numeric columns (strings, coordinate tuples), which recent pandas
# versions reject with a TypeError.
distance_info = data.groupby('cc_num')['distance'].agg(['mean', 'std']).reset_index()
data = data.merge(distance_info, on='cc_num', how='left')  # attach the statistics to each transaction
data['distance_z_score'] = (data['distance'] - data['mean']) / data['std']  # z-score of the distance
data.drop(['mean', 'std'], axis=1, inplace=True)  # helper columns are no longer needed
data.head()

Unnamed: 0,trans_date_trans_time,cc_num,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud,amt_z_score,cat_z_score,merch_coord,cust_coord,distance,distance_z_score
0,2019-01-01 00:00:18,2703186189652095,misc_net,4.97,F,36.0788,-81.1781,3495,1988-03-09,36.011293,-82.048315,0,-0.662108,-0.688297,"(36.011293, -82.048315)","(36.0788, -81.1781)",78.773821,0.030974
1,2019-01-01 16:53:34,2703186189652095,kids_pets,114.79,F,36.0788,-81.1781,3495,1988-03-09,35.083586,-80.625951,0,0.199022,1.022751,"(35.083586, -80.625951)","(36.0788, -81.1781)",121.231332,1.442813
2,2019-01-01 23:02:37,2703186189652095,kids_pets,16.55,F,36.0788,-81.1781,3495,1988-03-09,35.532409,-82.170017,0,-0.571306,-0.847664,"(35.532409, -82.170017)","(36.0788, -81.1781)",108.226552,1.010365
3,2019-01-03 01:40:38,2703186189652095,misc_net,27.7,F,36.0788,-81.1781,3495,1988-03-09,35.315851,-80.666723,0,-0.483876,-0.492467,"(35.315851, -80.666723)","(36.0788, -81.1781)",96.477834,0.619685
4,2019-01-03 15:24:11,2703186189652095,entertainment,43.72,F,36.0788,-81.1781,3495,1988-03-09,36.956094,-80.34352,0,-0.358259,-0.665418,"(36.956094, -80.34352)","(36.0788, -81.1781)",122.740761,1.493006


### 나이 구하기
 - Pandas의 datetime 형식 Series에서 dt 접근자를 사용하면 해당 값에 대한 년, 월, 일 등을 분리해낼 수 있음

In [67]:
from datetime import datetime

In [85]:
# datetime.strptime(data['dob'], "%m/%d/%Y") # strptime()의 파라미터에 시리즈가 들어갈 수 없음
# TypeError: strptime() argument 1 must be str, not Series

In [80]:
# dob holds 'YYYY-MM-DD' strings (object dtype) and still has to be converted to datetime.
data.loc[:, 'dob']

0          1988-03-09
1          1988-03-09
2          1988-03-09
3          1988-03-09
4          1988-03-09
              ...    
1852389    1959-03-03
1852390    1959-03-03
1852391    1959-03-03
1852392    1959-03-03
1852393    1959-03-03
Name: dob, Length: 1852394, dtype: object

 - 1900-01-01 등의 object형식의 Series를 datetime 형식으로 변경하는 방법

  > 첫번째방법 : pandas 활용(포맷지정은 선택사항, 간편)
```
pd.to_datetime(Series)
```
  > 두번째방법 : datetime 모듈 활용(포맷지정 필수, 빠른 연산)
```
from datetime import datetime
Series.apply(lambda x: datetime.strptime(x, format))
```
 > ※ 주의사항 : datetime.strptime(Series, format) 오류 → Series 사용불가


In [84]:
# 첫번째방법
pd.to_datetime(data['dob'])

0         1988-03-09
1         1988-03-09
2         1988-03-09
3         1988-03-09
4         1988-03-09
             ...    
1852389   1959-03-03
1852390   1959-03-03
1852391   1959-03-03
1852392   1959-03-03
1852393   1959-03-03
Name: dob, Length: 1852394, dtype: datetime64[ns]

In [83]:
# 두번째방법
data['dob'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))

0         1988-03-09
1         1988-03-09
2         1988-03-09
3         1988-03-09
4         1988-03-09
             ...    
1852389   1959-03-03
1852390   1959-03-03
1852391   1959-03-03
1852392   1959-03-03
1852393   1959-03-03
Name: dob, Length: 1852394, dtype: datetime64[ns]

In [88]:
# datetime 형식의 Series에서 dt.year 사용가능
data['year'] = data['dob'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d")).dt.year

In [89]:
# Drop the id, raw coordinate and date-of-birth columns now that the derived
# features exist, then preview the result.
helper_columns = ['cc_num', 'lat', 'long', 'merch_lat', 'merch_long',
                  'dob', 'merch_coord', 'cust_coord']
data.drop(columns=helper_columns, inplace=True)
data.head()

Unnamed: 0,trans_date_trans_time,category,amt,gender,city_pop,is_fraud,amt_z_score,cat_z_score,distance,distance_z_score,year
0,2019-01-01 00:00:18,misc_net,4.97,F,3495,0,-0.662108,-0.688297,78.773821,0.030974,1988
1,2019-01-01 16:53:34,kids_pets,114.79,F,3495,0,0.199022,1.022751,121.231332,1.442813,1988
2,2019-01-01 23:02:37,kids_pets,16.55,F,3495,0,-0.571306,-0.847664,108.226552,1.010365,1988
3,2019-01-03 01:40:38,misc_net,27.7,F,3495,0,-0.483876,-0.492467,96.477834,0.619685,1988
4,2019-01-03 15:24:11,entertainment,43.72,F,3495,0,-0.358259,-0.665418,122.740761,1.493006,1988


### 새 변수 만들기
 - object 변수인 category와 gender는 더미 변수로 변환하여 활용

In [90]:
# One-hot encode the two string columns, dropping the first level of each.
categorical_columns = ['category', 'gender']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

 - trans_date_trans_time은 예측에는 필요 X
 - 훈련셋을 분리시키는데 활용
 - 모델링에는 활용하지 않으므로 index로 설정

In [91]:
# Move the timestamp into the index: it is used for the train/test split
# but is not a model feature.
data = data.set_index('trans_date_trans_time')
data.head()

Unnamed: 0_level_0,amt,city_pop,is_fraud,amt_z_score,cat_z_score,distance,distance_z_score,year,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
trans_date_trans_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019-01-01 00:00:18,4.97,3495,0,-0.662108,-0.688297,78.773821,0.030974,1988,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2019-01-01 16:53:34,114.79,3495,0,0.199022,1.022751,121.231332,1.442813,1988,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2019-01-01 23:02:37,16.55,3495,0,-0.571306,-0.847664,108.226552,1.010365,1988,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2019-01-03 01:40:38,27.7,3495,0,-0.483876,-0.492467,96.477834,0.619685,1988,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2019-01-03 15:24:11,43.72,3495,0,-0.358259,-0.665418,122.740761,1.493006,1988,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [101]:
# Persist the final feature table to Google Drive.
# Drive is mounted here because this cell sits before the notebook's own mount
# cell; on a fresh top-to-bottom run the target folder would otherwise not
# exist and to_csv would fail. Mounting an already-mounted Drive is harmless.
from google.colab import drive
drive.mount('/content/drive')
data.to_csv('/content/drive/MyDrive/Colab Notebooks/lgbdata.csv')
# To reload later with the timestamp index restored:
# data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/lgbdata.csv',
#                    index_col='trans_date_trans_time', parse_dates=True)

## 모델링 및 평가하기
 - 지금까지 발생한 거래 데이터를 기반으로(과거)
 - 앞으로 일어나는 거래에 대한 이상여부 예측
 - 따라서, train_test_split()사용 X
 - 특정 날짜를 기준으로 훈련셋/시험셋 나눔

  > 2020년7월~12월 데이터를 시험셋

In [94]:
# Chronological split: everything before the cutoff trains the model,
# everything from the cutoff onward is held out for testing.
SPLIT_DATE = '2020-07-01'
train = data.loc[data.index < SPLIT_DATE]
test = data.loc[data.index >= SPLIT_DATE]

In [95]:
# Share of all rows that ended up in the test set.
test.shape[0] / data.shape[0]

0.2837738623640543

In [96]:
# Separate the label column from the feature columns for both splits.
target = 'is_fraud'
X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

In [97]:
import lightgbm as lgb

In [98]:
# Baseline LightGBM classifier with default hyperparameters and a fixed seed;
# fit() returns the estimator itself, so construction and fitting are chained.
model_1 = lgb.LGBMClassifier(random_state=100).fit(X_train, y_train)
pred_1 = model_1.predict(X_test)

In [99]:
# Mount Google Drive at /content/drive (Colab only); the save cells in this
# notebook write their files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [102]:
import joblib

# Serialise the fitted model to Google Drive; dump() returns the list of
# written file names, which is displayed as the cell output.
model_path = '/content/drive/MyDrive/Colab Notebooks/lgb_model_1.pkl'
joblib.dump(model_1, model_path)
# To restore it later: model_1 = joblib.load('/content/drive/MyDrive/Colab Notebooks/lgb_model_1.pkl')

['/content/drive/MyDrive/Colab Notebooks/lgb_model_1.pkl']