# LightGBM

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

file_url = 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/fraud.csv'
data = pd.read_csv(file_url)

In [2]:
pd.options.display.max_columns = 40

In [3]:
data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
data.info(show_counts=True) #데이터가 너무 많으면 not-null count가 안나오는데 강제로 보이도록 한다

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1852394 non-null  object 
 1   cc_num                 1852394 non-null  int64  
 2   merchant               1852394 non-null  object 
 3   category               1852394 non-null  object 
 4   amt                    1852394 non-null  float64
 5   first                  1852394 non-null  object 
 6   last                   1852394 non-null  object 
 7   gender                 1852394 non-null  object 
 8   street                 1852394 non-null  object 
 9   city                   1852394 non-null  object 
 10  state                  1852394 non-null  object 
 11  zip                    1852394 non-null  int64  
 12  lat                    1852394 non-null  float64
 13  long                   1852394 non-null  float64
 14  city_pop          

In [5]:
round(data.describe(), 2)

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0
mean,4.17386e+17,70.06,48813.26,38.54,-90.23,88643.67,1358674000.0,38.54,-90.23,0.01
std,1.309115e+18,159.25,26881.85,5.07,13.75,301487.62,18195080.0,5.11,13.76,0.07
min,60416210000.0,1.0,1257.0,20.03,-165.67,23.0,1325376000.0,19.03,-166.67,0.0
25%,180042900000000.0,9.64,26237.0,34.67,-96.8,741.0,1343017000.0,34.74,-96.9,0.0
50%,3521417000000000.0,47.45,48174.0,39.35,-87.48,2443.0,1357089000.0,39.37,-87.44,0.0
75%,4642255000000000.0,83.1,72042.0,41.94,-80.16,20328.0,1374581000.0,41.96,-80.25,0.0
max,4.992346e+18,28948.9,99921.0,66.69,-67.95,2906700.0,1388534000.0,67.51,-66.95,1.0


In [6]:
# is_fraud의 mean 값이 0.01로 매우 작은데 이럴 경우에는 99%이더라도 좋은 모델이라고 보장할 수 없다. => 비대칭 데이터, 오버샘플링 해주어서 예측 정확도를 높일 수 있다

In [7]:
data.drop(['first', 'last', 'street', 'city', 'state', 'zip', 'trans_num', 'unix_time', 'job', 'merchant'], axis=1, inplace=True)

In [8]:
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time']) # 날짜형식으로 변환

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 12 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   category               object        
 3   amt                    float64       
 4   gender                 object        
 5   lat                    float64       
 6   long                   float64       
 7   city_pop               int64         
 8   dob                    object        
 9   merch_lat              float64       
 10  merch_long             float64       
 11  is_fraud               int64         
dtypes: datetime64[ns](1), float64(5), int64(3), object(3)
memory usage: 169.6+ MB


### 피처엔지니어링 - 1) 결제금액

In [10]:
# 평균과 표준편차는 가 ㄱ개인에 대한 거래 내역이므로 cc_num 기준으로 groupby()
amt_info = data.groupby('cc_num').agg(['mean', 'std'])['amt'].reset_index()

  amt_info = data.groupby('cc_num').agg(['mean', 'std'])['amt'].reset_index()


In [11]:
amt_info.head()

Unnamed: 0,cc_num,mean,std
0,60416207185,59.257796,142.869746
1,60422928733,65.483159,92.042844
2,60423098130,96.376084,1000.693872
3,60427851591,107.48755,131.014534
4,60487002085,64.096925,153.20766


In [12]:
data = data.merge(amt_info, on = 'cc_num', how='left')

In [13]:
data['amt_z_score'] = (data['amt'] - data['mean']) / data['std']

In [14]:
data[['amt', 'mean', 'std', 'amt_z_score']].head()

Unnamed: 0,amt,mean,std,amt_z_score
0,4.97,89.408743,127.530101,-0.662108
1,107.23,56.078113,159.201852,0.321302
2,220.11,69.924272,116.688602,1.287064
3,45.0,80.09004,280.07788,-0.125287
4,41.96,95.341146,94.322842,-0.565941


In [15]:
data.drop(['mean', 'std'], axis = 1, inplace=True)

### 피처엔지니어링 - 2) 범주

In [16]:
category_info = data.groupby(['cc_num', 'category']).agg(['mean', 'std'])['amt'].reset_index()

  category_info = data.groupby(['cc_num', 'category']).agg(['mean', 'std'])['amt'].reset_index()


In [17]:
data = data.merge(category_info, on=['cc_num', 'category'], how='left')

In [18]:
data['cat_z_score'] = (data['amt'] - data['mean']) / data['std']
data[['cat_z_score', 'amt', 'mean', 'std']]

Unnamed: 0,cat_z_score,amt,mean,std
0,-0.688297,4.97,84.860809,116.070300
1,0.317631,107.23,99.637224,23.904424
2,2.872509,220.11,46.653103,60.385161
3,-1.050197,45.00,61.537283,15.746841
4,1.312866,41.96,35.481357,4.934731
...,...,...,...,...
1852389,-0.047862,43.77,45.930979,45.150024
1852390,1.163822,111.84,50.923503,52.341751
1852391,0.440137,86.88,63.856707,52.309370
1852392,-0.007423,7.99,8.192245,27.244418


In [19]:
data.drop(['mean', 'std'], axis=1, inplace=True)

### 피처엔지니어링 - 3) 거리

In [20]:
import geopy.distance

In [21]:
# geop.distance.distance((위도1, 경도1), (위도2, 경도2))

In [22]:
#data['merch_coord'] = pd.Series(zip(data['merch_lat'], data['merch_long']))

In [23]:
#data['cust_coord'] = pd.Series(zip(data['lat'], data['long']))

In [24]:
# data[['merch_coord', 'cust_coord']].head()

In [25]:
# import time

In [26]:
# start_time  = time.time()
# data['distance'] = data.apply(lambda x: geopy.distance.distance(x['merch_coord'], x['cust_coord']).km, axis=1)
# end_time = time.time()
# print(end_time - start_time)

In [27]:
# data['distance']

In [28]:
# distance_info = data.groupby('cc_num').agg(['mean', 'std'])['distance'].reset_index()
# data = data.merge(distance_info, on ='cc_num', how ='left')
# data['distance_z_score'] = (data['distance'] - data['mean']) / data['std']
# data.drop(['mean', 'std'], axis = 1, inplace=True)
# data.head()

### 피처엔지니어링 - 4) 나이

In [29]:
data['age'] = 2024 - pd.to_datetime(data['dob']).dt.year
data[['dob', 'age']]

Unnamed: 0,dob,age
0,1988-03-09,36
1,1978-06-21,46
2,1962-01-19,62
3,1967-01-12,57
4,1986-03-28,38
...,...,...
1852389,1966-02-13,58
1852390,1999-12-27,25
1852391,1981-11-29,43
1852392,1965-12-15,59


In [30]:
# data.drop(['cc_num', 'lat', 'long', 'merch_lat', 'merch_long', 'dob', 'merch_coord', 'cust_coord'], axis=1, inplace=True)
data.drop(['cc_num', 'lat', 'long', 'merch_lat', 'merch_long', 'dob'], axis=1, inplace=True)

data.head()

Unnamed: 0,trans_date_trans_time,category,amt,gender,city_pop,is_fraud,amt_z_score,cat_z_score,age
0,2019-01-01 00:00:18,misc_net,4.97,F,3495,0,-0.662108,-0.688297,36
1,2019-01-01 00:00:44,grocery_pos,107.23,F,149,0,0.321302,0.317631,46
2,2019-01-01 00:00:51,entertainment,220.11,M,4154,0,1.287064,2.872509,62
3,2019-01-01 00:01:16,gas_transport,45.0,M,1939,0,-0.125287,-1.050197,57
4,2019-01-01 00:03:06,misc_pos,41.96,M,99,0,-0.565941,1.312866,38


In [31]:
data = pd.get_dummies(data, columns = ['category', 'gender'], drop_first = True)

In [32]:
#  trans_date_trans_time은 예측에 필요하지 않지만 훈련셋을 분리시키는데 활용예정. 
# 모델링할 때는 필요하지 않으므로 Index로 들어가도록 수정
data.set_index('trans_date_trans_time', inplace=True)
data.head()

Unnamed: 0_level_0,amt,city_pop,is_fraud,amt_z_score,cat_z_score,age,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
trans_date_trans_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-01 00:00:18,4.97,3495,0,-0.662108,-0.688297,36,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2019-01-01 00:00:44,107.23,149,0,0.321302,0.317631,46,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2019-01-01 00:00:51,220.11,4154,0,1.287064,2.872509,62,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2019-01-01 00:01:16,45.0,1939,0,-0.125287,-1.050197,57,0,1,0,0,0,0,0,0,0,0,0,0,0,1
2019-01-01 00:03:06,41.96,99,0,-0.565941,1.312866,38,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [33]:
# 지금까지 발생한 거래 데이터를 기반으로 모델을 학습시키고, 그 모델을 이용해 앞으로 일어나는 거래에 대한 이상 여부를 예측해야 한다.
train = data[data.index < '2020-07-01']
test = data[data.index >= '2020-07-01']
len(test) / len(train) #시험 셋 비율

0.3962070740684071

In [34]:
X_train = train.drop('is_fraud', axis = 1)
X_test = test.drop('is_fraud', axis = 1)
y_train = train['is_fraud']
y_test = test['is_fraud']

In [35]:
import lightgbm as lgb

In [36]:
# LIghtGBM을 학습시키는 함수로는 회귀, 분류, train()함수 등이 있다 
# model_1 = lgb.LGBMClassifier(random_state = 100)
# model_1.fit(X_train, y_train)
# pred_1 = model_1.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score 

accuracy_score(y_test, pred_1)

NameError: name 'pred_1' is not defined

In [None]:
# proba_1 = model_1.predict_proba(X_test) #예측
# proba_1 # 0에 대한 예측값, 1에 대한 예측값

In [None]:
# proba_int1 = (proba_1 > 0.2).astype('int')
# proba_int2 = (proba_1 > 0.8).astype('int')

In [None]:
# print(confusion_matrix(y_test, proba_int1))