In [1]:
import pandas as pd
import matplotlib.pylab as plt
import statsmodels.api as sm
import numpy as np
import seaborn as sns
from scipy import stats

### Raw 데이터 읽기

In [2]:
# 카테고리
read_path = '../csv/categories.csv'
df_category = pd.read_csv(read_path)

df_category['category'] = df_category['category'].astype(str)
df_category = df_category[['category', 'name']]

df_category.tail()

Unnamed: 0,category,name
1219,240060020,프로그래머
1220,240070040,설문조사
1221,240080060,유아유치원
1222,240100010,매장관리
1223,240100090,기타 업종


In [13]:
# 번개페이 거래 데이터
read_path = '../csv/pay_seller_buyer.csv'
pay_raw = pd.read_csv(read_path)

# change type to str
for col in ['buyer_id', 'seller_id']:
    
    df = pay_raw
    
    df[col] = df[col].astype('str')
    
# order_status_cd 중 아래 3개 제외하기, 결제하기 전 시도에서 drop된 케이스 삭제
# purchase_created: 결제 진행중(주문서 페이지 랜딩)
# purchase_verified: 결제 진행중(결제모듈 떠있는 상태)
# payment_failed: 결제 실패

drop_status = ['purchase_created', 'purchase_verified', 'payment_failed']
pay_raw = pay_raw[~pay_raw.order_status_cd.isin(drop_status)]

pay_raw = pay_raw.rename(columns={'buyer_id': 'buyer_uid', 'seller_id': 'seller_uid'})

pay_raw.tail()

Unnamed: 0,date,order_status_cd,buyer_uid,seller_uid,total_price,category_id
3878869,2019-06-30,in_transit,9031429,5415988,111210,310010010
3878874,2019-06-30,cancelled,942104,6650642,457750,320080100
3878875,2019-06-30,in_transit,3903914,2653493,124600,910100004
3878891,2019-06-30,in_transit,5733409,4416259,132950,600700001
3878896,2019-06-30,refunded,3785052,6699706,152410,900320100


In [14]:
# 거래관련 상담센터 신고 list
read_path = '../csv/help_reporter_frauder.csv'
help_raw = pd.read_csv(read_path)

help_raw['fraud_help'] = 1

help_raw = help_raw.sort_values(['reporter_uid', 'fraud_uid', 'created_at'])  # sort again, ascending

help_raw = help_raw.drop_duplicates(['reporter_uid', 'fraud_uid'])  # drop duplicated row, keep 1st row

# change type to str
for col in ['reporter_uid', 'fraud_uid']:
    
    df = help_raw
    
    df[col] = df[col].astype('str')

help_raw.tail()

Unnamed: 0,discussion_id,created_at,reporter_uid,fraud_uid,fraud_help
197015,838830,2019-07-05 07:56:30,9317104,0,1
196983,838715,2019-07-04 23:36:37,9317700,8767099,1
196782,837888,2019-07-03 21:12:13,9322294,0,1
196784,837893,2019-07-03 21:17:22,9322294,9320707,1
196876,838255,2019-07-04 13:55:10,9323927,4623330,1


In [15]:
# 상담센터 신고 내역 list의 신고 대상자
help_raw_frauder = help_raw.groupby(['fraud_uid'], as_index=False).count()

help_raw_frauder = pd.DataFrame(help_raw_frauder['fraud_uid'])

help_raw_frauder = help_raw_frauder[(help_raw_frauder['fraud_uid'] != '0') & (help_raw_frauder['fraud_uid'] != '1')]

help_raw_frauder['frauder'] = 1

help_raw_frauder.tail()

Unnamed: 0,fraud_uid,frauder
103364,Keycaps,1
103365,UI-Return,1
103366,UIKBInputBackdropView:0x151259440.Height{id: 5...,1
103367,fontName: com.apple.kUIFontMagicSystemFontName...,1
103368,undefined,1


### Merge

In [16]:
pay_help = pd.merge(pay_raw, help_raw, 
                            left_on=['buyer_uid', 'seller_uid'],
                            right_on=['reporter_uid', 'fraud_uid'],
                            how='left')


pay_help['fraud_help'] = pay_help['fraud_help'].fillna(0)
pay_help['month_at'] = pay_help['date'].astype(str).str[:7]

# 1-depth 카테고리 붙이기
pay_help['category_3'] = pay_help['category_id'].astype(str).str[:3]

pay_help = pd.merge(pay_help, df_category, left_on='category_3', right_on='category', how='left')

# drop columns
pay_help = pay_help.drop(['discussion_id', 'created_at', 'reporter_uid', 
                          'fraud_uid', 'category_3'], axis=1)

# 10만원 단위 구간
pay_help['price_range'] = (pay_help['total_price'] / 100000).apply(np.floor) * 100000

# 5만원 이하
pay_help['less_50,000'] = pay_help['total_price'].apply(lambda x: 1 if x <= 50000 else 0)

# 3만원 이하
pay_help['less_30,000'] = pay_help['total_price'].apply(lambda x: 1 if x <= 30000 else 0)


pay_help.tail()

Unnamed: 0,date,order_status_cd,buyer_uid,seller_uid,total_price,category_id,fraud_help,month_at,category,name,price_range,"less_50,000","less_30,000"
304368,2019-06-30,in_transit,9031429,5415988,111210,310010010,0.0,2019-06,310,여성의류,100000.0,0,0
304369,2019-06-30,cancelled,942104,6650642,457750,320080100,0.0,2019-06,320,남성의류,400000.0,0,0
304370,2019-06-30,in_transit,3903914,2653493,124600,910100004,0.0,2019-06,910,스타굿즈,100000.0,0,0
304371,2019-06-30,in_transit,5733409,4416259,132950,600700001,0.0,2019-06,600,디지털/가전,100000.0,0,0
304372,2019-06-30,refunded,3785052,6699706,152410,900320100,0.0,2019-06,900,도서/티켓/취미/애완,100000.0,0,0


### Aggregate

In [17]:
pay_help_result = pay_help.groupby(['month_at', 'category', 'name', 'order_status_cd', 
                                    'fraud_help', 'price_range', 'less_50,000', 'less_30,000'
                            ], as_index=False).agg({'total_price': 'sum', 'date': 'count'})


pay_help_result = pay_help_result.rename(columns={'total_price': 'price_sum', 'date': 'trx_count'})

# save
save_path = '../csv/result/pay_help_result.csv'
pay_help_result.to_csv(save_path, index=False, mode='w', header=True)

pay_help_result.tail()

Unnamed: 0,month_at,category,name,order_status_cd,fraud_help,price_range,"less_50,000","less_30,000",price_sum,trx_count
12268,2019-06,999,기타,vbank_expired,0.0,0.0,1,0,99420,3
12269,2019-06,999,기타,vbank_expired,0.0,0.0,1,1,48391,7
12270,2019-06,999,기타,vbank_expired,0.0,100000.0,0,0,746930,7
12271,2019-06,999,기타,waiting_bank_account_for_refund,0.0,0.0,0,0,155250,3
12272,2019-06,999,기타,waiting_bank_account_for_refund,0.0,0.0,1,0,31450,1
