In [35]:
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

### Raw 데이터 읽기

In [36]:
# Category lookup table: category code -> display name
read_path = '../csv/categories.csv'

df_category = (
    pd.read_csv(read_path)
    # store the codes as strings; the merge cell joins them against a string key
    .assign(category=lambda frame: frame['category'].astype(str))
    # keep only the two columns used downstream
    .loc[:, ['category', 'name']]
)

df_category.tail()

Unnamed: 0,category,name
1219,240060020,프로그래머
1220,240070040,설문조사
1221,240080060,유아유치원
1222,240100010,매장관리
1223,240100090,기타 업종


In [37]:
# Completed bunp transaction history
read_path = '../csv/bunp_history.csv'

# Load the history and expose the seller_pid_price column under the name total_price
bunp_raw = pd.read_csv(read_path).rename(columns={'seller_pid_price': 'total_price'})

bunp_raw.tail()

Unnamed: 0,updated_at,category,buyer_uid,seller_uid,total_price
10933687,2019-07-02 09:22:43,600700002,428300,9139471,20000
10933688,2019-07-02 10:07:24,900220999,4583984,3727301,24000
10933689,2019-07-02 11:42:04,700100100,9303879,5499956,80000
10933690,2019-07-02 13:10:52,400010100,8693877,5293606,38000
10933691,2019-07-02 13:15:25,600700999,5696564,9138584,70000


In [38]:
# Bunp transactions from 2017 onward
MIN_YEAR = 2017

# The year is the leading 'YYYY' of the updated_at timestamp string. It is computed
# as a standalone Series instead of a helper column, so bunp_raw itself is never
# modified (previously `bunp_raw_17 = bunp_raw` was an alias and the temporary
# 'year' column was written into the raw frame and left there).
txn_year = bunp_raw['updated_at'].astype(str).str[:4].astype(int)

# .copy() makes the filtered frame independent of bunp_raw, so later cells can
# assign columns on it without triggering SettingWithCopyWarning.
bunp_raw_17 = bunp_raw.loc[txn_year >= MIN_YEAR].copy()

bunp_raw_17.tail()

Unnamed: 0,updated_at,category,buyer_uid,seller_uid,total_price
10933687,2019-07-02 09:22:43,600700002,428300,9139471,20000
10933688,2019-07-02 10:07:24,900220999,4583984,3727301,24000
10933689,2019-07-02 11:42:04,700100100,9303879,5499956,80000
10933690,2019-07-02 13:10:52,400010100,8693877,5293606,38000
10933691,2019-07-02 13:15:25,600700999,5696564,9138584,70000


In [39]:
# Transaction-related reports filed with the support center
read_path = '../csv/help_reporter_frauder.csv'

# A report is identified by who filed it and who it was filed against
pair_keys = ['reporter_uid', 'fraud_uid']

help_raw = (
    pd.read_csv(read_path)
    .assign(fraud_help=1)                      # marker column: every row here is a report
    .sort_values(pair_keys + ['created_at'])   # ascending, so each pair's rows are ordered by created_at
    .drop_duplicates(pair_keys)                # keep only the first row of each (reporter, reported) pair
)

help_raw.tail()

Unnamed: 0,discussion_id,created_at,reporter_uid,fraud_uid,fraud_help
197015,838830,2019-07-05 07:56:30,9317104,0,1
196983,838715,2019-07-04 23:36:37,9317700,8767099,1
196782,837888,2019-07-03 21:12:13,9322294,0,1
196784,837893,2019-07-03 21:17:22,9322294,9320707,1
196876,838255,2019-07-04 13:55:10,9323927,4623330,1


### Merge

In [40]:
# Cast the join keys to str on both sides so the merge compares like with like.
# astype returns a new frame, so nothing is written into the existing objects;
# the names are rebound so that later cells still see str-typed uid columns.
help_raw = help_raw.astype({'reporter_uid': str, 'fraud_uid': str})
bunp_raw_17 = bunp_raw_17.astype({'buyer_uid': str, 'seller_uid': str})

n_transactions = len(bunp_raw_17)

# Flag each transaction whose (buyer, seller) pair matches a (reporter, reported) pair
bunp_help = pd.merge(
    bunp_raw_17,
    help_raw,
    left_on=['buyer_uid', 'seller_uid'],
    right_on=['reporter_uid', 'fraud_uid'],
    how='left',
)

# help_raw is expected to hold one row per (reporter_uid, fraud_uid) pair (it is
# de-duplicated on those keys when loaded), so a left join must not add rows.
# Extra rows would silently inflate the sums and counts computed later.
assert len(bunp_help) == n_transactions, (
    'report merge changed the number of transactions: '
    '{} -> {}'.format(n_transactions, len(bunp_help))
)

bunp_help = (
    bunp_help
    .assign(
        # transactions without a matching report get 0 instead of NaN
        fraud_help=lambda frame: frame['fraud_help'].fillna(0),
        # 'YYYY-MM' prefix of the timestamp string
        month_at=lambda frame: frame['updated_at'].astype(str).str[:7],
    )
    .drop(columns=['updated_at', 'created_at', 'discussion_id', 'reporter_uid', 'fraud_uid'])
)

# Attach the 1-depth category: the first three characters of the category code
# are looked up in the category table to get the category name.
bunp_help = (
    bunp_help
    .assign(category_3=lambda frame: frame['category'].astype(str).str[:3])
    # the full code is no longer needed; dropping it before the merge avoids
    # relying on the implicit category_x / category_y suffixes
    .drop(columns=['category'])
    .merge(
        df_category.rename(columns={'category': 'category_3'}),
        on='category_3',
        how='left',
    )
)

# A duplicated code in the category table would multiply the matching transactions.
assert len(bunp_help) == n_transactions, (
    'category merge changed the number of transactions: '
    '{} -> {}'.format(n_transactions, len(bunp_help))
)

bunp_help.tail()

Unnamed: 0,buyer_uid,seller_uid,total_price,fraud_help,month_at,category_3,name
10837553,428300,9139471,20000,0.0,2019-07,600,디지털/가전
10837554,4583984,3727301,24000,0.0,2019-07,900,도서/티켓/취미/애완
10837555,9303879,5499956,80000,0.0,2019-07,700,스포츠/레저
10837556,8693877,5293606,38000,0.0,2019-07,400,패션잡화
10837557,5696564,9138584,70000,0.0,2019-07,600,디지털/가전


In [41]:
# Bucket every transaction into a fixed-width price band, labelled by the band's
# lower bound (e.g. 0, 100000, 200000, ...), in the same unit as total_price.
PRICE_BAND_WIDTH = 100000
bunp_help['price_range'] = np.floor(bunp_help['total_price'] / PRICE_BAND_WIDTH) * PRICE_BAND_WIDTH

### Aggregate

In [43]:
# Total price and number of transactions per
# month / 1-depth category / price band / report flag.
group_keys = ['month_at', 'category_3', 'name', 'price_range', 'fraud_help']

# groupby discards rows whose group key is NaN. `name` comes from a left merge
# against the category table, so it is NaN for any category_3 without a match;
# those transactions would silently vanish from the totals. Label them with an
# explicit placeholder instead so they stay in the result.
UNMAPPED_CATEGORY_NAME = 'UNKNOWN'
bunp_help_grouped = bunp_help.assign(name=bunp_help['name'].fillna(UNMAPPED_CATEGORY_NAME))

bunp_help_result = (
    bunp_help_grouped
    .groupby(group_keys, as_index=False)
    # buyer_uid is only used as a column to count rows per group
    .agg({'total_price': 'sum', 'buyer_uid': 'count'})
    .rename(columns={'total_price': 'price_sum', 'buyer_uid': 'trx_count'})
)

# save
save_path = '../csv/tmp/bunp_help_result.csv'
# to_csv does not create missing directories; make sure the target folder exists
os.makedirs(os.path.dirname(save_path), exist_ok=True)
bunp_help_result.to_csv(save_path, index=False, mode='w', header=True)

bunp_help_result.tail()

Unnamed: 0,month_at,category_3,name,price_range,fraud_help,price_sum,trx_count
12150,2019-07,999,기타,300000.0,0.0,1008000,3
12151,2019-07,999,기타,400000.0,0.0,3426000,7
12152,2019-07,999,기타,1000000.0,0.0,1023560,1
12153,2019-07,999,기타,1600000.0,0.0,1600000,1
12154,2019-07,999,기타,3500000.0,0.0,3500000,1
