In [11]:
import pandas as pd
import matplotlib.pylab as plt
import statsmodels.api as sm
import numpy as np
import seaborn as sns
from scipy import stats

### Raw 데이터 읽기

In [30]:
# 카테고리
read_path = '../csv/categories.csv'
df_category = pd.read_csv(read_path)

df_category['category'] = df_category['category'].astype(str)
df_category = df_category[['category', 'name']]

df_category.tail()

Unnamed: 0,category,name
1219,240060020,프로그래머
1220,240070040,설문조사
1221,240080060,유아유치원
1222,240100010,매장관리
1223,240100090,기타 업종


In [37]:
# 번프 전체 데이터
read_path = '../csv/bunp_history_all.csv'
bunp_raw = pd.read_csv(read_path)

bunp_raw = bunp_raw.rename(columns={'seller_pid_price': 'total_price'})

bunp_raw.tail()

Unnamed: 0,created_at,category,status,buyer_uid,seller_uid,total_price
14334118,2019-06-30 08:44:07,600400003.0,1,3164423,2941858,29900
14334119,2019-06-30 13:13:12,600700003.0,1,4658579,9264834,6000
14334120,2019-06-30 16:34:13,500113011.0,4,1296566,2775958,45000
14334121,2019-06-30 13:44:19,500119001.0,1,558255,6077583,20000
14334122,2019-06-30 12:58:22,800300001.0,1,9177490,4019525,64500


In [38]:
# 2017년 이후 번프 내역
bunp_raw_17 = bunp_raw

bunp_raw_17['year'] = bunp_raw_17['created_at'].astype(str).str[:4]
bunp_raw_17['year'] = bunp_raw_17['year'].astype(int)

bunp_raw_17 = bunp_raw_17[bunp_raw_17['year'] >= 2017]

# drop columns
bunp_raw_17 = bunp_raw_17.drop(['year'], axis=1)

bunp_raw_17.tail()

Unnamed: 0,created_at,category,status,buyer_uid,seller_uid,total_price
14334118,2019-06-30 08:44:07,600400003.0,1,3164423,2941858,29900
14334119,2019-06-30 13:13:12,600700003.0,1,4658579,9264834,6000
14334120,2019-06-30 16:34:13,500113011.0,4,1296566,2775958,45000
14334121,2019-06-30 13:44:19,500119001.0,1,558255,6077583,20000
14334122,2019-06-30 12:58:22,800300001.0,1,9177490,4019525,64500


In [43]:
# 거래관련 상담센터 신고 list
read_path = '../csv/help_reporter_frauder.csv'
help_raw = pd.read_csv(read_path)

help_raw['fraud_help'] = 1

help_raw = help_raw.sort_values(['reporter_uid', 'fraud_uid', 'created_at'])  # sort again, ascending

help_raw = help_raw.drop_duplicates(['reporter_uid', 'fraud_uid'])  # drop duplicated row, keep 1st row

help_raw = help_raw.rename(columns={'created_at': 'created_at_help'})

help_raw.tail()

Unnamed: 0,discussion_id,created_at_help,reporter_uid,fraud_uid,fraud_help
197015,838830,2019-07-05 07:56:30,9317104,0,1
196983,838715,2019-07-04 23:36:37,9317700,8767099,1
196782,837888,2019-07-03 21:12:13,9322294,0,1
196784,837893,2019-07-03 21:17:22,9322294,9320707,1
196876,838255,2019-07-04 13:55:10,9323927,4623330,1


### Merge

In [45]:
# change type to str
for col in ['reporter_uid', 'fraud_uid']:
    
    df = help_raw
    
    df[col] = df[col].astype('str')

for col in ['buyer_uid', 'seller_uid']:
    
    df = bunp_raw_17
    
    df[col] = df[col].astype('str')

# merge
bunp_help = pd.merge(bunp_raw_17, help_raw, 
                            left_on=['buyer_uid', 'seller_uid'],
                            right_on=['reporter_uid', 'fraud_uid'],
                            how='left')


bunp_help['fraud_help'] = bunp_help['fraud_help'].fillna(0)

bunp_help['month_at'] = bunp_help['created_at'].astype(str).str[:7]
bunp_help = bunp_help.drop(['created_at', 'created_at_help', 'discussion_id', 'reporter_uid', 'fraud_uid'], axis=1)  # drop columns

# 1-depth 카테고리 붙이기
bunp_help['category_3'] = bunp_help['category'].astype(str).str[:3]

bunp_help = pd.merge(bunp_help, df_category, left_on='category_3', right_on='category', how='left')

bunp_help = bunp_help.drop(['category_x', 'category_y'], axis=1)  # drop columns

bunp_help['price_range'] = (bunp_help['total_price'] / 100000).apply(np.floor) * 100000

bunp_help.tail()

Unnamed: 0,status,buyer_uid,seller_uid,total_price,fraud_help,month_at,category_3,name,price_range
14121676,1,3164423,2941858,29900,0.0,2019-06,600,디지털/가전,0.0
14121677,1,4658579,9264834,6000,0.0,2019-06,600,디지털/가전,0.0
14121678,4,1296566,2775958,45000,0.0,2019-06,500,유아동/출산,0.0
14121679,1,558255,6077583,20000,0.0,2019-06,500,유아동/출산,0.0
14121680,1,9177490,4019525,64500,0.0,2019-06,800,생활/문구/가구/식품,0.0


### Aggregate

In [46]:
bunp_help_result = bunp_help.groupby(['month_at', 'category_3', 'name', 'price_range', 'fraud_help'
                            ], as_index=False).agg({'total_price': 'sum', 'buyer_uid': 'count'})

bunp_help_result = bunp_help_result.rename(columns={'total_price': 'price_sum', 'buyer_uid': 'trx_count'})

# save
save_path = '../csv/tmp/bunp_help_result.csv'
bunp_help_result.to_csv(save_path, index=False, mode='w', header=True)

bunp_help_result.tail()

Unnamed: 0,month_at,category_3,name,price_range,fraud_help,price_sum,trx_count
15263,2019-06,999,기타,53800000.0,0.0,53869868,1
15264,2019-06,999,기타,100000000.0,0.0,200000000,2
15265,2019-06,999,기타,111100000.0,0.0,555555555,5
15266,2019-06,999,기타,999900000.0,0.0,3999999996,4
15267,2019-06,999,기타,4294900000.0,0.0,8589934590,2


### help_raw - bunp_raw_17

In [58]:
# 차집합 구하기
help_minus_bunp = pd.merge(bunp_raw_17, help_raw, 
                            left_on=['buyer_uid', 'seller_uid'],
                            right_on=['reporter_uid', 'fraud_uid'],
                            how='outer', indicator=True)

help_minus_bunp = help_minus_bunp[(help_minus_bunp['_merge']=='both') |
                                  (help_minus_bunp['_merge']=='right_only')]

help_minus_bunp = help_minus_bunp.reset_index()[['discussion_id', 
                                                 'created_at', 
                                                 'reporter_uid', 
                                                 'fraud_uid', 
                                                 'fraud_help', 
                                                 'month_at', 
                                                 '_merge']] 

replace_values = {'botn': 'merged', 'right_only': 'missed'}
help_minus_bunp = help_minus_bunp.replace({'_merge' : replace_values})

help_minus_bunp.tail()

Unnamed: 0,discussion_id,created_at,reporter_uid,fraud_uid,fraud_help,month_at,_merge
172318,838830.0,,9317104,0,1.0,2019-07,missed
172319,838715.0,,9317700,8767099,1.0,2019-07,missed
172320,837888.0,,9322294,0,1.0,2019-07,missed
172321,837893.0,,9322294,9320707,1.0,2019-07,missed
172322,838255.0,,9323927,4623330,1.0,2019-07,missed


In [59]:
help_minus_bunp_result = help_minus_bunp.groupby(['month_at', '_merge'
                            ], as_index=False).agg({'discussion_id': 'count'})

# save
save_path = '../csv/tmp/help_minus_bunp_result.csv'
help_minus_bunp_result.to_csv(save_path, index=False, mode='w', header=True)


help_minus_bunp_result.tail()

Unnamed: 0,month_at,_merge,discussion_id
103,2019-05,missed,2505
104,2019-06,both,2048
105,2019-06,missed,2556
106,2019-07,both,195
107,2019-07,missed,513
