In [1]:
import pandas as pd
import matplotlib.pylab as plt
import statsmodels.api as sm
import numpy as np
import seaborn as sns
from scipy import stats

### Raw 데이터 읽기

In [2]:
# 카테고리
read_path = '../csv/categories.csv'
df_category = pd.read_csv(read_path)

df_category = df_category[['category', 'name']]
df_category['category'] = df_category['category'].astype(str)

df_category.tail()

Unnamed: 0,category,name
1219,240060020,프로그래머
1220,240070040,설문조사
1221,240080060,유아유치원
1222,240100010,매장관리
1223,240100090,기타 업종


In [3]:
# 번프 전체 데이터
read_path = '../csv/bunp_history_all.csv'
bunp_raw = pd.read_csv(read_path)

bunp_raw = bunp_raw.rename(columns={'seller_pid_price': 'total_price'})


# 2017년 이후 번프 내역
bunp_raw_17 = bunp_raw

# make 'year' column
bunp_raw_17['year'] = bunp_raw_17['created_at'].astype(str).str[:4]
bunp_raw_17['year'] = bunp_raw_17['year'].astype(int)

bunp_raw_17 = bunp_raw_17[bunp_raw_17['year'] >= 2017]

# drop 'year' column
bunp_raw_17 = bunp_raw_17.drop(['year'], axis=1)

# change type to str
for col in ['buyer_uid', 'seller_uid']:
    
    df = bunp_raw_17
    
    df[col] = df[col].astype('str')

bunp_raw_17.tail()

Unnamed: 0,created_at,category,status,buyer_uid,seller_uid,total_price
14334118,2019-06-30 08:44:07,600400003.0,1,3164423,2941858,29900
14334119,2019-06-30 13:13:12,600700003.0,1,4658579,9264834,6000
14334120,2019-06-30 16:34:13,500113011.0,4,1296566,2775958,45000
14334121,2019-06-30 13:44:19,500119001.0,1,558255,6077583,20000
14334122,2019-06-30 12:58:22,800300001.0,1,9177490,4019525,64500


In [4]:
# 거래관련 상담센터 신고 list
read_path = '../csv/help_reporter_frauder.csv'
help_raw = pd.read_csv(read_path)

help_raw['fraud_help'] = 1

help_raw = help_raw.sort_values(['reporter_uid', 'fraud_uid', 'created_at'])  # sort again, ascending

help_raw = help_raw.drop_duplicates(['reporter_uid', 'fraud_uid'])  # drop duplicated row, keep 1st row

help_raw = help_raw.rename(columns={'created_at': 'created_at_help'})

help_raw['month_at'] = help_raw['created_at_help'].astype(str).str[:7]

# change type to str
for col in ['reporter_uid', 'fraud_uid']:
    
    df = help_raw
    
    df[col] = df[col].astype('str')
    

help_raw.tail()

Unnamed: 0,discussion_id,created_at_help,reporter_uid,fraud_uid,fraud_help,month_at
197015,838830,2019-07-05 07:56:30,9317104,0,1,2019-07
196983,838715,2019-07-04 23:36:37,9317700,8767099,1,2019-07
196782,837888,2019-07-03 21:12:13,9322294,0,1,2019-07
196784,837893,2019-07-03 21:17:22,9322294,9320707,1,2019-07
196876,838255,2019-07-04 13:55:10,9323927,4623330,1,2019-07


### Merge

In [24]:
# merge
bunp_help = pd.merge(bunp_raw_17, help_raw, 
                            left_on=['buyer_uid', 'seller_uid'],
                            right_on=['reporter_uid', 'fraud_uid'],
                            how='left')


bunp_help['fraud_help'] = bunp_help['fraud_help'].fillna(0)

bunp_help['month_at'] = bunp_help['created_at'].astype(str).str[:7]
bunp_help = bunp_help.drop(['created_at', 'created_at_help', 'discussion_id', 'reporter_uid', 'fraud_uid'], axis=1)  # drop columns

# 1-depth 카테고리 붙이기
bunp_help['category_3'] = bunp_help['category'].astype(str).str[:3]

bunp_help = pd.merge(bunp_help, df_category, left_on='category_3', right_on='category', how='left')

bunp_help = bunp_help.drop(['category_x', 'category_y'], axis=1)  # drop columns

# 10만원 단위 구간
bunp_help['price_range'] = (bunp_help['total_price'] / 100000).apply(np.floor) * 100000

# 5만원 이하
bunp_help['less_50,000'] = bunp_help['total_price'].apply(lambda x: 1 if x <= 50000 else 0)

# 3만원 이하
bunp_help['less_30,000'] = bunp_help['total_price'].apply(lambda x: 1 if x <= 30000 else 0)

bunp_help.tail()

Unnamed: 0,status,buyer_uid,seller_uid,total_price,fraud_help,month_at,year,month,category_3,name,price_range,"less_50,000","less_30,000"
14121676,1,3164423,2941858,29900,0.0,2019-06,,,600,디지털/가전,0.0,1,1
14121677,1,4658579,9264834,6000,0.0,2019-06,,,600,디지털/가전,0.0,1,1
14121678,4,1296566,2775958,45000,0.0,2019-06,,,500,유아동/출산,0.0,1,0
14121679,1,558255,6077583,20000,0.0,2019-06,,,500,유아동/출산,0.0,1,1
14121680,1,9177490,4019525,64500,0.0,2019-06,,,800,생활/문구/가구/식품,0.0,0,0


### Aggregate

In [26]:
bunp_help_result = bunp_help.groupby(['month_at', 'category_3', 'name', 'fraud_help', 'price_range', 
                                      'less_50,000', 'less_30,000'
                            ], as_index=False).agg({'total_price': 'sum', 'buyer_uid': 'count'})

bunp_help_result = bunp_help_result.rename(columns={'total_price': 'price_sum', 'buyer_uid': 'trx_count'})

# save
save_path = '../csv/result/bunp_help_result.csv'
bunp_help_result.to_csv(save_path, index=False, mode='w', header=True)

bunp_help_result.tail()

Unnamed: 0,month_at,category_3,name,fraud_help,price_range,"less_50,000","less_30,000",price_sum,trx_count
16968,2019-06,999,기타,1.0,100000.0,0,0,765000,6
16969,2019-06,999,기타,1.0,200000.0,0,0,600000,3
16970,2019-06,999,기타,1.0,400000.0,0,0,490000,1
16971,2019-06,999,기타,1.0,500000.0,0,0,1550000,3
16972,2019-06,999,기타,1.0,800000.0,0,0,850000,1


### 상담센터에 신고된 건 중 번프에 매칭되는 비율

In [27]:
# 차집합 구하기
help_minus_bunp = pd.merge(bunp_raw_17, help_raw, 
                            left_on=['buyer_uid', 'seller_uid'],
                            right_on=['reporter_uid', 'fraud_uid'],
                            how='outer', indicator=True)

help_minus_bunp = help_minus_bunp[(help_minus_bunp['_merge']=='both') |
                                  (help_minus_bunp['_merge']=='right_only')]

help_minus_bunp = help_minus_bunp.reset_index()[['discussion_id', 
                                                 'created_at', 
                                                 'reporter_uid', 
                                                 'fraud_uid', 
                                                 'fraud_help', 
                                                 'month_at', 
                                                 'total_price',
                                                 '_merge']] 

# 5만원 이하
help_minus_bunp['less_50,000'] = help_minus_bunp['total_price'].apply(lambda x: 1 if x <= 50000 else 0)

# 3만원 이하
help_minus_bunp['less_30,000'] = help_minus_bunp['total_price'].apply(lambda x: 1 if x <= 30000 else 0)

# replace_values
replace_values = {'botn': 'merged', 'right_only': 'missed'}
help_minus_bunp = help_minus_bunp.replace({'_merge' : replace_values})

help_minus_bunp.tail()

Unnamed: 0,discussion_id,created_at,reporter_uid,fraud_uid,fraud_help,month_at,total_price,_merge,"less_50,000","less_30,000"
172318,838830.0,,9317104,0,1.0,2019-07,,missed,0,0
172319,838715.0,,9317700,8767099,1.0,2019-07,,missed,0,0
172320,837888.0,,9322294,0,1.0,2019-07,,missed,0,0
172321,837893.0,,9322294,9320707,1.0,2019-07,,missed,0,0
172322,838255.0,,9323927,4623330,1.0,2019-07,,missed,0,0


In [28]:
help_minus_bunp_result = help_minus_bunp.groupby(['month_at', '_merge', 'less_50,000', 'less_30,000'
                            ], as_index=False).agg({'discussion_id': 'count'})

help_minus_bunp_result = help_minus_bunp_result.rename(columns={'discussion_id': 'help_count'})

# save
save_path = '../csv/result/help_minus_bunp_result.csv'
help_minus_bunp_result.to_csv(save_path, index=False, mode='w', header=True)


help_minus_bunp_result.tail()

Unnamed: 0,month_at,_merge,"less_50,000","less_30,000",help_count
177,2019-06,missed,0,0,2556
178,2019-07,both,0,0,95
179,2019-07,both,1,0,31
180,2019-07,both,1,1,69
181,2019-07,missed,0,0,513


In [13]:
help_minus_bunp[help_minus_bunp['less_50,000'] == 1].tail(10)

Unnamed: 0,discussion_id,created_at,reporter_uid,fraud_uid,fraud_help,month_at,total_price,_merge,"less_50,000"
51787,821467.0,2019-06-09 15:40:01,3130678,5023750,1.0,2019-06,38000.0,both,1
51794,831196.0,2019-06-24 01:53:13,9079257,711402,1.0,2019-06,28000.0,both,1
51801,831357.0,2019-06-20 18:20:32,6699172,384259,1.0,2019-06,19000.0,both,1
51802,831357.0,2019-06-20 18:20:49,6699172,384259,1.0,2019-06,22000.0,both,1
51803,833355.0,2019-06-21 00:38:02,176271,7843725,1.0,2019-06,2500.0,both,1
51804,834253.0,2019-06-21 11:31:54,3719900,7188891,1.0,2019-06,7000.0,both,1
51805,832951.0,2019-06-22 16:34:06,4100416,1827202,1.0,2019-06,10000.0,both,1
51807,837942.0,2019-06-26 19:03:34,1868680,6835747,1.0,2019-07,23000.0,both,1
51808,834967.0,2019-06-28 08:31:20,9127162,2870085,1.0,2019-06,30000.0,both,1
51809,834956.0,2019-06-29 14:05:06,4853073,6483740,1.0,2019-06,40000.0,both,1


### 상담센터에 신고된 유저 중 영구 제재된 유저 비율

In [17]:
# 영구 제재 받은 uid list
read_path = '../csv/block_uid.csv'
fraud_uid_block = pd.read_csv(read_path)

fraud_uid_block['block_infinite'] = 1

# change type to str
fraud_uid_block['uid'] = fraud_uid_block['uid'].astype(str)

fraud_uid_block.tail()

Unnamed: 0,uid,block_infinite
143847,5834292,1
143848,5067006,1
143849,8755369,1
143850,5083812,1
143851,7961342,1


In [18]:
# 2017년 1월 ~ 2019년 6월 상담센터 신고 내역
help_raw_17 = help_raw


help_raw_17['created_at_help'] = pd.to_datetime(help_raw_17['created_at_help'])

help_raw_17['year'] = help_raw_17['created_at_help'].dt.year
help_raw_17['month'] = help_raw_17['created_at_help'].dt.month

help_raw_17 = help_raw_17[help_raw_17['year'] >= 2017]

help_raw_17 = help_raw_17[(help_raw_17['year'] != 2019) | (help_raw_17['month'] <= 6)]

help_raw_17 = help_raw_17.drop(['year', 'month'], axis=1)

help_raw_17.tail()

Unnamed: 0,discussion_id,created_at_help,reporter_uid,fraud_uid,fraud_help,month_at
195988,834704,2019-06-28 23:35:47,9281092,7344857,1,2019-06
195764,833830,2019-06-27 18:26:33,9282335,3893252,1,2019-06
196064,834966,2019-06-29 14:45:24,9283571,7165014,1,2019-06
196052,834921,2019-06-29 13:32:34,9290388,undefined,1,2019-06
196214,835513,2019-06-30 17:59:49,9300550,8733407,1,2019-06


In [20]:
help_block = pd.merge(help_raw_17, fraud_uid_block, 
                     left_on='fraud_uid', right_on='uid',
                     how='left')

help_block['block_infinite'] = help_block['block_infinite'].fillna(0)

help_block = help_block[['fraud_uid', 'block_infinite']]

help_block = help_block.sort_values(['fraud_uid', 'block_infinite'])  # sort again, ascending

help_block = help_block.drop_duplicates(['fraud_uid', 'block_infinite'])  # drop duplicated row, keep 1st row

help_block.tail()

Unnamed: 0,fraud_uid,block_infinite
34676,999450,1.0
17237,999606,1.0
48732,999920,0.0
93495,999935,0.0
600,undefined,0.0


In [22]:
help_block_result = help_block.groupby(['block_infinite'], as_index=False).agg({'fraud_uid': 'count'})

help_block_result = help_block_result.rename(columns={'fraud_uid': 'fraud_count'})

# save
save_path = '../csv/result/help_block_result.csv'
help_block_result.to_csv(save_path, index=False, mode='w', header=True)


help_block_result.tail()

Unnamed: 0,block_infinite,fraud_count
0,0.0,34606
1,1.0,22456
