In [2]:
import pandas as pd
import matplotlib.pylab as plt
import statsmodels.api as sm
import numpy as np
import seaborn as sns
from scipy import stats
from datetime import datetime

### Raw 데이터 읽기

In [14]:
# 운영팀에서 관리하는 거래신고 접수 uid
read_path = '../csv/fraud_report_data_2019_2016.csv'
fraud_operation = pd.read_csv(read_path)

fraud_operation.columns = ['reported_date', 'product_name', 'price', 'reporter_uid', 'fraud_uid']

fraud_operation['block_cs'] = 1

# change type to str
fraud_operation['reporter_uid'] = fraud_operation['reporter_uid'].fillna(0).astype(int).astype(str)
fraud_operation['fraud_uid'] = fraud_operation['fraud_uid'].astype(str)

fraud_operation.tail()

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid,block_cs
15951,,갤럭시A7,125000,1370542,2667809,1
15952,,갤럭시노트fe,350000,3224068,4087003,1
15953,,아이폰5s,123000,4016158,4770542,1
15954,,핸드백,25000,938074,5335535,1
15955,,gtx1060 6g,250000,1890173,5699420,1


In [20]:
# 영구 제재 받은 uid list
read_path = '../csv/block_uid.csv'
fraud_uid_block = pd.read_csv(read_path)

fraud_uid_block['block_infinite'] = 1

# change type to str
fraud_uid_block['uid'] = fraud_uid_block['uid'].astype(str)

fraud_uid_block.tail()

Unnamed: 0,uid,block_infinite
143847,5834292,1
143848,5067006,1
143849,8755369,1
143850,5083812,1
143851,7961342,1


In [18]:
# 거래관련 상담센터 신고 list
read_path = '../csv/help_reporter_frauder.csv'
help_raw = pd.read_csv(read_path)

help_raw['fraud_help'] = 1

help_raw = help_raw.sort_values(['reporter_uid', 'fraud_uid', 'created_at'])  # sort again, ascending

help_raw = help_raw.drop_duplicates(['reporter_uid', 'fraud_uid'])  # drop duplicated row, keep 1st row

help_raw = help_raw.rename(columns={'created_at': 'created_at_help'})

# change type to str
for col in ['reporter_uid', 'fraud_uid']:
    
    df = help_raw
    
    df[col] = df[col].astype('str')
    

help_raw.tail()

Unnamed: 0,discussion_id,created_at_help,reporter_uid,fraud_uid,fraud_help
197015,838830,2019-07-05 07:56:30,9317104,0,1
196983,838715,2019-07-04 23:36:37,9317700,8767099,1
196782,837888,2019-07-03 21:12:13,9322294,0,1
196784,837893,2019-07-03 21:17:22,9322294,9320707,1
196876,838255,2019-07-04 13:55:10,9323927,4623330,1


In [30]:
help_raw['fraud_uid'].nunique()

103369

In [16]:
# 번프 전체 데이터
read_path = '../csv/bunp_history_all.csv'
bunp_raw = pd.read_csv(read_path)

bunp_raw = bunp_raw.rename(columns={'seller_pid_price': 'total_price'})

# 2017년 이후 번프 내역
bunp_raw_17 = bunp_raw

# make 'year' column
bunp_raw_17['year'] = bunp_raw_17['created_at'].astype(str).str[:4]
bunp_raw_17['year'] = bunp_raw_17['year'].astype(int)

bunp_raw_17 = bunp_raw_17[bunp_raw_17['year'] >= 2017]

# drop 'year' column
bunp_raw_17 = bunp_raw_17.drop(['year'], axis=1)

# change type to str
for col in ['buyer_uid', 'seller_uid']:
    
    df = bunp_raw_17
    
    df[col] = df[col].astype('str')

bunp_raw_17.tail()

Unnamed: 0,created_at,category,status,buyer_uid,seller_uid,total_price
14334118,2019-06-30 08:44:07,600400003.0,1,3164423,2941858,29900
14334119,2019-06-30 13:13:12,600700003.0,1,4658579,9264834,6000
14334120,2019-06-30 16:34:13,500113011.0,4,1296566,2775958,45000
14334121,2019-06-30 13:44:19,500119001.0,1,558255,6077583,20000
14334122,2019-06-30 12:58:22,800300001.0,1,9177490,4019525,64500


In [32]:
print(type(help_raw['created_at_help'][4]))

<class 'str'>


### Merge

In [19]:
# change type to str
for col in ['reporter_uid', 'fraud_uid']:
    
    df = help_raw
    
    df[col] = df[col].astype('str')

for col in ['buyer_uid', 'seller_uid']:
    
    df = bunp_raw_17
    
    df[col] = df[col].astype('str')

# merge
bunp_help = pd.merge(bunp_raw_17, help_raw, 
                            left_on=['buyer_uid', 'seller_uid'],
                            right_on=['reporter_uid', 'fraud_uid'],
                            how='left')


bunp_help['fraud_help'] = bunp_help['fraud_help'].fillna(0)

bunp_help['month_at'] = bunp_help['created_at'].astype(str).str[:7]
bunp_help = bunp_help.drop(['created_at', 'created_at_help', 'discussion_id', 'reporter_uid', 'fraud_uid'], axis=1)  # drop columns

bunp_help['price_range'] = (bunp_help['total_price'] / 100000).apply(np.floor) * 100000

bunp_help.tail()

Unnamed: 0,category,status,buyer_uid,seller_uid,total_price,fraud_help,month_at,price_range
14121676,600400003.0,1,3164423,2941858,29900,0.0,2019-06,0.0
14121677,600700003.0,1,4658579,9264834,6000,0.0,2019-06,0.0
14121678,500113011.0,4,1296566,2775958,45000,0.0,2019-06,0.0
14121679,500119001.0,1,558255,6077583,20000,0.0,2019-06,0.0
14121680,800300001.0,1,9177490,4019525,64500,0.0,2019-06,0.0


## black user와 white user의 이름 변경 횟수 T-Test

In [13]:
read_path = '../csv/changed_name_raw.csv'
fraud_name_raw = pd.read_csv(read_path)

# 2019년 7월 이름 변경 데이터 없애기
fraud_name_raw['change_name_date'] = pd.to_datetime(fraud_name_raw['change_name_date'])

fraud_name_raw['year'] = fraud_name_raw['change_name_date'].dt.year
fraud_name_raw['month'] = fraud_name_raw['change_name_date'].dt.month

fraud_name_raw = fraud_name_raw[(fraud_name_raw['year'] != 2019) |
                                (fraud_name_raw['month'] <= 6)]

fraud_name_raw = fraud_name_raw.drop(['year', 'month'], axis=1)


fraud_name_raw.tail()

Unnamed: 0,id,change_name_date,join_date
10859511,9300440,NaT,2019-06-29 23:59:06
10859512,9300441,NaT,2019-06-29 23:59:22
10859513,9300442,NaT,2019-06-29 23:59:31
10859514,9300444,NaT,2019-06-29 23:59:52
10859515,9300445,NaT,2019-06-30 00:00:00


In [21]:
fraud_name = fraud_name_raw.groupby(['uid'], as_index=False).agg({'date': 'count'})

fraud_name = fraud_name.rename(columns={'date': 'changed'})
fraud_name['uid'] = fraud_name['uid'].astype(str)

fraud_name.tail()

Unnamed: 0,uid,changed
4099311,9325552,1
4099312,9325572,1
4099313,9325578,1
4099314,9325579,1
4099315,9325602,1


In [6]:
fraud_name_uid = pd.merge(fraud_name, fraud_uid, on=['uid'], how='left')

fraud_name_uid = fraud_name_uid.drop(['black_source'], axis=1).fillna(0)

fraud_name_uid.tail()

Unnamed: 0,uid,changed,help,block,block_by_operation
4102690,9325552,1,0.0,0.0,0.0
4102691,9325572,1,0.0,0.0,0.0
4102692,9325578,1,0.0,0.0,0.0
4102693,9325579,1,0.0,0.0,0.0
4102694,9325602,1,0.0,0.0,0.0


### help, 상담센터 신고된 유저

In [7]:
fraud_name_uid_normal = fraud_name_uid[fraud_name_uid['help'] == 0]
fraud_name_uid_black = fraud_name_uid[fraud_name_uid['help'] == 1]

In [13]:
# 샘플 수 비교
np.size(fraud_name_uid_normal), np.size(fraud_name_uid_black)

(20022460, 491015)

In [14]:
# 평균 비교
np.mean(fraud_name_uid_normal['changed']), np.mean(fraud_name_uid_black['changed'])

(1.3953342895927874, 2.4676741036424548)

In [15]:
# 등분산 검정
leveneTest = stats.levene(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'])

leveneTest

LeveneResult(statistic=66411.07416078707, pvalue=0.0)

In [16]:
# T-test
tTestResult = stats.ttest_ind(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-124.22704575665718, pvalue=0.0)

### block, 영구 제재된 유저

In [17]:
fraud_name_uid_normal = fraud_name_uid[fraud_name_uid['block'] == 0]
fraud_name_uid_black = fraud_name_uid[fraud_name_uid['block'] == 1]

In [18]:
# 샘플 수 비교
np.size(fraud_name_uid_normal), np.size(fraud_name_uid_black)

(20218730, 294745)

In [19]:
# 평균 비교
np.mean(fraud_name_uid_normal['changed']), np.mean(fraud_name_uid_black['changed'])

(1.4138885083286636, 1.9089721623776486)

In [20]:
# 등분산 검정
leveneTest = stats.levene(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'])

leveneTest

LeveneResult(statistic=9065.690648394393, pvalue=0.0)

In [21]:
# T-test
tTestResult = stats.ttest_ind(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-63.14315728134901, pvalue=0.0)

### operation, 운영팀에서 처리한 유저

In [22]:
fraud_name_uid_normal = fraud_name_uid[fraud_name_uid['block_by_operation'] == 0]
fraud_name_uid_black = fraud_name_uid[fraud_name_uid['block_by_operation'] == 1]

In [23]:
# 샘플 수 비교
np.size(fraud_name_uid_normal), np.size(fraud_name_uid_black)

(20446575, 66900)

In [24]:
# 평균 비교
np.mean(fraud_name_uid_normal['changed']), np.mean(fraud_name_uid_black['changed'])

(1.4194663409397417, 1.8903587443946188)

In [25]:
# 등분산 검정
leveneTest = stats.levene(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'])

leveneTest

LeveneResult(statistic=1879.203176330235, pvalue=0.0)

In [26]:
# T-test
tTestResult = stats.ttest_ind(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-29.09745119728514, pvalue=1.471915339519034e-180)

## black user와 white user의 휴대폰 변경 횟수 T-Test

In [27]:
read_path = '../csv/changed_phone_raw.csv'
fraud_phone_raw = pd.read_csv(read_path)

fraud_phone_raw.tail()

Unnamed: 0,date,uid
434520,2019-07-04 11:54:52,5000124
434521,2019-07-04 11:59:15,9245231
434522,2019-07-04 11:59:15,7375046
434523,2019-07-04 11:59:45,9325895
434524,2019-07-04 11:59:45,5000124


In [28]:
fraud_phone = fraud_phone_raw.groupby(['uid'], as_index=False).agg({'date': 'count'})

fraud_phone = fraud_phone.rename(columns={'date': 'changed'})
fraud_phone['uid'] = fraud_phone['uid'].astype(str)

fraud_phone.tail()

Unnamed: 0,uid,changed
351970,9325547,1
351971,9325556,1
351972,9325757,1
351973,9325864,1
351974,9325895,1


In [29]:
fraud_phone_uid = pd.merge(fraud_phone, fraud_uid, on=['uid'], how='left')

fraud_phone_uid = fraud_phone_uid.drop(['black_source'], axis=1).fillna(0)

fraud_phone_uid.tail()

Unnamed: 0,uid,changed,help,block,block_by_operation
352755,9325547,1,0.0,0.0,0.0
352756,9325556,1,0.0,0.0,0.0
352757,9325757,1,0.0,0.0,0.0
352758,9325864,1,0.0,0.0,0.0
352759,9325895,1,0.0,0.0,0.0


### help, 상담센터 신고된 유저

In [30]:
fraud_phone_uid_normal = fraud_phone_uid[fraud_phone_uid['help'] == 0]
fraud_phone_uid_black = fraud_phone_uid[fraud_phone_uid['help'] == 1]

In [31]:
# 샘플 수 비교
np.size(fraud_phone_uid_normal), np.size(fraud_phone_uid_black)

(1671050, 92750)

In [32]:
# 평균 비교
np.mean(fraud_phone_uid_normal['changed']), np.mean(fraud_phone_uid_black['changed'])

(1.219724125549804, 1.5118059299191375)

In [33]:
# 등분산 검정
leveneTest = stats.levene(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'])

leveneTest

LeveneResult(statistic=2796.9433308772304, pvalue=0.0)

In [34]:
# T-test
tTestResult = stats.ttest_ind(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-32.8589930646798, pvalue=1.9010494159961004e-230)

### block, 영구 제재된 유저

In [35]:
fraud_phone_uid_normal = fraud_phone_uid[fraud_phone_uid['block'] == 0]
fraud_phone_uid_black = fraud_phone_uid[fraud_phone_uid['block'] == 1]

In [36]:
# 샘플 수 비교
np.size(fraud_phone_uid_normal), np.size(fraud_phone_uid_black)

(1718630, 45170)

In [37]:
# 평균 비교
np.mean(fraud_phone_uid_normal['changed']), np.mean(fraud_phone_uid_black['changed'])

(1.2290283539796234, 1.4654638034093426)

In [38]:
# 등분산 검정
leveneTest = stats.levene(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'])

leveneTest

LeveneResult(statistic=913.1073165499738, pvalue=2.5073199830925078e-200)

In [39]:
# T-test
tTestResult = stats.ttest_ind(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-23.524199133055664, pvalue=6.451233280996861e-119)

### operation, 운영팀에서 처리한 유저

In [40]:
fraud_phone_uid_normal = fraud_phone_uid[fraud_phone_uid['block_by_operation'] == 0]
fraud_phone_uid_black = fraud_phone_uid[fraud_phone_uid['block_by_operation'] == 1]

In [41]:
# 샘플 수 비교
np.size(fraud_phone_uid_normal), np.size(fraud_phone_uid_black)

(1750060, 13740)

In [42]:
# 평균 비교
np.mean(fraud_phone_uid_normal['changed']), np.mean(fraud_phone_uid_black['changed'])

(1.2333234289110087, 1.4592430858806404)

In [43]:
# 등분산 검정
leveneTest = stats.levene(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'])

leveneTest

LeveneResult(statistic=257.75457128903776, pvalue=5.553551371173039e-58)

In [44]:
# T-test
tTestResult = stats.ttest_ind(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-12.649150895428923, pvalue=1.073359514808459e-35)

## black user와 white user의 리뷰 작성 횟수 T-Test

In [45]:
read_path = '../csv/review_writer.csv'
fraud_review_writer_raw = pd.read_csv(read_path)

fraud_review_writer_raw.tail()

Unnamed: 0,time,writer_uid
5902957,2019-07-04 12:22:48,6439210
5902958,2019-07-04 12:23:38,4204619
5902959,2019-07-04 12:23:42,8994648
5902960,2019-07-04 12:23:47,1045672
5902961,2019-07-04 12:27:13,7810960


In [46]:
fraud_review_writer = fraud_review_writer_raw.groupby(['writer_uid'], as_index=False).agg({'time': 'count'})

fraud_review_writer = fraud_review_writer.rename(columns={'time': 'writed', 'writer_uid': 'uid'})
fraud_review_writer['uid'] = fraud_review_writer['uid'].astype(str)

fraud_review_writer.tail()

Unnamed: 0,uid,writed
1110787,9316109,1
1110788,9320668,1
1110789,9321110,1
1110790,9321727,1
1110791,9322225,1


In [47]:
fraud_review_writer_uid = pd.merge(fraud_review_writer, fraud_uid, on=['uid'], how='left')

fraud_review_writer_uid = fraud_review_writer_uid.drop(['black_source'], axis=1).fillna(0)

fraud_review_writer_uid.tail()

Unnamed: 0,uid,writed,help,block,block_by_operation
1112416,9316109,1,0.0,0.0,0.0
1112417,9320668,1,0.0,0.0,0.0
1112418,9321110,1,0.0,0.0,0.0
1112419,9321727,1,0.0,0.0,0.0
1112420,9322225,1,0.0,0.0,0.0


### help, 상담센터 신고된 유저

In [48]:
fraud_review_writer_uid_normal = fraud_review_writer_uid[fraud_review_writer_uid['help'] == 0]
fraud_review_writer_uid_black = fraud_review_writer_uid[fraud_review_writer_uid['help'] == 1]

In [49]:
# 샘플 수 비교
np.size(fraud_review_writer_uid_normal), np.size(fraud_review_writer_uid_black)

(5238555, 323550)

In [50]:
# 평균 비교
np.mean(fraud_review_writer_uid_normal['writed']), np.mean(fraud_review_writer_uid_black['writed'])

(4.822710652078674, 13.474748879616751)

In [51]:
# 등분산 검정
leveneTest = stats.levene(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'])

leveneTest

LeveneResult(statistic=15048.115826535262, pvalue=0.0)

In [52]:
# T-test
tTestResult = stats.ttest_ind(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-54.20249604555359, pvalue=0.0)

### block, 영구 제재된 유저

In [53]:
fraud_review_writer_uid_normal = fraud_review_writer_uid[fraud_review_writer_uid['block'] == 0]
fraud_review_writer_uid_black = fraud_review_writer_uid[fraud_review_writer_uid['block'] == 1]

In [54]:
# 샘플 수 비교
np.size(fraud_review_writer_uid_normal), np.size(fraud_review_writer_uid_black)

(5396740, 165365)

In [55]:
# 평균 비교
np.mean(fraud_review_writer_uid_normal['writed']), np.mean(fraud_review_writer_uid_black['writed'])

(5.2216662281303154, 8.731079732712484)

In [56]:
# 등분산 검정
leveneTest = stats.levene(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'])

leveneTest

LeveneResult(statistic=1203.5252309507368, pvalue=1.4476874595009652e-263)

In [58]:
# T-test
tTestResult = stats.ttest_ind(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-28.92924257275353, pvalue=8.06609605512334e-182)

### operation, 운영팀에서 처리한 유저

In [59]:
fraud_review_writer_uid_normal = fraud_review_writer_uid[fraud_review_writer_uid['block_by_operation'] == 0]
fraud_review_writer_uid_black = fraud_review_writer_uid[fraud_review_writer_uid['block_by_operation'] == 1]

In [60]:
# 샘플 수 비교
np.size(fraud_review_writer_uid_normal), np.size(fraud_review_writer_uid_black)

(5530980, 31125)

In [61]:
# 평균 비교
np.mean(fraud_review_writer_uid_normal['writed']), np.mean(fraud_review_writer_uid_black['writed'])

(5.298332302774553, 10.243212851405623)

In [62]:
# 등분산 검정
leveneTest = stats.levene(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'])

leveneTest

LeveneResult(statistic=502.5365131726051, pvalue=2.8236969570967546e-111)

In [63]:
# T-test
tTestResult = stats.ttest_ind(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'], equal_var=False)

tTestResult

Ttest_indResult(statistic=-15.137973808885832, pvalue=7.196922087061975e-51)