In [13]:
import pandas as pd
import matplotlib.pylab as plt
import statsmodels.api as sm
import numpy as np
import seaborn as sns
from scipy import stats

## 사기 혐의자 데이터 읽기

In [14]:
# 상담센터에 거래 관련 신고된 uid
read_path = '../csv/help_uid.csv'
fraud_uid_help = pd.read_csv(read_path)

fraud_uid_help = fraud_uid_help.rename(columns={'value': 'uid'})

fraud_uid_help['uid'] = fraud_uid_help['uid'].astype(str)

fraud_uid_help['help'] = 1

fraud_uid_help.tail()

Unnamed: 0,uid,help
103113,1475933,1
103114,9313382,1
103115,9302282,1
103116,6223410,1
103117,9241528,1


In [15]:
# 영구 제재 받은 uid
read_path = '../csv/block_uid.csv'
fraud_uid_block = pd.read_csv(read_path)

fraud_uid_block['block'] = 1

fraud_uid_block['uid'] = fraud_uid_block['uid'].astype(str)

fraud_uid_block.tail()

Unnamed: 0,uid,block
143847,5834292,1
143848,5067006,1
143849,8755369,1
143850,5083812,1
143851,7961342,1


In [16]:
# 운영팀에서 관리하는 거래신고 접수 uid
read_path = '../csv/fraud_report_data_2019_2016.csv'
fraud_operation = pd.read_csv(read_path)

fraud_operation.columns = ['reported_date', 'product_name', 'price', 'reporter_uid', 'fraud_uid']

fraud_operation['block_by_operation'] = 1

fraud_operation.tail()

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid,block_by_operation
15951,,갤럭시A7,125000,1370542.0,2667809,1
15952,,갤럭시노트fe,350000,3224068.0,4087003,1
15953,,아이폰5s,123000,4016158.0,4770542,1
15954,,핸드백,25000,938074.0,5335535,1
15955,,gtx1060 6g,250000,1890173.0,5699420,1


### 사기 혐의자 데이터 합치기

In [17]:
fraud_uid_operation = fraud_operation[['fraud_uid', 'block_by_operation']]

fraud_uid_operation = fraud_uid_operation.rename(columns={'fraud_uid': 'uid'})

fraud_uid_operation['uid'] = fraud_uid_operation['uid'].astype(str)

fraud_uid_operation.tail()

Unnamed: 0,uid,block_by_operation
15951,2667809,1
15952,4087003,1
15953,4770542,1
15954,5335535,1
15955,5699420,1


In [18]:
fraud_uid_1 = pd.merge(fraud_uid_help, fraud_uid_block, on='uid', how='outer')

fraud_uid = pd.merge(fraud_uid_1, fraud_uid_operation, on='uid', how='outer')

fraud_uid = fraud_uid.fillna(0)

fraud_uid.tail()

Unnamed: 0,uid,help,block,block_by_operation
212508,6534153,0.0,0.0,1.0
212509,2610325,0.0,0.0,1.0
212510,3927135,0.0,0.0,1.0
212511,1806746,0.0,0.0,1.0
212512,8970553,0.0,0.0,1.0


In [19]:
def merge_block(row):
    if row['help'] == 1 and row['block'] == 1 and row['block_by_operation'] == 1:
        return 'help&block&operation'
    elif row['help'] == 1 and row['block'] == 1:
        return 'help&block'
    elif row['block'] == 1 and row['block_by_operation'] == 1:
        return 'block&operation'
    elif row['help'] == 1 and row['block_by_operation'] == 1:
        return 'help&operation'
    elif row['help'] == 1:
        return 'help'
    elif row['block'] == 1:
        return 'block'
    elif row['block_by_operation'] == 1:
        return 'operation'
    else:
        return 'missing'
    
fraud_uid['black_source'] = fraud_uid.apply(merge_block, axis=1)

fraud_uid.tail()

Unnamed: 0,uid,help,block,block_by_operation,black_source
212508,6534153,0.0,0.0,1.0,operation
212509,2610325,0.0,0.0,1.0,operation
212510,3927135,0.0,0.0,1.0,operation
212511,1806746,0.0,0.0,1.0,operation
212512,8970553,0.0,0.0,1.0,operation


In [20]:
fraud_uid.groupby(['black_source'], as_index=False).agg({'uid':'count'})

Unnamed: 0,black_source,uid
0,block,103892
1,block&operation,826
2,help,63073
3,help&block,29592
4,help&block&operation,12952
5,help&operation,1005
6,operation,1173


## 거래 데이터 읽기

In [21]:
read_path = '../csv/pay_history.csv'
pay_raw = pd.read_csv(read_path)

pay_raw = pay_raw.rename(columns={'buyer_id': 'buyer_uid', 'seller_id': 'seller_uid', 'deposit_done_date': 'updated_at'})

pay_raw.tail()

Unnamed: 0,updated_at,buyer_uid,seller_uid,total_price
90146,2019-01-25 20:32:45,4161761,4414597,148170
90147,2019-02-01 20:31:30,603779,5071323,31450
90148,2019-01-25 20:32:45,8069149,7742258,47350
90149,2019-01-25 20:32:45,31040,2459314,143100
90150,2019-01-25 20:32:45,7526483,7566393,2576000


In [210]:
read_path = '../csv/bunp_history.csv'
bunp_raw = pd.read_csv(read_path)

bunp_raw = bunp_raw.rename(columns={'seller_pid_price': 'total_price'})

bunp_raw.tail()

Unnamed: 0,updated_at,category,buyer_uid,seller_uid,total_price
10933687,2019-07-02 09:22:43,600700002,428300,9139471,20000
10933688,2019-07-02 10:07:24,900220999,4583984,3727301,24000
10933689,2019-07-02 11:42:04,700100100,9303879,5499956,80000
10933690,2019-07-02 13:10:52,400010100,8693877,5293606,38000
10933691,2019-07-02 13:15:25,600700999,5696564,9138584,70000


In [23]:
transaction = pd.concat([bunp_raw, pay_raw], axis=0).reset_index()

transaction['seller_uid'] = transaction['seller_uid'].astype(str)

transaction.tail()

Unnamed: 0,index,updated_at,buyer_uid,seller_uid,total_price
11023854,90146,2019-01-25 20:32:45,4161761,4414597,148170
11023855,90147,2019-02-01 20:31:30,603779,5071323,31450
11023856,90148,2019-01-25 20:32:45,8069149,7742258,47350
11023857,90149,2019-01-25 20:32:45,31040,2459314,143100
11023858,90150,2019-01-25 20:32:45,7526483,7566393,2576000


In [24]:
transaction_black = pd.merge(transaction, fraud_uid[['uid', 'black_source']], 
                             left_on=['seller_uid'], right_on=['uid'], how='left')

transaction_black.tail()

Unnamed: 0,index,updated_at,buyer_uid,seller_uid,total_price,uid,black_source
11179561,90146,2019-01-25 20:32:45,4161761,4414597,148170,4414597.0,help
11179562,90147,2019-02-01 20:31:30,603779,5071323,31450,,
11179563,90148,2019-01-25 20:32:45,8069149,7742258,47350,,
11179564,90149,2019-01-25 20:32:45,31040,2459314,143100,,
11179565,90150,2019-01-25 20:32:45,7526483,7566393,2576000,,


In [199]:
# 카테고리

read_path = '../csv/categories.csv'
df_category = pd.read_csv(read_path)

df_category['category'] = df_category['category'].astype(str)

df_category = df_category[['category', 'name']]

df_category.tail()

Unnamed: 0,category,name
1219,240060020,프로그래머
1220,240070040,설문조사
1221,240080060,유아유치원
1222,240100010,매장관리
1223,240100090,기타 업종


### 전체 번프 대비 사기 유형별 금액, 건 수 비율

In [212]:
bunp_raw['category_3'] = bunp_raw['category'].astype(str).str[:3]

In [213]:
bunp_raw['seller_uid'] = bunp_raw['seller_uid'].astype(str)

bunp_fraud_1 = pd.merge(bunp_raw, fraud_uid, 
                             left_on=['seller_uid'], right_on=['uid'], how='left')

bunp_fraud_1['category'] = bunp_fraud_1['category'].astype(str)

bunp_fraud = pd.merge(bunp_fraud_1, df_category, left_on='category_3', right_on='category', how='left')

bunp_fraud = bunp_fraud.fillna(0).drop(['uid'], axis=1)

bunp_fraud['month_at'] = bunp_fraud['updated_at'].astype(str).str[:7]

bunp_fraud.tail()

Unnamed: 0,updated_at,category_x,buyer_uid,seller_uid,total_price,category_3,help,block,block_by_operation,black_source,category_y,name,month_at
11088665,2019-07-02 09:22:43,600700002,428300,9139471,20000,600,0.0,0.0,0.0,0,600,디지털/가전,2019-07
11088666,2019-07-02 10:07:24,900220999,4583984,3727301,24000,900,0.0,0.0,0.0,0,900,도서/티켓/취미/애완,2019-07
11088667,2019-07-02 11:42:04,700100100,9303879,5499956,80000,700,0.0,0.0,0.0,0,700,스포츠/레저,2019-07
11088668,2019-07-02 13:10:52,400010100,8693877,5293606,38000,400,1.0,0.0,0.0,help,400,패션잡화,2019-07
11088669,2019-07-02 13:15:25,600700999,5696564,9138584,70000,600,1.0,1.0,0.0,help&block,600,디지털/가전,2019-07


In [214]:
bunp_fraud_agg = bunp_fraud.groupby(['month_at', 'name','black_source'], as_index=False).agg({'total_price': 'sum', 'updated_at': 'count'})

# save
save_path = '../csv/tmp/bunp_fraud_agg.csv'
bunp_fraud_agg.to_csv(save_path, index=False, mode='w', header=True)

bunp_fraud_agg.tail()

Unnamed: 0,month_at,name,black_source,total_price,updated_at
3481,2019-07,패션잡화,help,54521212,564
3482,2019-07,패션잡화,help&block,3746500,19
3483,2019-07,패션잡화,help&block&operation,97000,4
3484,2019-07,패션잡화,help&operation,712504,11
3485,2019-07,패션잡화,operation,50000,1


## black user와 white user의 이름 변경 횟수 T-Test

In [135]:
read_path = '../csv/changed_name_raw.csv'
fraud_name_raw = pd.read_csv(read_path)

fraud_name_raw.tail()

Unnamed: 0,date,uid
5823231,2019-07-04 10:53:31,9314176
5823232,2019-07-04 10:53:40,6263122
5823233,2019-07-04 10:53:44,6590985
5823234,2019-07-04 10:54:22,9325264
5823235,2019-07-04 10:54:29,3305677


In [136]:
fraud_name = fraud_name_raw.groupby(['uid'], as_index=False).agg({'date': 'count'})

fraud_name = fraud_name.rename(columns={'date': 'changed'})
fraud_name['uid'] = fraud_name['uid'].astype(str)

fraud_name.tail()

Unnamed: 0,uid,changed
4099311,9325552,1
4099312,9325572,1
4099313,9325578,1
4099314,9325579,1
4099315,9325602,1


In [137]:
fraud_name_uid = pd.merge(fraud_name, fraud_uid, on=['uid'], how='left')

fraud_name_uid = fraud_name_uid.drop(['black_source'], axis=1).fillna(0)

fraud_name_uid.tail()

Unnamed: 0,uid,changed,help,block,block_by_operation
4102893,9325552,1,0.0,0.0,0.0
4102894,9325572,1,0.0,0.0,0.0
4102895,9325578,1,0.0,0.0,0.0
4102896,9325579,1,0.0,0.0,0.0
4102897,9325602,1,0.0,0.0,0.0


### help, 상담센터 신고된 유저

In [143]:
fraud_name_uid_normal = fraud_name_uid[fraud_name_uid['help'] == 0]
fraud_name_uid_black = fraud_name_uid[fraud_name_uid['help'] == 1]

In [144]:
# 샘플 수 비교
np.size(fraud_name_uid_normal), np.size(fraud_name_uid_black)

(20023475, 491015)

In [145]:
# 평균 비교
np.mean(fraud_name_uid_normal['changed']), np.mean(fraud_name_uid_black['changed'])

(1.3953222405201895, 2.4676741036424548)

In [146]:
# 등분산 검정
leveneTest = stats.levene(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'])

leveneTest

LeveneResult(statistic=66415.25057459572, pvalue=0.0)

In [147]:
tTestResultName = stats.ttest_ind(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-124.22846763126933, pvalue=0.0)

### block, 영구 제재된 유저

In [148]:
fraud_name_uid_normal = fraud_name_uid[fraud_name_uid['block'] == 0]
fraud_name_uid_black = fraud_name_uid[fraud_name_uid['block'] == 1]

In [149]:
# 샘플 수 비교
np.size(fraud_name_uid_normal), np.size(fraud_name_uid_black)

(19938910, 575580)

In [150]:
# 평균 비교
np.mean(fraud_name_uid_normal['changed']), np.mean(fraud_name_uid_black['changed'])

(1.4122504690577369, 1.723704784738872)

In [151]:
# 등분산 검정
leveneTest = stats.levene(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'])

leveneTest

LeveneResult(statistic=6905.673826670213, pvalue=0.0)

In [152]:
tTestResultName = stats.ttest_ind(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-64.18757007264328, pvalue=0.0)

### operation, 운영팀에서 처리한 유저

In [153]:
fraud_name_uid_normal = fraud_name_uid[fraud_name_uid['block_by_operation'] == 0]
fraud_name_uid_black = fraud_name_uid[fraud_name_uid['block_by_operation'] == 1]

In [154]:
# 샘플 수 비교
np.size(fraud_name_uid_normal), np.size(fraud_name_uid_black)

(20440575, 73915)

In [155]:
# 평균 비교
np.mean(fraud_name_uid_normal['changed']), np.mean(fraud_name_uid_black['changed'])

(1.4194603136164223, 1.8437394304268417)

In [156]:
# 등분산 검정
leveneTest = stats.levene(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'])

leveneTest

LeveneResult(statistic=1684.9507456741444, pvalue=0.0)

In [157]:
tTestResultName = stats.ttest_ind(fraud_name_uid_normal['changed'], fraud_name_uid_black['changed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-28.38320412831144, pvalue=1.2879771204519969e-172)

## black user와 white user의 휴대폰 변경 횟수 T-Test

In [158]:
read_path = '../csv/changed_phone_raw.csv'
fraud_phone_raw = pd.read_csv(read_path)

fraud_phone_raw.tail()

Unnamed: 0,date,uid
434520,2019-07-04 11:54:52,5000124
434521,2019-07-04 11:59:15,9245231
434522,2019-07-04 11:59:15,7375046
434523,2019-07-04 11:59:45,9325895
434524,2019-07-04 11:59:45,5000124


In [159]:
fraud_phone = fraud_phone_raw.groupby(['uid'], as_index=False).agg({'date': 'count'})

fraud_phone = fraud_phone.rename(columns={'date': 'changed'})
fraud_phone['uid'] = fraud_phone['uid'].astype(str)

fraud_phone.tail()

Unnamed: 0,uid,changed
351970,9325547,1
351971,9325556,1
351972,9325757,1
351973,9325864,1
351974,9325895,1


In [160]:
fraud_phone_uid = pd.merge(fraud_phone, fraud_uid, on=['uid'], how='left')

fraud_phone_uid = fraud_phone_uid.drop(['black_source'], axis=1).fillna(0)

fraud_phone_uid.tail()

Unnamed: 0,uid,changed,help,block,block_by_operation
352769,9325547,1,0.0,0.0,0.0
352770,9325556,1,0.0,0.0,0.0
352771,9325757,1,0.0,0.0,0.0
352772,9325864,1,0.0,0.0,0.0
352773,9325895,1,0.0,0.0,0.0


### help, 상담센터 신고된 유저

In [161]:
fraud_phone_uid_normal = fraud_phone_uid[fraud_phone_uid['help'] == 0]
fraud_phone_uid_black = fraud_phone_uid[fraud_phone_uid['help'] == 1]

In [162]:
# 샘플 수 비교
np.size(fraud_phone_uid_normal), np.size(fraud_phone_uid_black)

(1671120, 92750)

In [163]:
# 평균 비교
np.mean(fraud_phone_uid_normal['changed']), np.mean(fraud_phone_uid_black['changed'])

(1.2197358657666715, 1.5118059299191375)

In [164]:
# 등분산 검정
leveneTest = stats.levene(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'])

leveneTest

LeveneResult(statistic=2796.678878461164, pvalue=0.0)

In [165]:
tTestResultName = stats.ttest_ind(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-32.85767789642815, pvalue=1.9805463786845297e-230)

### block, 영구 제재된 유저

In [166]:
fraud_phone_uid_normal = fraud_phone_uid[fraud_phone_uid['block'] == 0]
fraud_phone_uid_black = fraud_phone_uid[fraud_phone_uid['block'] == 1]

In [167]:
# 샘플 수 비교
np.size(fraud_phone_uid_normal), np.size(fraud_phone_uid_black)

(1672205, 91665)

In [168]:
# 평균 비교
np.mean(fraud_phone_uid_normal['changed']), np.mean(fraud_phone_uid_black['changed'])

(1.2256511611913612, 1.4073528609611083)

In [169]:
# 등분산 검정
leveneTest = stats.levene(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'])

leveneTest

LeveneResult(statistic=1065.2168015751035, pvalue=2.679807089971534e-233)

In [170]:
tTestResultName = stats.ttest_ind(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-25.131804181914454, pvalue=3.4213735343158866e-137)

### operation, 운영팀에서 처리한 유저

In [171]:
fraud_phone_uid_normal = fraud_phone_uid[fraud_phone_uid['block_by_operation'] == 0]
fraud_phone_uid_black = fraud_phone_uid[fraud_phone_uid['block_by_operation'] == 1]

In [172]:
# 샘플 수 비교
np.size(fraud_phone_uid_normal), np.size(fraud_phone_uid_black)

(1749280, 14590)

In [173]:
# 평균 비교
np.mean(fraud_phone_uid_normal['changed']), np.mean(fraud_phone_uid_black['changed'])

(1.233250251532059, 1.456134338588074)

In [174]:
# 등분산 검정
leveneTest = stats.levene(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'])

leveneTest

LeveneResult(statistic=266.26739445397175, pvalue=7.769769549404881e-60)

In [175]:
tTestResultName = stats.ttest_ind(fraud_phone_uid_normal['changed'], fraud_phone_uid_black['changed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-13.002796945918348, pvalue=1.253305388759693e-37)

## black user와 white user의 리뷰 작성 횟수 T-Test

In [176]:
read_path = '../csv/review_writer.csv'
fraud_review_writer_raw = pd.read_csv(read_path)

fraud_review_writer_raw.tail()

Unnamed: 0,time,writer_uid
5902957,2019-07-04 12:22:48,6439210
5902958,2019-07-04 12:23:38,4204619
5902959,2019-07-04 12:23:42,8994648
5902960,2019-07-04 12:23:47,1045672
5902961,2019-07-04 12:27:13,7810960


In [177]:
fraud_review_writer = fraud_review_writer_raw.groupby(['writer_uid'], as_index=False).agg({'time': 'count'})

fraud_review_writer = fraud_review_writer.rename(columns={'time': 'writed', 'writer_uid': 'uid'})
fraud_review_writer['uid'] = fraud_review_writer['uid'].astype(str)

fraud_review_writer.tail()

Unnamed: 0,uid,writed
1110787,9316109,1
1110788,9320668,1
1110789,9321110,1
1110790,9321727,1
1110791,9322225,1


In [178]:
fraud_review_writer_uid = pd.merge(fraud_review_writer, fraud_uid, on=['uid'], how='left')

fraud_review_writer_uid = fraud_review_writer_uid.drop(['black_source'], axis=1).fillna(0)

fraud_review_writer_uid.tail()

Unnamed: 0,uid,writed,help,block,block_by_operation
1112445,9316109,1,0.0,0.0,0.0
1112446,9320668,1,0.0,0.0,0.0
1112447,9321110,1,0.0,0.0,0.0
1112448,9321727,1,0.0,0.0,0.0
1112449,9322225,1,0.0,0.0,0.0


### help, 상담센터 신고된 유저

In [179]:
fraud_review_writer_uid_normal = fraud_review_writer_uid[fraud_review_writer_uid['help'] == 0]
fraud_review_writer_uid_black = fraud_review_writer_uid[fraud_review_writer_uid['help'] == 1]

In [180]:
# 샘플 수 비교
np.size(fraud_review_writer_uid_normal), np.size(fraud_review_writer_uid_black)

(5238700, 323550)

In [181]:
# 평균 비교
np.mean(fraud_review_writer_uid_normal['writed']), np.mean(fraud_review_writer_uid_black['writed'])

(4.822640158818027, 13.474748879616751)

In [182]:
# 등분산 검정
leveneTest = stats.levene(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'])

leveneTest

LeveneResult(statistic=15048.795104815888, pvalue=0.0)

In [183]:
tTestResultName = stats.ttest_ind(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-54.202946790032854, pvalue=0.0)

### block, 영구 제재된 유저

In [184]:
fraud_review_writer_uid_normal = fraud_review_writer_uid[fraud_review_writer_uid['block'] == 0]
fraud_review_writer_uid_black = fraud_review_writer_uid[fraud_review_writer_uid['block'] == 1]

In [185]:
# 샘플 수 비교
np.size(fraud_review_writer_uid_normal), np.size(fraud_review_writer_uid_black)

(5299480, 262770)

In [186]:
# 평균 비교
np.mean(fraud_review_writer_uid_normal['writed']), np.mean(fraud_review_writer_uid_black['writed'])

(5.218543140081668, 7.491551546980249)

In [187]:
# 등분산 검정
leveneTest = stats.levene(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'])

leveneTest

LeveneResult(statistic=789.837279523001, pvalue=1.0061173356686477e-173)

In [188]:
tTestResultName = stats.ttest_ind(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-27.04990176530938, pvalue=4.0765220140499386e-160)

### operation, 운영팀에서 처리한 유저

In [190]:
fraud_review_writer_uid_normal = fraud_review_writer_uid[fraud_review_writer_uid['block_by_operation'] == 0]
fraud_review_writer_uid_black = fraud_review_writer_uid[fraud_review_writer_uid['block_by_operation'] == 1]

In [191]:
# 샘플 수 비교
np.size(fraud_review_writer_uid_normal), np.size(fraud_review_writer_uid_black)

(5529015, 33235)

In [192]:
# 평균 비교
np.mean(fraud_review_writer_uid_normal['writed']), np.mean(fraud_review_writer_uid_black['writed'])

(5.298311724601941, 9.919512562058072)

In [193]:
# 등분산 검정
leveneTest = stats.levene(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'])

leveneTest

LeveneResult(statistic=470.36120655515845, pvalue=2.810903527047842e-104)

In [194]:
tTestResultName = stats.ttest_ind(fraud_review_writer_uid_normal['writed'], fraud_review_writer_uid_black['writed'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-14.993868766222317, pvalue=5.209686845031913e-50)

# 변수 기초 데이터 읽기

### 이름 변경 내역 파일 읽기

In [120]:
read_path = '../csv/changed_name.csv'
fraud_name = pd.read_csv(read_path)

fraud_name['is_black'] = fraud_name['is_black'].replace('normal', 0)
fraud_name['is_black'] = fraud_name['is_black'].replace('black', 1)

fraud_name.tail()

Unnamed: 0,is_black,uid,changed_name
4095222,0,9315495,1
4095223,0,9315496,1
4095224,0,9315512,1
4095225,0,9315513,1
4095226,0,9315544,1


### T-Test

In [121]:
fraud_name_normal = fraud_name[fraud_name['is_black'] == 0]
fraud_name_black = fraud_name[fraud_name['is_black'] == 1]

In [122]:
# 샘플 수 비교
np.size(fraud_name_normal['changed_name']), np.size(fraud_name_black['changed_name'])

(3983427, 111800)

In [123]:
# 평균 비교
np.mean(fraud_name_normal['changed_name']), np.mean(fraud_name_black['changed_name'])

(1.4122430259171312, 1.7182200357781754)

In [124]:
tTestResultName = stats.ttest_ind(fraud_name_normal['changed_name'], fraud_name_black['changed_name'], equal_var=False)

tTestResultName

Ttest_indResult(statistic=-62.24629954059341, pvalue=0.0)

### 폰 변경 내역 파일 읽기

In [None]:
read_path = '../csv/changed_phone.csv'
fraud_phone = pd.read_csv(read_path)

fraud_phone['is_black'] = fraud_phone['is_black'].replace('normal', 0)
fraud_phone['is_black'] = fraud_phone['is_black'].replace('black', 1)

fraud_phone.tail()

### T-Test

In [None]:
fraud_phone_normal = fraud_phone[fraud_phone['is_black'] == 0]
fraud_phone_black = fraud_phone[fraud_phone['is_black'] == 1]

In [None]:
# 샘플 수 비교
np.size(fraud_phone_normal['changed_phone']), np.size(fraud_phone_black['changed_phone'])

In [None]:
# 평균 비교
np.mean(fraud_phone_normal['changed_phone']), np.mean(fraud_phone_black['changed_phone'])

In [None]:
tTestResultPhone = stats.ttest_ind(fraud_phone_normal['changed_phone'], fraud_phone_black['changed_phone'], equal_var=False)

tTestResultPhone

### 리뷰 받은 사람 파일 읽기

In [125]:
read_path = '../csv/review_ee.csv'
fraud_review_ee = pd.read_csv(read_path)

fraud_review_ee['is_black'] = fraud_review_ee['is_black'].replace('normal', 0)
fraud_review_ee['is_black'] = fraud_review_ee['is_black'].replace('black', 1)

fraud_review_ee = fraud_review_ee.rename(columns={'shop_uid': 'uid'})

fraud_review_ee.tail()

Unnamed: 0,is_black,uid,review
960295,0,9303612,1
960296,0,9305161,1
960297,0,9310247,1
960298,0,9310368,1
960299,0,9313138,1


### T-Test

In [126]:
fraud_review_ee_normal = fraud_review_ee[fraud_review_ee['is_black'] == 0]
fraud_review_ee_black = fraud_review_ee[fraud_review_ee['is_black'] == 1]

In [127]:
# 샘플 수 비교
np.size(fraud_review_ee_normal['review']), np.size(fraud_review_ee_black['review'])

(897324, 62976)

In [128]:
# 평균 비교
np.mean(fraud_review_ee_normal['review']), np.mean(fraud_review_ee_black['review'])

(5.97372520962328, 8.573234247967479)

In [129]:
tTestResultReviewEe = stats.ttest_ind(fraud_review_ee_normal['review'], fraud_review_ee_black['review'], equal_var=False)

tTestResultReviewEe

Ttest_indResult(statistic=-23.21982978574343, pvalue=7.980710979230879e-119)

### 리뷰 남긴 사람 파일 읽기

In [130]:
read_path = '../csv/review_er.csv'
fraud_review_er = pd.read_csv(read_path)

fraud_review_er['is_black'] = fraud_review_er['is_black'].replace('normal', 0)
fraud_review_er['is_black'] = fraud_review_er['is_black'].replace('black', 1)

fraud_review_er = fraud_review_er.rename(columns={'writer_uid': 'uid'})

fraud_review_er.tail()

Unnamed: 0,is_black,uid,review
1109973,0,9309329,1
1109974,0,9309339,1
1109975,0,9309423,1
1109976,0,9311843,1
1109977,0,9313932,1


### T-Test

In [131]:
fraud_review_er_normal = fraud_review_er[fraud_review_er['is_black'] == 0]
fraud_review_er_black = fraud_review_er[fraud_review_er['is_black'] == 1]

In [132]:
# 샘플 수 비교
np.size(fraud_review_er_normal['review']), np.size(fraud_review_er_black['review'])

(1058974, 51004)

In [133]:
# 평균 비교
np.mean(fraud_review_er_normal['review']), np.mean(fraud_review_er_black['review'])

(5.215598305529692, 7.393478942828014)

In [134]:
tTestResultReviewEr = stats.ttest_ind(fraud_review_er_normal['review'], fraud_review_er_black['review'], equal_var=False)

tTestResultReviewEr

Ttest_indResult(statistic=-25.846254524648977, pvalue=2.049059005337516e-146)

# 데이터 merge

In [None]:
fraud_1 = pd.merge(fraud_name, fraud_phone, on =['is_black', 'uid'], how='outer')

fraud_2 = pd.merge(fraud_1, fraud_review_ee, on =['is_black', 'uid'], how='outer')

fraud = pd.merge(fraud_2, fraud_review_er, on =['is_black', 'uid'], how='outer')

fraud = fraud.rename(columns={'review_x': 'review_ee', 'review_y': 'review_er'})

fraud.fillna(0)

In [None]:
y = fraud_name['is_black']
X = fraud_name['changed_name']

logit_mod = sm.Logit(y, X)
# logit_res = logit_mod.fit(disp=0)
logit_res = logit_mod.fit()
print(logit_res.summary())

In [None]:
xx = np.linspace(0, 100, 200)


plt.scatter(X, logit_res.predict(X), label=r"$\hat{y}$", marker='x', c=y,
            s=200, lw=2, alpha=0.5)

plt.legend(loc = "best")
plt.show()

In [None]:
logit_res = sm.Logit.from_formula('is_black ~ changed_name', fraud_name).fit()

logit_res.summary()

In [None]:
plt.scatter(fraud_name['changed_name'], fraud_name['is_black'], label="data", marker='o')

plt.legend(loc = "best")
plt.show()

In [None]:
np.exp(logis.params)

In [None]:
params = logis.params
conf = logis.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
print(np.exp(conf))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
Y = fraud_name['is_black']
X = fraud_name.drop(['is_black'], axis=1)

# Y.reshape(1, -1)
# X = X.reshape(1, -1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)


In [None]:
log_clf = LogisticRegression()
log_clf.fit(X_train,Y_train)  # training 데이터로 학습 시키기
log_clf.score(X_test, Y_test)  # the mean accuracy on the given test data and labels.

In [None]:
#test(valid)/train 을 2:8 로 randomly select
train, valid = train_test_split(fraud_name, test_size=0.2, random_state=0)

#train/valid set 완성
train_y = train['is_black']
train_x = train.drop(['is_black'], axis=1)

valid_y = valid['is_black']
valid_x = valid.drop(['is_black'],axis=1)

In [None]:
# lr = LogisticRegression(C=1000.0, random_state=0)

lr = LogisticRegression()
lr.fit(train_x, train_y)  # Fit the model according to the given training data.
pred_y = lr.predict(valid_x)  # Predict class labels for samples in X.

In [None]:
pred_y

In [None]:
lr

In [None]:
print("Misclassification samples : %d" %(valid_y != pred_y).sum())

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score

print("accuracy: %.3f" %accuracy_score(valid_y, pred_y))  # Accuracy classification score
print("Precision : %.3f" % precision_score(valid_y, pred_y))
print("Recall : %.3f" % recall_score(valid_y, pred_y))
print("F1 : %.3f" % f1_score(valid_y, pred_y))

In [None]:
fraud_name_10 = fraud_name[fraud_name['changed_name'] <= 10]

In [None]:
y = fraud_name_10['is_black']
X = fraud_name_10['changed_name']

logit_mod = sm.Logit(y, X)
# logit_res = logit_mod.fit(disp=0)
logit_res = logit_mod.fit()
print(logit_res.summary())

In [None]:
logit_res = sm.Logit.from_formula('is_black ~ changed_name', fraud_name_10).fit()

logit_res.summary()