In [313]:
import pandas as pd
import numpy as np
from datetime import tzinfo, timedelta, datetime
import os
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import re
from tqdm import tqdm, trange, tqdm_notebook
from multiprocessing import pool
import seaborn as sns
from pivottablejs import pivot_ui
from konlpy.tag import Okt  

#### 구매자 판매자 데이터 로드


In [314]:
df_bunp_all = pd.read_csv('../../mac/Bunjang_csv_data/bunp_seller_buyer_201701_201905.csv')
df_bunpay_transfer_all = pd.read_csv('../../mac/Bunjang_csv_data/bunpay_transfer_seller_buyer_201701_201905.csv')

#### 데이터 통합

In [315]:
df_bunp_all.head()

Unnamed: 0,updated_at,type,seller_uid,buyer_uid,total_price
0,2019-01-06 00:29:51,bunp,7290,8037251,37000
1,2019-05-18 18:37:46,bunp,8847,3673304,8000
2,2017-06-03 00:21:37,bunp,5479,3313173,800000
3,2018-11-27 18:15:18,bunp,13475,38784,19900
4,2017-07-01 17:23:10,bunp,19041,3934154,39000


In [316]:
df_bunpay_transfer_all.head()

Unnamed: 0,updated_at,type,seller_uid,buyer_uid,total_price
0,2018-04-17 17:09:43,bunpay,6553285,6216460,173550
1,2018-04-17 17:09:43,bunpay,6823120,6595915,248200
2,2018-04-17 17:09:43,bunpay,654290,1431716,49410
3,2018-04-17 17:09:43,bunpay,3004670,718680,150200
4,2018-04-17 17:09:43,bunpay,2685990,3332255,310000


In [317]:
df_transaction_all = pd.concat([df_bunp_all, df_bunpay_transfer_all], axis=0)

In [318]:
df_transaction_all.head()

Unnamed: 0,updated_at,type,seller_uid,buyer_uid,total_price
0,2019-01-06 00:29:51,bunp,7290,8037251,37000
1,2019-05-18 18:37:46,bunp,8847,3673304,8000
2,2017-06-03 00:21:37,bunp,5479,3313173,800000
3,2018-11-27 18:15:18,bunp,13475,38784,19900
4,2017-07-01 17:23:10,bunp,19041,3934154,39000


In [319]:
df_transaction_all.shape

(4158151, 5)

In [320]:
df_transaction_all.updated_at.max()

'2019-05-31 23:58:00'

In [321]:
df_transaction_all.updated_at.min()

'2017-01-01 00:04:02'

#### 사기 데이터 불러오기

In [322]:
df_fraud = pd.read_csv('../Bunjang_csv_data/fraud_report_data_2019_2016.csv')

In [323]:
df_fraud.head()

Unnamed: 0,신고일자,거래품목,거래금액,신고자uid,신고대상uid
0,2010.2.11,갤럭시 s8,100000,5160590.0,5815049
1,2015.1.12,노트3 SK/LG,70000,299974.0,4222952
2,2015.1.13,아이폰6,250000,3040425.0,4245137
3,2015.1.15,게임계정,50000,4244022.0,4109047
4,2015.1.16,갤럭시노트3,125000,2479816.0,4248663


In [324]:
df_fraud.columns = ['reported_date', 'product_name', 'price', 'reporter_uid', 'fraud_uid']

In [325]:
df_fraud.dropna(inplace=True)

In [326]:
df_fraud.reset_index(inplace=True, drop=True)

In [327]:
df_fraud.head()

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid
0,2010.2.11,갤럭시 s8,100000,5160590.0,5815049
1,2015.1.12,노트3 SK/LG,70000,299974.0,4222952
2,2015.1.13,아이폰6,250000,3040425.0,4245137
3,2015.1.15,게임계정,50000,4244022.0,4109047
4,2015.1.16,갤럭시노트3,125000,2479816.0,4248663


#### 사기 데이터 중에서 현재 차단중인 사용자만 분류

In [329]:
df_blocked = pd.read_csv('../Bunjang_csv_data/blocked_uid_list_20190702.csv')

In [330]:
df_blocked.head()

Unnamed: 0,uid
0,0
1,3
2,6
3,2709
4,3203


In [331]:
df_blocked.columns = ['fraud_uid']

In [333]:
df_fraud = pd.merge(df_fraud, df_blocked, on='fraud_uid', how='inner')

In [335]:
df_fraud.shape

(13763, 5)

#### 사기 데이터 전처리
- 거래 데이터가 2017년 1월부터이므로 신고 접수일이 2017년 1월 이후인 것으로 분류

In [336]:
df_fraud = df_fraud.loc[df_fraud.reported_date >= '2017-01-01']

In [337]:
df_fraud.reset_index(inplace=True, drop=True)

In [338]:
df_fraud.shape

(9827, 5)

In [339]:
df_fraud.head()

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid
0,2017.1.15,컴퓨터 본체,170000,3651257.0,4582938
1,2017.1.5,캘빈클라인 청자켓,40000,2519709.0,4788446
2,2017.4.25,롤 아이디 팝니다,15000,5523143.0,2153536
3,2017.8.27,박세웅 사인볼,23000,5975291.0,247569
4,2017.1.15,아이폰 6플러스 골드 64기가,310000,4016224.0,956841


#### 거래품목으로 카테고리 분류

In [340]:
okt=Okt()  

def tokenizer_okt_morphs(doc):
    return okt.morphs(doc)

def tokenizer_okt_noun(doc):
    return okt.nouns(doc)

def tokenizer_okt_pos(doc):
    return okt.pos(doc, norm=True, stem=True)

In [341]:
df_fraud['product_name_word'] = df_fraud['product_name'].apply(lambda x: tokenizer_okt_morphs(x))

In [342]:
df_fraud.head()

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid,product_name_word
0,2017.1.15,컴퓨터 본체,170000,3651257.0,4582938,"[컴퓨터, 본체]"
1,2017.1.5,캘빈클라인 청자켓,40000,2519709.0,4788446,"[캘빈, 클라인, 청, 자켓]"
2,2017.4.25,롤 아이디 팝니다,15000,5523143.0,2153536,"[롤, 아이디, 팝니다]"
3,2017.8.27,박세웅 사인볼,23000,5975291.0,247569,"[박세웅, 사인, 볼]"
4,2017.1.15,아이폰 6플러스 골드 64기가,310000,4016224.0,956841,"[아이폰, 6, 플러스, 골드, 64, 기, 가]"


In [343]:
df_fraud.loc[df_fraud.product_name_word.apply(lambda x: len(x) < 1)]

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid,product_name_word
3733,2017.8.12,,820000,4809587.0,5909359,[]


In [344]:
tokens = [ t for d in df_fraud['product_name_word'] for t in d]

In [345]:
print(len(tokens))

31229


In [346]:
import nltk

In [347]:
text = nltk.Text(tokens, name='NMSC')

In [348]:
print(len(text.tokens))

31229


In [349]:
print(len(set(text.tokens)))

3485


In [350]:
print(text.vocab().most_common(90))

[('아이폰', 2540), ('6', 1132), ('s', 1110), ('7', 1070), ('에어팟', 904), ('갤럭시', 819), ('8', 620), ('플러스', 410), ('가', 354), ('노트', 346), ('기', 327), ('64', 292), ('5', 284), ('+', 284), ('지갑', 267), ('골드', 241), ('구찌', 231), ('상품권', 226), ('반', 209), ('즈', 198), ('2', 193), ('블랙', 193), ('롱패딩', 188), ('3', 184), ('무스', 184), ('너클', 181), ('로', 180), ('128', 173), ('4', 170), ('se', 158), ('g', 143), ('나이키', 141), ('문화', 138), ('S', 135), ('맨', 131), ('투맨', 130), ('티', 130), ('닌텐도', 115), ('패딩', 114), ('스네이크', 114), ('후드', 111), ('매트', 111), ('엣지', 110), ('노트북', 109), ('버니', 107), ('스웨터', 106), ('아이패드', 106), ('백', 104), ('x', 103), (',', 97), ('스', 94), ('아이', 93), ('레드', 90), ('삼성', 88), ('이', 84), ('스톤아일랜드', 82), ('스위치', 82), ('구', 76), ('팔', 74), ('프로', 73), ('집업', 72), ('루이비통', 72), ('데상트', 71), ('그레이', 71), ('gb', 68), ('티켓', 67), ('9', 67), ('G', 67), ('실버', 66), ('계정', 66), ('k', 65), ('디스커버리', 65), ('스페이스', 65), ('시계', 63), ('아디다스', 62), ('1', 62), ('장', 62), ('다이슨', 61), ('가방', 6

In [351]:
print(text.vocab().most_common()[:-20:-1])

[('마리오카트', 1), ('46', 1), ('풀셋트', 1), ('슬링', 1), ('스위', 1), ('5억', 1), ('bp', 1), ('페라리', 1), ('노초', 1), ('와플', 1), ('LDV', 1), ('사카이', 1), ('SA', 1), ('널지오', 1), ('셔', 1), ('방한', 1), ('널', 1), ('플펜슬', 1), ('하의', 1)]


In [352]:
word_dict = {
    '디지털/가전': ['아이폰','s','6', '갤럭시','7', '에어팟', '노트','5', '8', '64', '+', '2', '3',
               '4', 'gb', '플러스', '스페이스', 'lg', '9', 'x', 'G', '노트북', 'S', '아이패드', 
               '청소기', '에어팟', '노트', '플러스', '엣지', '삼성', '컴퓨터', '핸드폰', '다이슨',
               '애플', '갤럭시탭', '모니터', '이어폰', '휴대폰', '스마트폰', '겔럭시', '기어', '베가',
               '배터리', '패드', '미러리스', '아이팟', '맥북', '캐논', '소니', '카메라', '케이스',
               '에어컨', '공기청정기', '그램', '블루투스', '아이팟', '아이', '그래픽카드', '탭', '본체',
               '루나', '프로', '폰', '로지텍', '밥솥', '맥', '마우스', '제습기', '블랙베리',
               '인치', '청축', '태블릿', '램', '외장하드', '무선', '고데기', 'gtx', '1060', 'LG'
              ],
    '패션/잡화' : ['지갑', '가방', '신발', '클러치', '모자', '시계', '구두', '백', '목걸이', '팔찌',
               '반지', '슬리퍼', '축구화', '벨트', '금반지', '루이비통', '구찌', '샤넬', '지샥',
               '파우치', '가발', '선글라스', '머니클립', '발렌시아', '귀걸이', '안경'
              ],
    '패션/잡화_운동화/캐쥬얼화' : ['운동화','스니커즈', '조던', '맥스', '힙색', '부스트', '나이키',
                       '아디다스', '골든', '슈퍼스타'],
    '도서/티켓/애완/취미': ['상품권', '문화', '티켓', '콘서트', '데이터', '규어', '기프트카드', '카드',
                   '인형', '이용권', '책', '치킨', '문상', '스타벅스', '기프티콘'],
    '여성/남성의류': ['롱패딩', '옷', '패딩', '자켓', '후드', '집업', '스웨터', '원피스',
                '바지', '가디건', '맨투맨', '투맨', '티', '코트', '니트', '트레이닝', '바람막이',
                '트레이닝복' ,'셔츠', '의류', '폴라', '점퍼', '무스', '너클', '스톤아일랜드',
                '스커트', '져지', '팬츠', '레깅스', '노스페이스', '밀포드', '히말라야', '디스커버리',
                '캐나다구스', '파타고니아', '스퀘어', '치마', '수트', '티셔츠', '청바지', '스파이더'
               ],
    '문구/가구/식품' : ['인스', '분유'],
    '스포츠/레저': ['자전거', '픽시', '야구', '라켓'],
    '스타굿즈': ['방탄소년단', '엑소', '인피니트', '포카', '트와이스', '방탄', '포토', '앨범', '싸인', 
            '포스터', '벨벳', '에이핑크', '응원봉'],
    '디지털/가전_게임/타이틀' : ['닌텐도', '오버', '피파', '계정', '아이디', '게임', '롤', '플레이스테이션',
                      '플스', '몬스터', '엑스박스', '모두의마블'],
    '뷰티/미용' : ['입생로랑', '팩', '마스크', '틴트', '향수', '쿠션', '크림', '밤', '립스틱', '에뛰드',
              '화장품', '파운데이션'],
    '차량/오토바이' : ['오토바이'],
    '유아동/출산': ['소독기']
}

In [353]:
list_product_name = list(df_fraud.product_name)
list_category = []
for i in range(len(list_product_name)):    
    flag_word = False
    flag_key = False
#     print(i, list_product_name[i])
    for key in word_dict.keys():
        if flag_key == False:
            for word in word_dict[key]:
#                 print('key: {key} / word: {word}'.format(key=key, word=word))
                if word in list_product_name[i]:
                    list_category.append(key)
                    flag_key = True
#                     print('find_key')
                    break
#     print('end of search')
    flag_word = True
    if (flag_word == True) & (flag_key == False):
        list_category.append('no_category')
#         print('No key')

In [354]:
df_fraud.shape

(9827, 6)

In [355]:
df_fraud['category'] = pd.Series(list_category)

In [356]:
df_fraud_no_category = df_fraud.loc[df_fraud.category == 'no_category']

In [357]:
df_fraud_no_category.shape

(612, 7)

In [358]:
df_fraud_no_category.head(100)

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid,product_name_word,category
3,2017.8.27,박세웅 사인볼,23000,5975291.0,247569,"[박세웅, 사인, 볼]",no_category
10,2017.4.5,닐바렛 네오프렌,51500,5174537.0,405859,"[닐, 바렛, 네오프렌]",no_category
11,2017.2.13,향스프레이,18000,3518962.0,181589,"[향, 스프레이]",no_category
18,2017.1.4,뿌링클,23000,4517992.0,4355616,"[뿌, 링클]",no_category
25,2017.1.4,라이터,40000,5168350.0,4966224,[라이터],no_category
27,2018.2.17,잔트렉스블루,18000,5989592.0,4881626,"[잔트, 렉스, 블루]",no_category
29,2018.4.16,까르띠에정품러브링,160000,5155810.0,1173166,"[까, 르띠, 에, 정품, 러브, 링]",no_category
38,2017.3.15,대한간호협회문제집,6000,3307278.0,4653718,"[대, 한, 간호, 협회, 문, 제, 집]",no_category
39,2017.7.5,자이글,53000,2757941.0,1645813,"[자, 이글]",no_category
40,2017.4.11,랩핑지,10000,4148953.0,2487386,"[랩핑, 지]",no_category


#### 카테고리 분류 안된 단어 토큰 재확인

In [359]:
tokens_no_category = [ t for d in df_fraud_no_category['product_name_word'] for t in d]

In [360]:
text_no_category = nltk.Text(tokens_no_category, name='NMSC')

In [361]:
print(text_no_category.vocab().most_common(100))

[('즈', 19), ('이', 16), ('팔', 16), ('너', 15), ('보드', 14), ('굿', 13), ('반', 13), ('세트', 12), ('1', 10), ('의', 10), (',', 10), ('브라운', 9), ('원', 8), ('후', 7), ('스', 7), ('구', 7), ('아', 7), ('패치', 7), ('키', 7), ('네', 7), ('전', 7), ('동', 7), ('로', 7), ('워', 6), ('킥', 6), ('유니폼', 6), ('데상트', 6), ('틴', 6), ('블루', 5), ('제', 5), ('래', 5), ('라', 5), ('톰', 5), ('슬로건', 5), ('화이트', 5), ('드릴', 5), ('권', 5), ('세븐', 5), ('디스', 5), ('박스', 5), ('리스', 5), ('핑크', 5), ('에', 4), ('대', 4), ('지', 4), ('체크', 4), ('미니', 4), ('이름', 4), ('은', 4), ('플', 4), ('시', 4), ('탑', 4), ('곤약', 4), ('젤리', 4), ('에어', 4), ('일', 4), ('전자담배', 4), ('롱', 4), ('(', 4), (')', 4), ('오', 4), ('랜덤', 4), ('kt', 4), ('필립스', 4), ('허', 4), ('벌', 4), ('라이프', 4), ('봉', 4), ('업', 4), ('베이', 4), ('디테', 4), ('스타', 4), ('크롬', 4), ('글', 4), ('데스크탑', 4), ('오프', 4), ('카이', 4), ('링', 3), ('집', 3), ('반스', 3), ('검정', 3), ('양털', 3), ('디올', 3), ('와인', 3), ('스마트', 3), ('김치냉장고', 3), ('크', 3), ('트', 3), ('스톤', 3), ('와', 3), ('펜', 3), ('러너', 3), ('다운', 3), 

In [362]:
df_fraud.groupby('category').count()

Unnamed: 0_level_0,reported_date,product_name,price,reporter_uid,fraud_uid,product_name_word
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
no_category,612,612,612,612,612,612
도서/티켓/애완/취미,381,381,381,381,381,381
디지털/가전,6484,6484,6484,6484,6484,6484
디지털/가전_게임/타이틀,194,194,194,194,194,194
문구/가구/식품,16,16,16,16,16,16
뷰티/미용,46,46,46,46,46,46
스타굿즈,72,72,72,72,72,72
스포츠/레저,14,14,14,14,14,14
여성/남성의류,1033,1033,1033,1033,1033,1033
유아동/출산,1,1,1,1,1,1


#### 사기발생 전 거래건수 확인

In [363]:
df_fraud.head()

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid,product_name_word,category
0,2017.1.15,컴퓨터 본체,170000,3651257.0,4582938,"[컴퓨터, 본체]",디지털/가전
1,2017.1.5,캘빈클라인 청자켓,40000,2519709.0,4788446,"[캘빈, 클라인, 청, 자켓]",여성/남성의류
2,2017.4.25,롤 아이디 팝니다,15000,5523143.0,2153536,"[롤, 아이디, 팝니다]",디지털/가전
3,2017.8.27,박세웅 사인볼,23000,5975291.0,247569,"[박세웅, 사인, 볼]",no_category
4,2017.1.15,아이폰 6플러스 골드 64기가,310000,4016224.0,956841,"[아이폰, 6, 플러스, 골드, 64, 기, 가]",디지털/가전


In [364]:
df_transaction_all.head()

Unnamed: 0,updated_at,type,seller_uid,buyer_uid,total_price
0,2019-01-06 00:29:51,bunp,7290,8037251,37000
1,2019-05-18 18:37:46,bunp,8847,3673304,8000
2,2017-06-03 00:21:37,bunp,5479,3313173,800000
3,2018-11-27 18:15:18,bunp,13475,38784,19900
4,2017-07-01 17:23:10,bunp,19041,3934154,39000


In [365]:
list_fraud_uid = list(df_fraud.fraud_uid)

#### 첫 신고일자 기준으로 신고데이터 정리

In [366]:
df_fraud.fraud_uid.nunique()

7301

In [367]:
df_fraud.head()

Unnamed: 0,reported_date,product_name,price,reporter_uid,fraud_uid,product_name_word,category
0,2017.1.15,컴퓨터 본체,170000,3651257.0,4582938,"[컴퓨터, 본체]",디지털/가전
1,2017.1.5,캘빈클라인 청자켓,40000,2519709.0,4788446,"[캘빈, 클라인, 청, 자켓]",여성/남성의류
2,2017.4.25,롤 아이디 팝니다,15000,5523143.0,2153536,"[롤, 아이디, 팝니다]",디지털/가전
3,2017.8.27,박세웅 사인볼,23000,5975291.0,247569,"[박세웅, 사인, 볼]",no_category
4,2017.1.15,아이폰 6플러스 골드 64기가,310000,4016224.0,956841,"[아이폰, 6, 플러스, 골드, 64, 기, 가]",디지털/가전


In [368]:
df_fraud_first = df_fraud.sort_values('reported_date').groupby('fraud_uid').first()

In [369]:
df_fraud_first.reset_index(inplace=True)

In [370]:
df_fraud_first.shape

(7301, 7)

In [371]:
df_fraud_first.head()

Unnamed: 0,fraud_uid,reported_date,product_name,price,reporter_uid,product_name_word,category
0,5196,2019.1.16,에어팟,400000,3925192.0,[에어팟],디지털/가전
1,28276,2018.10.14,덕다운 롱패딩,128000,3124346.0,"[덕, 다운, 롱패딩]",여성/남성의류
2,30444,2017.10.18,톰브라운,18000,2222536.0,"[톰, 브라운]",no_category
3,34547,2017.7.10,이지부스트,360000,3717304.0,"[이, 지, 부스트]",패션/잡화_운동화/캐쥬얼화
4,38137,2017.2.20,나이키슬리퍼,23000,1850234.0,"[나이키, 슬리퍼]",패션/잡화


In [372]:
df_count_by_seller = pd.DataFrame(columns=['seller_uid', 'type', 'updated_at'])
for i in trange(df_fraud_first.shape[0]):
    df_data = df_transaction_all.loc[
        (df_transaction_all.seller_uid == df_fraud_first.fraud_uid[i]) &
        (df_transaction_all.updated_at < df_fraud_first.reported_date[i])
    ]
    if df_data.shape[0] > 0:
        df_count = df_data[['updated_at', 'seller_uid', 'type']].groupby(
            ['seller_uid', 'type']).count().reset_index()
        df_count_by_seller = pd.concat([df_count_by_seller, df_count])

100%|██████████| 7301/7301 [47:32<00:00,  2.56it/s] 


In [373]:
df_count_by_seller

Unnamed: 0,seller_uid,type,updated_at
0,28276,bunp,3
0,30444,bunp,2
0,34547,bunp,1
0,38137,bunp,7
0,39104,bunp,2
1,39104,bunpay,1
0,87523,bunp,10
0,103050,bunp,2
0,125571,bunp,2
0,137757,bunp,7


In [374]:
df_count_by_seller_pivot = df_count_by_seller.pivot('seller_uid', 'type')

In [375]:
df_count_by_seller_pivot.fillna(0, inplace=True)

In [376]:
df_count_by_seller_table = pd.DataFrame(df_count_by_seller_pivot.to_records())

In [377]:
df_count_by_seller_table.columns = ['fraud_uid', 'bunp', 'bunpay', 'transfer']

In [378]:
df_fraud_first.head()

Unnamed: 0,fraud_uid,reported_date,product_name,price,reporter_uid,product_name_word,category
0,5196,2019.1.16,에어팟,400000,3925192.0,[에어팟],디지털/가전
1,28276,2018.10.14,덕다운 롱패딩,128000,3124346.0,"[덕, 다운, 롱패딩]",여성/남성의류
2,30444,2017.10.18,톰브라운,18000,2222536.0,"[톰, 브라운]",no_category
3,34547,2017.7.10,이지부스트,360000,3717304.0,"[이, 지, 부스트]",패션/잡화_운동화/캐쥬얼화
4,38137,2017.2.20,나이키슬리퍼,23000,1850234.0,"[나이키, 슬리퍼]",패션/잡화


In [379]:
df_fraud_first.shape

(7301, 7)

#### 사기 데이터와 사기 발생 이전 거래 데이터 합치기

In [380]:
df_fraud_w_transaction = pd.merge(df_fraud_first, df_count_by_seller_table, on='fraud_uid', how = 'outer')

In [381]:
df_fraud_w_transaction.fillna(0, inplace=True)

In [382]:
df_fraud_w_transaction.head()

Unnamed: 0,fraud_uid,reported_date,product_name,price,reporter_uid,product_name_word,category,bunp,bunpay,transfer
0,5196,2019.1.16,에어팟,400000,3925192.0,[에어팟],디지털/가전,0.0,0.0,0.0
1,28276,2018.10.14,덕다운 롱패딩,128000,3124346.0,"[덕, 다운, 롱패딩]",여성/남성의류,3.0,0.0,0.0
2,30444,2017.10.18,톰브라운,18000,2222536.0,"[톰, 브라운]",no_category,2.0,0.0,0.0
3,34547,2017.7.10,이지부스트,360000,3717304.0,"[이, 지, 부스트]",패션/잡화_운동화/캐쥬얼화,1.0,0.0,0.0
4,38137,2017.2.20,나이키슬리퍼,23000,1850234.0,"[나이키, 슬리퍼]",패션/잡화,7.0,0.0,0.0


In [383]:
df_fraud_w_transaction.drop(columns=['product_name_word'], inplace=True)

In [384]:
df_fraud_w_transaction.shape

(7301, 9)

In [398]:
df_fraud_w_transaction.to_csv('../Downloads/fraud_list_uid.csv')

#### 전문상점 데이터 합치기

In [399]:
df_fraud_biz = pd.read_csv('../Bunjang_csv_data/biz_license.csv')

In [400]:
df_fraud_biz.head()

Unnamed: 0,uid,bizlicense
0,5196,0
1,28276,1
2,30444,0
3,34547,0
4,38137,0


In [401]:
df_fraud_biz.columns

Index(['uid', 'bizlicense'], dtype='object')

In [402]:
df_fraud_biz.columns = ['fraud_uid', 'is_bizlicense']

In [403]:
df_fraud_w_transaction_w_biz = pd.merge(df_fraud_w_transaction, df_fraud_biz, on='fraud_uid', how='outer')

In [404]:
df_fraud_w_transaction_w_biz.loc[df_fraud_w_transaction_w_biz.is_bizlicense.isna()]

Unnamed: 0,fraud_uid,reported_date,product_name,price,reporter_uid,category,bunp,bunpay,transfer,is_bizlicense


In [405]:
df_fraud_w_transaction_w_biz.is_bizlicense.fillna('no_data', inplace=True)

In [406]:
df_fraud_w_transaction_w_biz

Unnamed: 0,fraud_uid,reported_date,product_name,price,reporter_uid,category,bunp,bunpay,transfer,is_bizlicense
0,5196,2019.1.16,에어팟,400000,3925192.0,디지털/가전,0.0,0.0,0.0,0
1,28276,2018.10.14,덕다운 롱패딩,128000,3124346.0,여성/남성의류,3.0,0.0,0.0,1
2,30444,2017.10.18,톰브라운,18000,2222536.0,no_category,2.0,0.0,0.0,0
3,34547,2017.7.10,이지부스트,360000,3717304.0,패션/잡화_운동화/캐쥬얼화,1.0,0.0,0.0,0
4,38137,2017.2.20,나이키슬리퍼,23000,1850234.0,패션/잡화,7.0,0.0,0.0,0
5,39104,2019.1.2,스피드러너 40c,500000,6549814.0,디지털/가전,2.0,1.0,0.0,0
6,55453,2017.1.17,기가 유무선 위보 공유기 뿔4개 듀얼코어,20000,4335356.0,디지털/가전,0.0,0.0,0.0,0
7,55883,2018.1.28,구찌벨트,150000,5255156.0,패션/잡화,0.0,0.0,0.0,0
8,73040,2018.12.8,에어팟,322000,3092870.0,디지털/가전,0.0,0.0,0.0,0
9,81559,2018.12.13,커버낫 롱패딩,180000,1567089.0,여성/남성의류,0.0,0.0,0.0,0


In [407]:
df_fraud_w_transaction_w_biz.to_csv('../Downloads/fraud_transaction_w_biz.csv')