In [1]:
import pandas as pd

# trade 데이터로부터 이상값들을 추적한다.

In [2]:
tr = pd.read_csv('../preprocess/train_trade_pre.csv')
te = pd.read_csv('../preprocess/test_trade_pre.csv')
to = pd.concat([tr, te]).drop_duplicates().sort_values(by=list(tr.columns)).reset_index(drop=True)
to.columns

Index(['trade_week', 'trade_day', 'trade_time', 'source_acc_id',
       'target_acc_id', 'item_type', 'item_amount'],
      dtype='object')

# auto 계정들과, 버스기사, 골드 구매자를 구분하기 위한 과정이다.
# source가 target에게 여러가지 재화를 지급한 목록을 구한다. 

In [3]:
dup = to[to.duplicated(['trade_week','trade_day','trade_time','source_acc_id','target_acc_id'])]

sou = pd.DataFrame(dup['source_acc_id'].unique())
tar = pd.DataFrame(dup['target_acc_id'].unique())
sou.rename(columns={0:'acc_id'}, inplace=True)
tar.rename(columns={0:'acc_id'}, inplace=True)
sou = sou.sort_values(by='acc_id').reset_index(drop=True)
tar = tar.sort_values(by='acc_id').reset_index(drop=True)

print('Number of duplicated Source : {}\nNumber of duplicated Target : {}'.format(len(sou), len(tar)))

Number of duplicated Source : 106926
Number of duplicated Target : 9897


# Auto 분류는 sou, tar 데이터를 이용하여 2번 스크립트에서 추려내며,
# 현재 1번 스크립트에서는 버스기사나 레이드 활동자를 추려낸다.

In [4]:
tar.to_csv('auto_target_dup.csv', index=False)
sou = pd.DataFrame(set(sou['acc_id']) - set(tar['acc_id'])).rename(columns={0:'acc_id'}).sort_values(by='acc_id')
sou.to_csv('auto_source_dup.csv', index=False)

# 버스기사 또는 레이드 이익잉여금 분배는 다음과 같이 이뤄진다.
* 1명의 Source가 money를 같은 금액으로 다수의 Target과 나눠가진다.
* 때문에 duplicated 조건에 source_acc_id와 item_amount가 반드시 포함되어야 한다.

In [5]:
bus_or_raid = to[to.duplicated(['trade_week', 'source_acc_id', 'item_type', 'item_amount'])]
bus_or_raid.head()

Unnamed: 0,trade_week,trade_day,trade_time,source_acc_id,target_acc_id,item_type,item_amount
27,1,1,00:00:13,2787d157b3e525366c5ab3abdce58eea840b054f054817...,3c4778dd996dff88720acbbf95b0d49ab5f559403501a3...,money,220.0
35,1,1,00:00:17,a4785c1f445c5b9e93e84fac2ff7709e30caca844d5c01...,157dce3404f0fa27ee842dcfb3d4c82aa4eb2e94ff76b3...,money,220.0
36,1,1,00:00:19,cee3a04d387f99422402a1984eb8f002c4368dc78b4dba...,0f0819d7646b3f7dbeac6e1d08efe54dae35bdf343f22b...,money,4500.0
46,1,1,00:00:26,cee3a04d387f99422402a1984eb8f002c4368dc78b4dba...,cb73cb11a224821423b269b8a95d111b056dd6777a946e...,money,4500.0
47,1,1,00:00:27,2787d157b3e525366c5ab3abdce58eea840b054f054817...,32bf3bcac278f6ca6dc94e48283641609e401f3c849d0a...,money,220.0


In [6]:
money_trader = bus_or_raid[bus_or_raid['item_type']=='money']
driver_receiver = sorted(set(money_trader['source_acc_id']) | set(money_trader['target_acc_id']))
b_or_r = pd.DataFrame(driver_receiver).rename(columns={0: 'acc_id'})

In [7]:
driver = pd.merge(to, b_or_r.rename(columns={'acc_id': 'source_acc_id'}), how='inner').sort_values(by=list(to.columns))

### 1000 금 이상 오간 데이터만 확인한다.

In [8]:
temp = driver[(driver['item_type']=='money') & (driver['item_amount']>=1000)].reset_index(drop=True)
temp.head()

Unnamed: 0,trade_week,trade_day,trade_time,source_acc_id,target_acc_id,item_type,item_amount
0,1,1,00:00:06,cee3a04d387f99422402a1984eb8f002c4368dc78b4dba...,ed7f1e4d1322dd36a8cb73f791bbf394d7261ee402eafb...,money,4500.0
1,1,1,00:00:10,5e1cdb11a4b1a43f7dc8a4b9c34f5beeef2f80bace1398...,bb113c4f182d3ebce618696784c24bd5d2f9698999ca78...,money,1000.0
2,1,1,00:00:19,cee3a04d387f99422402a1984eb8f002c4368dc78b4dba...,0f0819d7646b3f7dbeac6e1d08efe54dae35bdf343f22b...,money,4500.0
3,1,1,00:00:23,7f97d8c6790830a81669c2fc391c849f750b2ad7b056e1...,d928d5a8db727cd8dd9d504e34cb1606e556e3ba3f5865...,money,7500.0
4,1,1,00:00:26,cee3a04d387f99422402a1984eb8f002c4368dc78b4dba...,cb73cb11a224821423b269b8a95d111b056dd6777a946e...,money,4500.0


In [9]:
sou_acc = temp['source_acc_id'].unique()
tar_acc = temp['target_acc_id'].unique()
print('Number of Large Source : {}\nNumber of Large Target : {}'.format(len(sou_acc), len(tar_acc)))

print('Large Trader : {}'.format(len(set(sou_acc) | set(tar_acc))))

Number of Large Source : 27174
Number of Large Target : 29365
Large Trader : 32872


### 게임에서 레이드 분배금을 나눠가지거나 큰 금액을 거래하는 이들은 32,872개.
### 이 중 이후 판별될 auto 관리자 계정을 제거하면 게임을 열심히 즐기고 있는 유저로 생각 할 수 있다.

In [10]:
hard_list = pd.DataFrame(sorted(set(sou_acc) | set(tar_acc))).rename(columns={0:'acc_id'})
hard_list.head()

Unnamed: 0,acc_id
0,0002cb165b57f8ab2bea2e387a3bebcd1ee57b44229150...
1,0004788822873725de28052e042b1f88b29a8ed6326be6...
2,000af534394783c3a1d28414ce6adc95af45657e34bf70...
3,000c7d37960d5261d7680320227e07e53580f654cf0322...
4,000f75da470b53684a56be7c49083451cfa58db4573900...


In [11]:
hard_list.to_csv('hard_list.csv', index=False)