# Party Feature engineering - network 만들기

In [2]:
import pickle
import warnings
from collections import Counter

import pandas as pd
from scipy import sparse
from tqdm import tqdm
from tqdm import tqdm_notebook

warnings.simplefilter('ignore')

In [3]:
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# Train

## 1. 데이터 불러오기

In [3]:
label = pd.read_csv("~/documents/chaser_data/train_label.csv")

In [4]:
%%time
party_filtered = pd.read_csv("~/documents/chaser_data/train_party_filtered.csv", memory_map=True)

CPU times: user 17.6 s, sys: 2.83 s, total: 20.4 s
Wall time: 26.6 s


In [5]:
len(party_filtered)

3355480

## 2. party에 참여한 acc_id 구하기

### 2.1 party members acc id 전체를 리스트로 만들기

#### (1) get_party_ids 함수
- party members acc id 전체가 들어간 리스트를 뽑아내는 함수

In [7]:
def get_party_ids(df):
    """Flatten all party member strings into one list of account ids.

    Each value of the ``party_members_acc_id`` column is split on commas and
    the pieces are concatenated in row order. Duplicates are kept.
    """
    flat_ids = []
    for members in df["party_members_acc_id"].tolist():
        flat_ids.extend(members.split(','))
    return flat_ids

In [8]:
%%time
party_ids = get_party_ids(party_filtered)

CPU times: user 9.95 s, sys: 954 ms, total: 10.9 s
Wall time: 10.9 s


#### (2) party에 참여한 id 수

In [9]:
# De-duplicate the flattened member ids.
party_unique_ids = list(set(party_ids))
# Save under data/, the location the following cell loads the pickle from,
# and close the file deterministically so the dump is fully flushed.
with open("data/party_unique_ids.pkl", "wb") as f:
    pickle.dump(party_unique_ids, f)

In [10]:
# Reload the saved unique ids; the context manager closes the file handle.
with open("data/party_unique_ids.pkl", "rb") as f:
    party_unique_ids = pickle.load(f)
print("party에 참여한 id 수(중복카운트):", len(party_ids))
# Sanity check: recomputed unique count should match the reloaded list below.
print(len(set(party_ids)))
print("party에 참여한 id 수(중복 없음):", len(party_unique_ids))

party에 참여한 id 수(중복카운트): 20781407
223621
party에 참여한 id 수(중복 없음): 223621


## 3. acc_id가 다른 id와 함께 파티한 횟수 구하기

### 3.1 함수 만들기

#### (1) get_party_relation(): 한 id의 party 관계 리스트로 구하기
- 한 유저(a)가 다른 유저(b)와 파티를 몇 회(n) 함께 했는지 (a, b, n)의 리스트를 받는 함수 작성

In [2]:
def get_party_relation(base_id, parties=None):
    """Count how often ``base_id`` appeared in a party with each other id.

    Args:
        base_id: Account id whose party relations are wanted.
        parties: Iterable of member-id lists, one list per party. When
            omitted, the module-level ``party_id`` list is used, which keeps
            existing ``get_party_relation(base_id)`` calls working.

    Returns:
        A list of ``(base_id, other_id, count)`` tuples ordered by
        ``other_id``. The list is empty when ``base_id`` never shared a
        party with another id.
    """
    if parties is None:
        parties = party_id

    # Tally every other member of each party that contains base_id.
    co_counts = Counter()
    for members in parties:
        if base_id in members:
            co_counts.update(m for m in members if m != base_id)

    # Sort by id so the output order matches the earlier groupby-based version.
    return [(base_id, other, cnt) for other, cnt in sorted(co_counts.items())]

#### (2) 1만개씩 relation 리스트 만들어 저장하기

- 컴퓨터 리소스를 고려, network relation 리스트를 1만개씩 받도록 함

In [13]:
def relation(num, chunk_size=10000):
    """Compute and pickle the relations for one chunk of ``party_unique_ids``.

    Args:
        num: Zero-based chunk index; also used in the output file name.
        chunk_size: Number of ids per chunk (default 10000, as before).

    Writes ``data/party_relations_<num>.pkl`` containing one relation list
    (as returned by ``get_party_relation``) per id in the chunk.
    """
    start = chunk_size * num
    # Slicing (rather than indexing) lets the final, shorter chunk be
    # processed without raising IndexError.
    chunk_ids = party_unique_ids[start:start + chunk_size]

    relations = [get_party_relation(acc_id) for acc_id in tqdm(chunk_ids)]

    with open("data/party_relations_{}.pkl".format(num), "wb") as f:
        pickle.dump(relations, f)

#### (3) get_network(): relations로 네트워크 만들기
- networkx 패키지 이용해서 relation으로부터 네트워크 그래프 만들기

In [10]:
def get_network(ls, graph=None):
    """Add one weighted edge per relation to a graph.

    Args:
        ls: Iterable of ``(id_a, id_b, weight)`` triples.
        graph: Object exposing ``add_edge(u, v, weight=...)``. When omitted,
            the module-level graph ``G`` is used, as before.
    """
    # Compare against None explicitly: a graph without nodes can be falsy,
    # so `graph or G` would silently pick the wrong target.
    target = G if graph is None else graph
    for id_a, id_b, party_cnt in ls:
        target.add_edge(id_a, id_b, weight=party_cnt)

### 3.2 네트워크 그래프 만들기

#### (1) relation list 만들어서 저장하기

In [14]:
# Pre-build the per-party member lists (a list of lists) used by
# get_party_relation().
party_id = [
    members.split(',')
    for members in party_filtered["party_members_acc_id"].tolist()
]

In [14]:
for idx in range(5):
    relation(idx)

100%|██████████| 10000/10000 [3:20:40<00:00,  1.20s/it]
100%|██████████| 10000/10000 [2:09:02<00:00,  1.29it/s]
100%|██████████| 10000/10000 [3:08:01<00:00,  1.13s/it]
100%|██████████| 10000/10000 [2:01:35<00:00,  1.37it/s]
100%|██████████| 10000/10000 [2:00:49<00:00,  1.38it/s]


In [14]:
for idx in range(5,10):
    relation(idx)

100%|██████████| 10000/10000 [3:20:40<00:00,  1.20s/it]
100%|██████████| 10000/10000 [2:09:02<00:00,  1.29it/s]
100%|██████████| 10000/10000 [3:08:01<00:00,  1.13s/it]
100%|██████████| 10000/10000 [2:01:35<00:00,  1.37it/s]
100%|██████████| 10000/10000 [2:00:49<00:00,  1.38it/s]


In [16]:
for idx in range(10,15):
    relation(idx)

100%|██████████| 10000/10000 [4:00:15<00:00,  1.44s/it]
100%|██████████| 10000/10000 [9:16:37<00:00,  3.34s/it]
100%|██████████| 10000/10000 [4:06:22<00:00,  1.48s/it]
100%|██████████| 10000/10000 [4:00:59<00:00,  1.45s/it]
100%|██████████| 10000/10000 [3:12:46<00:00,  1.16s/it]


In [14]:
for idx in range(15,20):
    relation(idx)

100%|██████████| 10000/10000 [3:20:40<00:00,  1.20s/it]
100%|██████████| 10000/10000 [2:09:02<00:00,  1.29it/s]
100%|██████████| 10000/10000 [3:08:01<00:00,  1.13s/it]
100%|██████████| 10000/10000 [2:01:35<00:00,  1.37it/s]
100%|██████████| 10000/10000 [2:00:49<00:00,  1.38it/s]


In [15]:
relation(20)

100%|██████████| 10000/10000 [2:17:32<00:00,  1.21it/s]


In [15]:
relation(21)

100%|██████████| 10000/10000 [2:20:23<00:00,  1.19it/s]


In [16]:
# Final, partial chunk: ids from index 220000 to the end of the list.
relations = [
    get_party_relation(party_unique_ids[i])
    for i in tqdm(range(220000, len(party_unique_ids)))
]

# Write under data/ next to the other chunks; network(22) reads it from there.
with open("data/party_relations_22.pkl", "wb") as f:
    pickle.dump(relations, f)

100%|██████████| 3621/3621 [1:29:02<00:00,  1.48s/it]


#### (2) relation list 불러와서 network graph 만들기

In [19]:
def network(num):
    """Load one pickled relation chunk and add its edges to the graph.

    Args:
        num: Chunk index; selects ``data/party_relations_<num>.pkl``.

    Prints the number of relation lists in the chunk, then passes each list
    to ``get_network``.
    """
    with open("data/party_relations_{}.pkl".format(num), "rb") as f:
        party_relations = pickle.load(f)
    print("length of party_relations_{}:".format(num), len(party_relations))
    # Loop variable deliberately not named `relation`, to avoid shadowing the
    # relation() function defined in this notebook.
    for id_relations in tqdm(party_relations):
        get_network(id_relations)

In [5]:
G = nx.Graph()

In [6]:
for num in range(5):
    network(num)

  3%|▎         | 297/10000 [00:00<00:03, 2884.00it/s]

length of party_relations_0: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1960.13it/s]
  2%|▏         | 187/10000 [00:00<00:05, 1852.99it/s]

length of party_relations_1: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1899.05it/s]
  2%|▏         | 212/10000 [00:00<00:04, 2062.84it/s]

length of party_relations_2: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1899.85it/s]
  5%|▍         | 465/10000 [00:00<00:04, 2318.13it/s]

length of party_relations_3: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1981.69it/s]
  2%|▏         | 180/10000 [00:00<00:05, 1757.64it/s]

length of party_relations_4: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1969.59it/s]


In [7]:
for num in range(5,10):
    network(num)

  2%|▏         | 170/10000 [00:00<00:06, 1612.83it/s]

length of party_relations_5: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1863.16it/s]
  1%|          | 123/10000 [00:00<00:09, 1095.13it/s]

length of party_relations_6: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1527.70it/s]
  1%|▏         | 149/10000 [00:00<00:06, 1473.44it/s]

length of party_relations_7: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1807.12it/s]
  2%|▏         | 150/10000 [00:00<00:06, 1467.62it/s]

length of party_relations_8: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1769.65it/s]
  2%|▏         | 169/10000 [00:00<00:06, 1631.06it/s]

length of party_relations_9: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1751.24it/s]


In [8]:
for num in range(10,15):
    network(num)

  1%|▏         | 149/10000 [00:00<00:06, 1430.88it/s]

length of party_relations_10: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1757.18it/s]
  4%|▎         | 367/10000 [00:00<00:05, 1834.51it/s]

length of party_relations_11: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1698.50it/s]
  4%|▎         | 361/10000 [00:00<00:05, 1799.38it/s]

length of party_relations_12: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1701.80it/s]
  3%|▎         | 309/10000 [00:00<00:06, 1533.15it/s]

length of party_relations_13: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1606.14it/s]
  2%|▏         | 169/10000 [00:00<00:06, 1621.43it/s]

length of party_relations_14: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1624.07it/s]


In [9]:
for num in range(15,20):
    network(num)

  3%|▎         | 298/10000 [00:00<00:06, 1477.26it/s]

length of party_relations_15: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1594.68it/s]
  1%|          | 118/10000 [00:00<00:08, 1136.62it/s]

length of party_relations_16: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1562.89it/s]
  3%|▎         | 252/10000 [00:00<00:07, 1257.97it/s]

length of party_relations_17: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1445.63it/s]
  1%|          | 93/10000 [00:00<00:10, 909.32it/s]

length of party_relations_18: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1452.65it/s]
  1%|▏         | 138/10000 [00:00<00:07, 1334.20it/s]

length of party_relations_19: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1521.39it/s]


In [10]:
for num in range(20,23):
    network(num)

  1%|          | 90/10000 [00:00<00:11, 845.70it/s]

length of party_relations_20: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1473.86it/s]
  3%|▎         | 326/10000 [00:00<00:05, 1627.61it/s]

length of party_relations_21: 10000


100%|██████████| 10000/10000 [00:06<00:00, 1474.10it/s]
  8%|▊         | 280/3621 [00:00<00:02, 1393.59it/s]

length of party_relations_22: 3621


100%|██████████| 3621/3621 [00:02<00:00, 1439.14it/s]


- 네트워크 구성 결과 node의 수(223,613)가 2명 이상짜리 파티에 참여한 unique id 수와 동일함. 전체 unique id 223,621개 중 함께한 다른 id가 없는 8개 id는 edge가 생기지 않아 node에 포함되지 않음

In [11]:
len(G.nodes())

223613

In [15]:
# Save the network to a pickle file, then read it back into G.
# NOTE(review): write_gpickle/read_gpickle are deprecated in newer NetworkX
# releases (removed in 3.0) — confirm the installed version before re-running.
nx.write_gpickle(G, "data/train_party_network.gpickle")
G = nx.read_gpickle("data/train_party_network.gpickle")

In [12]:
# Convert the network to a scipy sparse adjacency matrix and export it to a file.
# Rows/columns follow the order of G.nodes(); only the matrix S is written to
# the .npz, so the id-to-row mapping is not saved by this cell.
S = nx.to_scipy_sparse_matrix(G, nodelist=G.nodes())
sparse.save_npz("data/party_network_train.npz", S)

In [14]:
# sparse matrix 불러오기
sparse.load_npz("data/party_network_train.npz")

<223613x223613 sparse matrix of type '<class 'numpy.int64'>'
	with 38735942 stored elements in Compressed Sparse Row format>

----

# Test

## 1. 데이터 불러오기
- 메모리 관계상 party_id와 party_unique_ids 리스트를 train과 같은 방법으로 따로 만들어 pickle 파일로 불러옴

## 2. party에 참여한 acc_id 불러오기

In [4]:
party_id = pickle.load(open("data/party_id_test.pkl", "rb"))

In [5]:
party_unique_ids = pickle.load(open("data/party_unique_ids_test.pkl", "rb"))
print("party에 참여한 id 수(중복카운트):", len(party_id))
print("party에 참여한 id 수(중복 없음):", len(party_unique_ids))

party에 참여한 id 수(중복카운트): 2192953
party에 참여한 id 수(중복 없음): 192124


## 3. acc_id가 다른 id와 함께 파티한 횟수 구하기

### 3.1 함수 만들기

#### (1) 10000개씩 리스트 만들어 저장하는 함수 수정

In [6]:
def relation(num, chunk_size=10000):
    """Compute and pickle the test-set relations for one chunk of ids.

    Args:
        num: Zero-based chunk index; also used in the output file name.
        chunk_size: Number of ids per chunk (default 10000, as before).

    Writes ``data/party_relations_test_<num>.pkl`` containing one relation
    list (as returned by ``get_party_relation``) per id in the chunk.
    """
    start = chunk_size * num
    # Slicing (rather than indexing) lets the final, shorter chunk be
    # processed without raising IndexError.
    chunk_ids = party_unique_ids[start:start + chunk_size]

    relations = [get_party_relation(acc_id) for acc_id in tqdm(chunk_ids)]

    with open("data/party_relations_test_{}.pkl".format(num), "wb") as f:
        pickle.dump(relations, f)

### 3.2 네트워크 그래프 만들기

#### (1) relation list 만들어서 저장하기

In [14]:
for idx in range(5):
    relation(idx)

100%|██████████| 10000/10000 [3:20:40<00:00,  1.20s/it]
100%|██████████| 10000/10000 [2:09:02<00:00,  1.29it/s]
100%|██████████| 10000/10000 [3:08:01<00:00,  1.13s/it]
100%|██████████| 10000/10000 [2:01:35<00:00,  1.37it/s]
100%|██████████| 10000/10000 [2:00:49<00:00,  1.38it/s]


In [14]:
for idx in range(5,10):
    relation(idx)

100%|██████████| 10000/10000 [3:20:40<00:00,  1.20s/it]
100%|██████████| 10000/10000 [2:09:02<00:00,  1.29it/s]
100%|██████████| 10000/10000 [3:08:01<00:00,  1.13s/it]
100%|██████████| 10000/10000 [2:01:35<00:00,  1.37it/s]
100%|██████████| 10000/10000 [2:00:49<00:00,  1.38it/s]


In [16]:
for idx in range(10,15):
    relation(idx)

100%|██████████| 10000/10000 [4:00:15<00:00,  1.44s/it]
100%|██████████| 10000/10000 [9:16:37<00:00,  3.34s/it]
100%|██████████| 10000/10000 [4:06:22<00:00,  1.48s/it]
100%|██████████| 10000/10000 [4:00:59<00:00,  1.45s/it]
100%|██████████| 10000/10000 [3:12:46<00:00,  1.16s/it]


In [6]:
relation(15)

100%|██████████| 10000/10000 [1:29:39<00:00,  1.86it/s]


In [6]:
relation(16)

100%|██████████| 10000/10000 [1:29:39<00:00,  1.86it/s]


In [6]:
relation(17)

100%|██████████| 10000/10000 [1:29:39<00:00,  1.86it/s]


In [6]:
relation(18)

100%|██████████| 10000/10000 [1:29:39<00:00,  1.86it/s]


In [15]:
# Final, partial chunk: ids from index 190000 to the end of the list.
relations = [
    get_party_relation(party_unique_ids[i])
    for i in tqdm(range(190000, len(party_unique_ids)))
]

# Write under data/ next to the other chunks; network(19) reads it from there.
with open("data/party_relations_test_19.pkl", "wb") as f:
    pickle.dump(relations, f)

100%|██████████| 2124/2124 [28:30<00:00,  1.24it/s]


#### (2) relation list 불러와서 network graph 만들기

In [7]:
def network(num):
    """Load one pickled test-set relation chunk and add its edges to the graph.

    Args:
        num: Chunk index; selects ``data/party_relations_test_<num>.pkl``.

    Prints the number of relation lists in the chunk, then passes each list
    to ``get_network``.
    """
    with open("data/party_relations_test_{}.pkl".format(num), "rb") as f:
        party_relations = pickle.load(f)
    print("length of party_relations_{}:".format(num), len(party_relations))
    # Loop variable deliberately not named `relation`, to avoid shadowing the
    # relation() function defined in this notebook.
    for id_relations in tqdm(party_relations):
        get_network(id_relations)

In [8]:
G = nx.Graph()

In [11]:
for num in range(5):
    network(num)

  2%|▏         | 246/10000 [00:00<00:03, 2457.96it/s]

length of party_relations_0: 10000


100%|██████████| 10000/10000 [00:04<00:00, 2306.56it/s]
  4%|▍         | 435/10000 [00:00<00:04, 2166.21it/s]

length of party_relations_1: 10000


100%|██████████| 10000/10000 [00:04<00:00, 2240.39it/s]
  2%|▏         | 232/10000 [00:00<00:04, 2319.53it/s]

length of party_relations_2: 10000


100%|██████████| 10000/10000 [00:04<00:00, 2116.17it/s]
  5%|▍         | 456/10000 [00:00<00:04, 2274.48it/s]

length of party_relations_3: 10000


100%|██████████| 10000/10000 [00:04<00:00, 2255.39it/s]
  5%|▌         | 522/10000 [00:00<00:03, 2605.13it/s]

length of party_relations_4: 10000


100%|██████████| 10000/10000 [00:04<00:00, 2048.99it/s]


In [12]:
for num in range(5,10):
    network(num)

  4%|▍         | 381/10000 [00:00<00:05, 1886.36it/s]

length of party_relations_5: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1968.07it/s]
  5%|▍         | 457/10000 [00:00<00:04, 2252.22it/s]

length of party_relations_6: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1908.47it/s]
  5%|▍         | 481/10000 [00:00<00:03, 2398.32it/s]

length of party_relations_7: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1903.02it/s]
  2%|▏         | 235/10000 [00:00<00:04, 2259.11it/s]

length of party_relations_8: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1904.52it/s]
  2%|▏         | 168/10000 [00:00<00:05, 1668.75it/s]

length of party_relations_9: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1947.01it/s]


In [13]:
for num in range(10,15):
    network(num)

  2%|▏         | 233/10000 [00:00<00:04, 2325.10it/s]

length of party_relations_10: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1929.49it/s]
  2%|▏         | 181/10000 [00:00<00:05, 1794.60it/s]

length of party_relations_11: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1795.75it/s]
  2%|▏         | 175/10000 [00:00<00:05, 1743.55it/s]

length of party_relations_12: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1883.51it/s]
  2%|▏         | 222/10000 [00:00<00:04, 2217.98it/s]

length of party_relations_13: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1737.81it/s]
  3%|▎         | 325/10000 [00:00<00:06, 1607.76it/s]

length of party_relations_14: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1730.92it/s]


In [14]:
for num in range(15,20):
    network(num)

  2%|▏         | 216/10000 [00:00<00:04, 2157.33it/s]

length of party_relations_15: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1734.83it/s]
  2%|▏         | 209/10000 [00:00<00:04, 2087.83it/s]

length of party_relations_16: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1783.25it/s]
  2%|▏         | 200/10000 [00:00<00:04, 1961.25it/s]

length of party_relations_17: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1784.45it/s]
  3%|▎         | 251/10000 [00:00<00:04, 2382.26it/s]

length of party_relations_18: 10000


100%|██████████| 10000/10000 [00:05<00:00, 1712.90it/s]
  9%|▊         | 184/2124 [00:00<00:01, 1828.43it/s]

length of party_relations_19: 2124


100%|██████████| 2124/2124 [00:01<00:00, 1781.88it/s]


- 네트워크 구성 결과 node의 수(192,120)가 2명 이상짜리 파티에 참여한 unique id 수와 동일함. 전체 unique id 192,124개 중 함께한 다른 id가 없는 4개 id는 edge가 생기지 않아 node에 포함되지 않음

In [15]:
len(G.nodes())

192120

In [16]:
# Save the test network to a pickle file.
nx.write_gpickle(G, "data/test_party_network.gpickle")
# To reload the test network later: nx.read_gpickle("data/test_party_network.gpickle")

In [21]:
# Convert the test network to a scipy sparse adjacency matrix (saved to a file
# in a later cell). Rows/columns follow the order of G.nodes().
S = nx.to_scipy_sparse_matrix(G, nodelist=G.nodes())

In [24]:
sparse.save_npz("data/party_network_test.npz", S)

In [25]:
# sparse matrix 불러오기
sparse.load_npz("data/party_network_test.npz")

<192120x192120 sparse matrix of type '<class 'numpy.int64'>'
	with 30121996 stored elements in Compressed Sparse Row format>