In [1]:
# Locations of the input and sample-submission files, relative to the
# notebook's working directory.
edges_path, ground_truth_path, submission_path = (
    'data/edges.csv',
    'data/ground_truth.csv',
    'data/SubmissionSample.csv',
)

In [2]:
from deepwalk import DeepWalk
import pandas as pd
import numpy as np

In [5]:
# Read the edge list as a plain array, then collect every distinct value that
# appears anywhere in it (np.unique flattens its input and returns the values
# sorted).
edges = pd.read_csv(edges_path).values
nodes = np.unique(edges)

In [3]:
# Biased random-walk configuration. Per the author, this setup produces a
# result close to the best score on the leaderboard.
dw = DeepWalk(
    random_policy='bias_random',
    dimensions=64,
    path_length=20,
    num_random=10,
    workers=4,
)
dw.fit(edges_path)

Creating the graph has finished...


Computing transition probabilities: 100%|███████████████████████████████████████| 31136/31136 [00:58<00:00, 532.01it/s]


Begin to generate random walks...
Already get all random walks, Number of random walks: 311360
Begin to train the model...
Model training has finished...


<deepwalk.DeepWalk at 0x2ba86152550>

In [18]:
# This model runs faster, but its accuracy may be lower.
# Increasing num_random or path_length should give better results.
# dw = DeepWalk(num_random=5, random_policy='random', epochs=20, path_length=10, workers=4)
# dw.fit(edges_path)

Creating the graph has finished...


100%|██████████████████████████████████████████████████████████████████████████| 31136/31136 [00:10<00:00, 3028.94it/s]


Already get all random walks, Number of random walks: 155680
Begin to train the model...
Model training has finished...


<deepwalk.DeepWalk at 0x1e61b9947b8>

In [6]:
# Ask the fitted model for a community label for the nodes from the edge list.
# NOTE(review): presumably returns one label per entry of `nodes`, in the same
# order — confirm against DeepWalk.predict.
communities = dw.predict(nodes)

In [7]:
# Quick sanity check of the predicted labels (numpy abbreviates long arrays).
print(communities)

[0 0 3 ... 1 1 1]


In [8]:
from collections import Counter
def convert_with_gt(communities, ground_truth, default_category=2, verbose=True):
    """Relabel predicted communities with their majority ground-truth category.

    Each distinct value in ``communities`` is treated as one cluster. The
    members of a cluster are the positions at which that value occurs, and a
    position is used directly as the node id when it is looked up in
    ``ground_truth``. Every member of the cluster receives the category that
    occurs most often among the cluster's labelled members (ties go to the
    category met first when walking the members in ascending position).

    Parameters
    ----------
    communities : 1-D integer array
        Cluster id per node; index ``i`` is taken to be node id ``i``.
    ground_truth : 2-D array
        Column 0 holds node ids, column 1 their category. If an id is listed
        more than once, its first row wins.
    default_category : int, optional
        Category given to clusters that contain no labelled node. Defaults to
        2, the value that used to be hard-coded.
    verbose : bool, optional
        When true (the default, matching the earlier behaviour) print the
        cluster ids, and for each cluster the labels found and the category
        chosen.

    Returns
    -------
    numpy.ndarray
        Array with the same length and dtype as ``communities`` holding the
        chosen category for every position.
    """
    communities = np.asarray(communities)
    ground_truth = np.asarray(ground_truth)

    # Node id -> category, built once so every lookup is O(1) instead of a
    # list scan plus np.where per node. setdefault keeps the first row for a
    # duplicated id, which is what the np.where(...)[0][0] lookup selected.
    label_of = {}
    for node_id, label in zip(ground_truth[:, 0].tolist(),
                              ground_truth[:, 1].tolist()):
        label_of.setdefault(node_id, label)

    # np.unique returns sorted values, so clusters are visited in ascending order.
    members_by_community = {
        community: np.where(communities == community)[0]
        for community in np.unique(communities)
    }
    if verbose:
        print(members_by_community.keys())

    communities_gt = np.zeros((len(communities),), dtype=communities.dtype)
    for members in members_by_community.values():
        # Categories of the members that appear in the ground truth.
        known = np.array(
            [label_of[node] for node in members.tolist() if node in label_of],
            dtype=np.int64,
        )
        if verbose:
            print(known)
        if len(known) == 0:
            category = default_category
        else:
            category = Counter(known.tolist()).most_common(1)[0][0]
        if verbose:
            print(category)
        # Clusters are disjoint, so writing straight into the result is safe
        # and places no restriction on which category values may occur.
        communities_gt[members] = category
    return communities_gt

In [9]:
# Load the labelled nodes as a plain array, then relabel each predicted
# community using those labels.
ground_truth = pd.read_csv(ground_truth_path).values
communities_gt = convert_with_gt(communities, ground_truth)

dict_keys([0, 1, 2, 3, 4])
[4 4 4 4 4 4 4 4 4 4]
4
[3 3 3 3 3 3 3 3 3 3]
3
[2 2 2 2 2 2 2 2 2 2]
2
[1 1 1 1 1 1 1 1 1 1]
1
[0 0 0 0 0 0 0 0 0 0]
0


In [10]:
# Build the submission table (one row per node position, with its relabelled
# category) and write it without the pandas index column.
# The ids are generated under a separate name so that `nodes`, which the
# prediction cell passes to dw.predict, is not overwritten.
# NOTE(review): ids are positional (0..N-1); they equal the real node ids only
# if the ids in the edge list are contiguous from 0 — confirm.
node_ids = np.arange(len(nodes))
df = pd.DataFrame({'id': node_ids, 'category': communities_gt})
df.to_csv('data/submission_new.csv', index=False)

In [11]:
# Agreement with the author's best saved submission: the fraction of rows
# whose second column (the category) is identical in both files.
com2 = pd.read_csv('data/submission_final.csv').values[:, 1]
com3 = pd.read_csv('data/submission_new.csv').values[:, 1]
matches = com2 == com3
print(np.sum(matches) / len(com2))

0.9991649537512847


In [12]:
# Group node positions by their assigned category (0 through 4) and print the
# size of each group, in category order.
com = communities_gt.tolist()
com_dict = {label: [] for label in range(5)}
for position, label in enumerate(com):
    com_dict[label].append(position)
for members in com_dict.values():
    print(len(members))

10858
5285
7129
3927
3937
