In [1]:
import pandas as pd
import numpy as np
from louvain import Louvain

In [2]:
# Locations of the data files, relative to the notebook's working directory.
edges_path = 'data/edges.csv'  # edge list; loaded with pd.read_csv when building `edges`
ground_truth_path = 'data/ground_truth.csv'  # labelled nodes; loaded before calling convert_with_gt
submission_path = 'data/SubmissionSample.csv'  # NOTE(review): not referenced by any visible cell - confirm whether it is still needed

In [3]:
# Load the edge list and derive the node set from it.
# Each row of `edges` is one edge: column 0 is reported as the source node and
# column 1 as the target node.
edges = pd.read_csv(edges_path).values
# Sorted distinct ids over both columns; computed once and reused for the summary below.
nodes = np.unique(edges)
print('The total edges: {}'.format(edges.shape[0]))
print('The number of source nodes: {}'.format(len(np.unique(edges[:, 0]))))
print('The number of target nodes: {}'.format(len(np.unique(edges[:, 1]))))
print('The number of total nodes: {}'.format(len(nodes)))

The total edges: 256426
The number of sourse nodes: 31092
The number of target nodes: 30949
The number of total nodes: 31136


In [4]:
# run the louvain algorithm
# `Louvain` comes from the project-local `louvain` module; it is fitted on the raw
# edge array and the array of distinct node ids built from that edge array.
# NOTE(review): no random seed is set anywhere before this call - if the
# implementation is stochastic the partition will differ between runs; confirm
# and seed it if the class allows.
lv = Louvain()
lv.fit(edges, nodes)

<louvain.Louvain at 0x17961b06240>

In [5]:
# presumably one community label per entry of `nodes`, in the same order
# (later cells index this array by position) - confirm against Louvain.predict
communities = lv.predict(nodes)
# Note: with the Louvain algorithm the number of communities found is not guaranteed to be 5.
# The result may also fluctuate slightly between runs, so it may not exactly match the submission on the leaderboard.

In [6]:
# How many distinct communities the model produced.
print(np.unique(communities).size)

6


In [7]:
# Size of every detected community, in ascending label order.
# Counting via np.unique avoids assuming the labels are exactly 0..k-1 and
# scans the array once instead of once per label.
community_labels, community_sizes = np.unique(communities, return_counts=True)
for size in community_sizes:
    print(size)

3927
5286
10779
3927
7123
94


In [8]:
from collections import Counter
def convert_with_gt(communities, ground_truth, default_category=0, verbose=True):
    """Rename detected communities after the majority ground-truth category.

    Every distinct value in ``communities`` is treated as one detected
    community. For each community, the labelled nodes it contains are looked
    up in ``ground_truth`` and the most frequent category among them becomes
    the label of every node in that community. Several communities may map to
    the same category, in which case they are effectively merged.

    Parameters
    ----------
    communities : numpy.ndarray
        1-D array of community labels. A node is identified by its position
        in this array: position ``i`` is matched against the ids in
        ``ground_truth[:, 0]``.
    ground_truth : numpy.ndarray
        2-D array whose column 0 holds node ids and column 1 holds the
        category of that node. If an id appears more than once, its first
        row is used.
    default_category : int, optional
        Category given to communities that contain no labelled node
        (default ``0``).
    verbose : bool, optional
        When true (the default), print the community keys and, for each
        community, the labels found and the category chosen.

    Returns
    -------
    numpy.ndarray
        Array with the same length and dtype as ``communities`` holding the
        assigned category of every node.
    """
    communities = np.asarray(communities)
    ground_truth = np.asarray(ground_truth)

    # Build the id -> category lookup once so each node costs O(1) instead of
    # a scan over the whole ground-truth column. setdefault keeps the first
    # row for a duplicated id.
    node_to_category = {}
    for node, category in zip(ground_truth[:, 0].tolist(), ground_truth[:, 1].tolist()):
        node_to_category.setdefault(node, category)

    # Positions (= node ids) belonging to each detected community.
    community_members = {
        community: np.where(communities == community)[0]
        for community in np.unique(communities)
    }
    if verbose:
        print(community_members.keys())

    communities_gt = np.zeros((len(communities), ), dtype=communities.dtype)
    for members in community_members.values():
        gt = np.array(
            [node_to_category[node] for node in members.tolist() if node in node_to_category],
            dtype=np.int64,
        )
        if verbose:
            print(gt)
        if len(gt) == 0:
            # No labelled node landed in this community: fall back to the default.
            category = default_category
        else:
            # Majority vote; on a tie the category encountered first wins.
            category = Counter(gt.tolist()).most_common()[0][0]
        if verbose:
            print(category)
        # Every position belongs to exactly one community, so writing directly
        # is safe and works for any category value, not only 0-4.
        communities_gt[members] = category
    return communities_gt

In [9]:
# Read the labelled nodes as a plain array and use them to rename the detected communities.
ground_truth = pd.read_csv(ground_truth_path).values
communities_gt = convert_with_gt(communities, ground_truth)

dict_keys([0, 1, 2, 3, 4, 5])
[4 4 4 4 4 4 4 4 4 4]
4
[1 1 1 1 1 1 1 1 1 1]
1
[0 0 0 0 0 0 0 0 0 0]
0
[3 3 3 3 3 3 3 3 3 3]
3
[2 2 2 2 2 2 2 2 2 2]
2
[]
0


In [10]:
# Number of nodes assigned to each of the five categories after relabelling.
for category in range(5):
    print(np.count_nonzero(communities_gt == category))

10873
5286
7123
3927
3927


In [11]:
# Assemble the submission table (sorted node id, assigned category) and write it
# without the DataFrame index.
df = pd.DataFrame({'id': np.sort(nodes), 'category': communities_gt})
df.to_csv('data/submission_new.csv', index=False)

In [12]:
# Row-by-row agreement between the best earlier submission and the one just written.
# If the printed fraction is far from 1, rerun the Louvain cell and regenerate the file.
best_labels = pd.read_csv('data/submission_final.csv').values[:, 1]
new_labels = pd.read_csv('data/submission_new.csv').values[:, 1]
n_matching = np.sum(best_labels == new_labels)
print(n_matching / len(new_labels))

0.9981372045220966
