In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline

You can get the dataset from https://densitydesign.github.io/strumentalia-seealsology/

__Steps to download:__

a) Enter the following links:

https://en.wikipedia.org/wiki/Space_research

https://en.wikipedia.org/wiki/Space_Race

https://en.wikipedia.org/wiki/Space_exploration

b) Download the TSV file.

In [2]:
df = pd.read_csv('F:/data/course-2/edges.csv')

In [3]:
import random
import string
 
def generate_random_str(randomlength=16):
  """
  生成一个指定长度的随机字符串，其中
  string.digits=0123456789
  string.ascii_letters=abcdefghigklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
  """
  str_list = [random.choice(string.digits + string.ascii_letters) for i in range(randomlength)]
  random_str = ''.join(str_list)
  return random_str

In [4]:
key_map = dict([])
for i in range(31136):
    key_map[i] = generate_random_str(15)

In [5]:
value = np.array(list(key_map.values()))
len(np.unique(value))

31136

In [6]:
df.head()

Unnamed: 0,source,target
0,0,2
1,0,1
2,1,0
3,7,8
4,8,7


In [7]:
df['source'] = df['source'].astype('object')
df['target'] = df['target'].astype('object')

In [8]:
print(df.dtypes)

source    object
target    object
dtype: object


In [9]:
df.shape
edges = df.values
for i in range(len(edges)):
    edges[i, 0] = key_map[edges[i, 0]]
    edges[i, 1] = key_map[edges[i, 1]]
print(edges.dtype)
new_df = pd.DataFrame(columns=['source', 'target'])
new_df['source'] = edges[:, 0]
new_df['target'] = edges[:, 1]
new_df.head()

object


Unnamed: 0,source,target
0,W87O1XyaLZC0b8Q,CBE0Py0IGMGI5uu
1,W87O1XyaLZC0b8Q,vclniGH7o0Hhf5p
2,vclniGH7o0Hhf5p,W87O1XyaLZC0b8Q
3,DRJmfVcgO24PIkm,T8bDqqVJbwTYf2Q
4,T8bDqqVJbwTYf2Q,DRJmfVcgO24PIkm


In [10]:
print(new_df.dtypes)

source    object
target    object
dtype: object


In [11]:
def _create_graph(edges, nodes):
        """
            Create graph from the edges matrix and nodes vectors.

            Args:
                edges(np.ndarray): shape of (n_edges, 2) or shape of (n_edges, 3).
                nodes(np.ndarray): shape of (n_nodes, ).
            Returns:
                graph(networkx.Graph): a networkx Graph instance.
            """
        with_weight = edges.shape[1] == 3  # the 3rd dimension is edge weight.
        graph = nx.Graph()
        graph.add_nodes_from(nodes.tolist())
        for edge_i in edges:
            node1 = edge_i[0]
            node2 = edge_i[1]
            if with_weight:
                weight = edge_i[2]
                graph.add_edge(node1, node2, weight)
            else:
                graph.add_edge(node1, node2)
        return graph

In [12]:
nodes = np.unique(edges)
print(len(nodes))
G = _create_graph(edges, nodes)

31136


In [13]:
from node2vec import Node2Vec

In [27]:
node2vec = Node2Vec(G, dimensions=100, walk_length=20, num_walks=10, workers=4)

Computing transition probabilities: 100%|███████████████████████████████████████| 31136/31136 [01:05<00:00, 474.35it/s]


In [28]:
model = node2vec.fit(window=10, min_count=1, batch_words=4)

In [12]:
# construct an undirected graph
G=nx.from_pandas_edgelist(new_df, "source", "target", edge_attr=True, create_using=nx.Graph())

In [14]:
len(G) # number of nodes

31136

In [15]:
# function to generate random walk sequences of nodes
def get_randomwalk(node, path_length):
    
    random_walk = [node]
    
    for i in range(path_length-1):
        temp = list(G.neighbors(node))
        temp = list(set(temp) - set(random_walk))    
        if len(temp) == 0:
            break

        random_node = random.choice(temp)
        random_walk.append(random_node)
        node = random_node
        
    return random_walk

In [28]:
get_randomwalk('kOZWLuxXQsOqCSY', 10)

['kOZWLuxXQsOqCSY',
 'JdFiRhSs4Yl5tvc',
 'KFf8n77getzlIMo',
 'fu1t5i4wDijMUmv',
 'FKuQ5RyM8N5VHIR',
 'fm4wtP2zJpgd7IH',
 'ircAuiGTNZRMsAo',
 '9PhElRJYuh7ni1m',
 'ytQz1XNpRe4mznj',
 'gCLlkRazpGjNfUR']

In [16]:
all_nodes = list(G.nodes())

random_walks = []

for n in tqdm(all_nodes):
    for i in range(5):
        random_walks.append(get_randomwalk(n,10))

100%|██████████████████████████████████████████████████████████████████████████| 31136/31136 [00:09<00:00, 3222.79it/s]


In [17]:
# count of sequences
len(random_walks)

155680

In [18]:
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings('ignore')

In [19]:
# train word2vec model
model = Word2Vec(window = 4, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(random_walks, progress_per=2)

In [20]:
model.train(random_walks, total_examples = model.corpus_count, epochs=20, report_delay=1)

(31135180, 31135180)

In [29]:
print(model)

Word2Vec(vocab=31136, size=100, alpha=0.025)


In [227]:
# find top n similar nodes
model.similar_by_word('astronaut training')

[('reduced-gravity aircraft', 0.9756266474723816),
 ('micro-g environment', 0.9612352252006531),
 ('spaceflight osteopenia', 0.8710659742355347),
 ('microgravity university', 0.8698078393936157),
 ('space flight participant', 0.8578461408615112),
 ('space adaptation syndrome', 0.8436012268066406),
 ('space tourism society', 0.8100888729095459),
 ('lagrange point colonization', 0.7876768112182617),
 ('stanford torus', 0.7843056321144104),
 ('lists of space programs', 0.7734896540641785)]

In [228]:
terms = ['lunar escape systems','soviet moonshot', 'soyuz 7k-l1', 'moon landing',
         'space food', 'food systems on space exploration missions', 'meal, ready-to-eat',
         'space law', 'metalaw', 'moon treaty', 'legal aspects of computing',
         'astronaut training', 'reduced-gravity aircraft', 'space adaptation syndrome', 'micro-g environment']

In [18]:
terms = list(key_map.values())

In [36]:
def plot_nodes(word_list):
    X = model[word_list]
    
    # reduce dimensions to 2
    pca = PCA(n_components=2)
    result = pca.fit_transform(X)
    print(result)
    
    
    plt.figure(figsize=(12,9))
    # create a scatter plot of the projection
    plt.scatter(result[:, 0], result[:, 1])    
    plt.show()

In [30]:
X = model[terms]
print(X)

  """Entry point for launching an IPython kernel.


[[-0.51724154  0.15543737 -0.40485814 ... -0.1350585  -0.79905295
  -0.44472983]
 [-0.24738924 -0.01353899 -0.35022303 ... -0.34508088 -0.96476346
  -0.11683018]
 [ 0.10339013  0.2314968  -0.5118003  ...  0.2793765   0.093804
  -0.00935603]
 ...
 [ 0.5890967  -0.07106898 -0.01868995 ...  0.03065697  0.1506573
   0.77508956]
 [ 0.6838051  -0.05078315  0.19423343 ...  0.07558585  0.03166173
   0.6298917 ]
 [ 0.5898434  -0.19957133 -0.01377609 ...  0.04623912  0.36336508
   0.85478103]]


In [31]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5)
kmeans.fit(X)
print(kmeans.labels_)

[4 4 1 ... 3 3 3]


In [21]:
from collections import Counter
def convert_with_gt(communities, ground_truth):
    gt_col1 = ground_truth[:, 0].tolist()
    n_communities = np.unique(communities)
    communities_dict = dict([])
    for i in range(len(n_communities)):
        community = n_communities[i]
        nodes = np.where(communities==community)[0].tolist()
        communities_dict[community] = nodes
    res = {0: [], 1: [], 2: [], 3: [], 4: []}
    print(communities_dict.keys())
    for c in communities_dict.keys():
        nodes = communities_dict[c]
        gt = []
        for i in range(len(nodes)):
            node = nodes[i]
            if node in gt_col1:
                idx = np.where(node == ground_truth[:, 0])
                category = ground_truth[idx, 1][0][0]
                gt.append(category)
        gt = np.array(gt, dtype=np.int64)
        print(gt)
        if len(gt) == 0:
            category = 2
        else:
            category = Counter(gt).most_common()[0][0]
        print(category)
        res[category] = res[category] + nodes
    communities_gt = np.zeros((len(communities), ), dtype=communities.dtype)
    for c in res.keys():
        nodes = res[c]
        for node in nodes:
            communities_gt[node] = c  # get community
    return communities_gt

In [23]:
communities = kmeans.labels_

In [32]:
ground_truth_path = 'data/ground_truth.csv'
ground_truth = pd.read_csv(ground_truth_path)
ground_truth = ground_truth.values
communities_gt = convert_with_gt(communities, ground_truth)

dict_keys([0, 1, 2, 3, 4])
[2 2 2 2 2 2 2 2 2 2]
2
[1 1 1 1 1 1 1 1 1 1]
1
[0 0 0 0 0 0 0 0 0 0]
0
[4 4 4 4 4 4 4 4 4 4]
4
[3 3 3 3 3 3 3 3 3 3]
3


In [33]:
df = pd.DataFrame(columns=['id', 'category'])
nodes = np.array(list(range(len(nodes))))
df['id'] = np.sort(nodes)
df['category'] = communities_gt
df.to_csv('data/submission6.csv', index=None)

In [34]:
com2 = pd.read_csv('data/submission4.csv')
com3 = pd.read_csv('data/submission6.csv')
com2 = com2.values[:, 1]
com3 = com3.values[:, 1]
print(np.sum(com2 == com3) / len(com2))

0.9981693216855088
