### LastFM graph
The graph can downloaded from https://github.com/benedekrozemberczki/datasets

In [1]:
import os
import networkx as nx
import pandas as pd
import numpy as np
import json
import copy
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
graphname = 'LastFM'
data_dir = os.path.expanduser("../Graphs/" + graphname)

edges_df = pd.read_csv(data_dir + '/lastfm_asia_edges.csv') 
labels_df = pd.read_csv(data_dir + '/lastfm_asia_target.csv')

In [29]:
edges_df.shape, labels_df.shape

((27806, 2), (7624, 2))

In [30]:
with open(data_dir + '/lastfm_asia_features.json') as f:
    features = json.load(f)

In [31]:
words = set()
for node, feats in features.items():
    # print(node, len(feats))
    words = words.union(feats)
print(len(features), len(words))

7624 7842


In [32]:
edges_df.head()

Unnamed: 0,node_1,node_2
0,0,747
1,1,4257
2,1,2194
3,1,580
4,1,6478


In [33]:
labels_df.head()

Unnamed: 0,id,target
0,0,8
1,1,17
2,2,3
3,3,17
4,4,5


In [34]:
nodedata = {}
for idx, row in labels_df.iterrows():
    nodedata[str(row['id'])] = ('label=' + str(row['target']), {})
    if len(nodedata) % 1000 == 0:
        print(len(nodedata))
    if len(features[str(row['id'])]) == 0:
        nodedata[str(row['id'])][1]['None'] = 1
    for word in features[str(row['id'])]:
        nodedata[str(row['id'])][1][word] = 1

1000
2000
3000
4000
5000
6000
7000


In [35]:
jsonpath = data_dir + "/data/nodedata.json" #os.path.join(data_dir, "/data/nodedata.json")
print(jsonpath)
with open(jsonpath, 'w') as outfile:
    json.dump(nodedata, outfile)

../Graphs/LastFM/data/nodedata.json


In [42]:
labels_df.rename(columns={'id': 'node', 'target': 'label'}).to_csv(data_dir + \
                                                        '/data/nodes_with_labels.csv', index=False)

In [36]:
nodes = set()
labels = set()
word_indices = set()
for node, features in nodedata.items():
    nodes.add(node)
    labels.add(features[0])
    for w in features[1]:
        word_indices.add(str(w))
        
nodes_path = data_dir + "/data/graph_nodes.txt"#os.path.join(data_dir, "graph_nodes.txt")
with open(nodes_path, 'w') as outfile:
    for node in nodes:
        outfile.write(str(node) + '\n')
        
labels_path = data_dir + "/data/labels.txt" #os.path.join(data_dir, "labels.txt")
with open(labels_path, 'w') as outfile:
    for label in labels:
        outfile.write(label + '\n')
        
words_path = data_dir + "/data/words_indices.txt" # os.path.join(data_dir, "words_indices.txt")
with open(words_path, 'w') as outfile:
    for wi in word_indices:
        outfile.write(wi + '\n')
        
edges = []
for idx, row in edges_df.iterrows():
    edges.append((str(row['node_1']).strip(), str(row['node_2']).strip()))

In [37]:
edges = []
for idx, row in edges_df.iterrows():
    edges.append((str(row['node_1']).strip(), str(row['node_2']).strip()))
    
G = nx.Graph()
for edge in edges:
    u = edge[0]
    v = edge[1]
    if u in nodedata and v in nodedata:
        G.add_edge(u, v)

In [38]:
G.number_of_nodes(), G.number_of_edges()

(7624, 27806)

In [40]:
edges_path = data_dir + "/data/all_graph_edges.txt" 
with open(edges_path, 'w') as outfile:
    for edge in G.edges():
        outfile.write(edge[0] + ':' + edge[1] + '\n')

In [39]:
largest_cc = max(nx.connected_components(G), key=len)
CC = G.subgraph(largest_cc).copy()
nx.algorithms.distance_measures.diameter(CC)

KeyboardInterrupt: 

In [25]:
removed_edges = set()
H = copy.deepcopy(G)
threshold = 0.2
while len(removed_edges) < threshold*G.number_of_edges():
    if len(removed_edges)%500 == 0:
        print(len(removed_edges), G.number_of_edges())
    i = np.random.randint(0, H.number_of_edges()-1) #get_rnd_int_in_range(stream, 0, H.number_of_edges()-1)
    edge = list(H.edges())[i]
    u = edge[0]
    v = edge[1]
    if H.degree[u] > 1 and H.degree[v] > 1:
        H.remove_edge(u, v)
        removed_edges.add((u, v))

0 27806


KeyboardInterrupt: 