## DeezerEurope and GitWebML graphs
The graphs can be downloaded from https://github.com/benedekrozemberczki/datasets

In [1]:
import os
import networkx as nx
import pandas as pd
import numpy as np
import json
import copy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer

In [2]:
# Select which graph to process: `idx` indexes the two parallel lists below.
# `graphnames` holds the local directory name of each graph, `names` the
# file-name prefix of its CSV/JSON files.
#
# The lists used to contain only two entries, so idx=2 raised an IndexError.
# The outputs recorded in this notebook point at ../Graphs/LastFM/, so the
# third dataset is restored here.
# NOTE(review): the 'lastfm_asia' prefix is assumed from the upstream dataset
# naming -- confirm against the downloaded files.
idx = 2
graphnames = ['DeezerEurope', 'GitWebML', 'LastFM']
names = ['deezer_europe', 'musae_git', 'lastfm_asia']
data_dir = os.path.expanduser("../Graphs/" + graphnames[idx] + '/')

# Edge list and per-node target labels of the selected graph.
edges_df = pd.read_csv(data_dir + names[idx] + '_edges.csv')
labels_df = pd.read_csv(data_dir + names[idx] + '_target.csv')

In [3]:
# Sanity check: (rows, columns) of the edge list and of the label table.
edges_df.shape, labels_df.shape

((27806, 2), (7624, 2))

In [4]:
# Read the JSON file that maps each node id to its list of feature ids.
features_path = data_dir + names[idx] + '_features.json'
with open(features_path) as features_file:
    features = json.loads(features_file.read())

In [5]:
# Show the first ten feature ids of the first node in the features mapping.
key = list(features)[0]
first_ten = features[key][:10]
print(first_ten)

[2964, 3900, 3902, 2402, 6185, 509, 7627, 3389, 2407, 5]


In [7]:
# Turn every node's feature-id list into a whitespace-separated
# "document" so that a text vectorizer can be applied to it.
#   words  -- set of all distinct feature ids (as strings)
#   nodes  -- node ids, in the same order as `corpus`
#   corpus -- one string per node, each feature id prefixed with 'x'
words = set()
nodes = []
corpus = []
for node, feats in features.items():
    # In-place update: `words = words.union(...)` rebuilt the whole set on
    # every iteration.
    words.update(str(f) for f in feats)
    nodes.append(node)
    corpus.append(' '.join('x' + str(f) for f in feats))
print(len(features), len(words))

7624 7842


In [8]:
# First 100 characters of the first node's pseudo-document.
corpus[0][:100]

'x2964 x3900 x3902 x2402 x6185 x509 x7627 x3389 x2407 x5 x4403 x3633 x5875 x3395 x3531 x6908 x3202 x5'

In [10]:
# Fit a TF-IDF model on the per-node pseudo-documents.
#   X             -- sparse TF-IDF matrix, one row per entry of `corpus`
#   feature_names -- list mapping a column index of X to its token ('x<id>')
# The explicit call to the private `_validate_vocabulary()` was dropped:
# fitting performs that validation itself.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on installations that predate the replacement.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [11]:
# Dimensions of the TF-IDF matrix X.
X.shape

(7624, 7842)

In [19]:
# Per-node TF-IDF weights: node id -> {feature token -> weight ** 3}.
tf_idf_weights = {}
for i in range(X.shape[0]):
    if i % 3000 == 0:
        print(i)  # coarse progress indicator
    # Convert the row once and walk its stored entries instead of doing a
    # separate sparse lookup X[i, j] for every non-zero column.
    row = X[i, :].tocoo()
    # NOTE(review): every weight is cubed; the notebook does not say why.
    # The comprehension keeps its loop variables local, so the dataset
    # selector `idx` is no longer overwritten by this cell.
    weights = {
        feature_names[col]: value ** 3
        for col, value in zip(row.col, row.data)
        if value != 0  # skip explicitly stored zeros, as nonzero() did
    }
    tf_idf_weights[nodes[i]] = weights

0
3000
6000


In [21]:
# For node '0': number of raw feature ids vs. number of TF-IDF entries.
len(features['0']), len(tf_idf_weights['0'])

(477, 477)

In [22]:
# Print the stored weight of every feature of node '0', in feature order.
node0_weights = tf_idf_weights['0']
for feature_id in features['0']:
    token = 'x' + str(feature_id)
    print(node0_weights[token])

2.6877655494228826e-05
5.6337862836952446e-05
0.0002787554627035134
4.93130422150688e-05
0.00018635394104119367
2.6543482272715056e-05
5.3979846666290794e-05
1.631119198313781e-05
0.0002301783661707322
2.9355424660347776e-05
0.0003633213732633131
0.00013665045577163852
1.8588470343612272e-05
0.00017927526100905926
0.00013481134880874673
9.937833338288384e-05
6.761563355462769e-05
4.695617051320833e-05
0.00010648999847253511
1.9522450425865108e-05
6.278745138667063e-05
0.00013998842408831724
6.977805805551592e-05
4.3833587632405475e-05
0.0002990093681733717
0.00024468489253548904
5.2815392488636584e-05
2.9120358968616383e-05
4.8264456936222696e-05
8.912472897854597e-05
8.078049091400737e-05
3.080493108035334e-05
8.477353260354346e-05
0.00016360328077503214
3.30177383686876e-05
9.87398602478292e-05
2.6974217235271637e-05
5.118388008403284e-05
9.51778138748016e-05
0.0003091526180498843
0.00019125131196432297
0.00012811328971647066
0.0002736696341932341
1.2217931480300895e-05
0.00017384736

In [23]:
# Preview the first rows of the edge list.
edges_df.head()

Unnamed: 0,node_1,node_2
0,0,747
1,1,4257
2,1,2194
3,1,580
4,1,6478


In [24]:
# Preview the first three rows of the label table.
labels_df.head(3)

Unnamed: 0,id,target
0,0,8
1,1,17
2,2,3


In [25]:
# Build two node-attribute mappings keyed by node id (as a string):
#   nodedata       -- ('label=<target>', {feature id: 1})
#   nodedata_tfidf -- ('label=<target>', {feature id: cubed TF-IDF weight})
# A node without any feature gets the single placeholder entry {'None': 1}.
nodedata = {}
nodedata_tfidf = {}
# The row index is unused; `_` keeps the loop from overwriting the dataset
# selector `idx`.
for _, row in labels_df.iterrows():
    if len(nodedata) % 3000 == 0:
        print(len(nodedata))  # coarse progress indicator
    node_id = str(row['id'])
    label = 'label=' + str(row['target'])
    node_features = features[node_id]
    if node_features:
        node_weights = tf_idf_weights[node_id]
        # Feature ids are stored as strings in BOTH mappings. The binary
        # mapping used raw ints before; json.dump would have stringified
        # those keys anyway, so the written files are unchanged.
        binary = {str(word): 1 for word in node_features}
        weighted = {str(word): node_weights['x' + str(word)]
                    for word in node_features}
    else:
        binary = {'None': 1}
        weighted = {'None': 1}
    nodedata[node_id] = (label, binary)
    nodedata_tfidf[node_id] = (label, weighted)

0
3000
6000


In [26]:
# Serialize the binary node-attribute mapping to <data_dir>/data/nodedata.json.
jsonpath = data_dir + "data/nodedata.json"
print(jsonpath)
# This is the first write into the `data/` sub-directory; create it if it is
# missing so that open() does not fail on a fresh download.
os.makedirs(os.path.dirname(jsonpath), exist_ok=True)
with open(jsonpath, 'w') as outfile:
    json.dump(nodedata, outfile)

../Graphs/LastFM/data/nodedata.json


In [27]:
# Serialize the TF-IDF-weighted node-attribute mapping next to the binary one.
jsonpath = data_dir + "data/nodedata_tfidf.json"
print(jsonpath)
serialized = json.dumps(nodedata_tfidf)
with open(jsonpath, 'w') as outfile:
    outfile.write(serialized)

../Graphs/LastFM/data/nodedata_tfidf.json


In [28]:
# Export the label table with its columns renamed to node/label.
nodes_with_labels_path = data_dir + '/data/nodes_with_labels.csv'
renamed_labels = labels_df.rename(columns={'id': 'node', 'target': 'label'})
renamed_labels.to_csv(nodes_with_labels_path, index=False)

In [29]:
# Collect the distinct node ids, label strings and feature ids found in
# `nodedata` and write each collection to its own text file, one per line.
# NOTE: this rebinds `nodes` (a list in the TF-IDF cells) to a set.
nodes = set()
labels = set()
word_indices = set()
# The loop target used to be named `features`, which overwrote the features
# dict loaded from JSON and broke any re-run of the earlier cells.
for node, (node_label, node_words) in nodedata.items():
    nodes.add(node)
    labels.add(node_label)
    word_indices.update(str(w) for w in node_words)

# Sets have no stable iteration order; sort so that the files are identical
# from one run to the next.
nodes_path = data_dir + "/data/graph_nodes.txt"
with open(nodes_path, 'w') as outfile:
    outfile.writelines(str(node) + '\n' for node in sorted(nodes))

labels_path = data_dir + "/data/labels.txt"
with open(labels_path, 'w') as outfile:
    outfile.writelines(label + '\n' for label in sorted(labels))

words_path = data_dir + "/data/words_indices.txt"
with open(words_path, 'w') as outfile:
    outfile.writelines(wi + '\n' for wi in sorted(word_indices))

# Edge list as (source, target) string pairs, taken from the first two
# columns of the edge table. `_` is used for the row index so the dataset
# selector `idx` is left untouched.
col1, col2 = edges_df.columns[0], edges_df.columns[1]
edges = [
    (str(row[col1]).strip(), str(row[col2]).strip())
    for _, row in edges_df.iterrows()
]

In [30]:
# Rebuild the (source, target) string pairs from the edge table.
edges = [
    (str(record[col1]).strip(), str(record[col2]).strip())
    for _, record in edges_df.iterrows()
]

# Undirected graph restricted to edges whose two endpoints both appear in
# `nodedata`.
G = nx.Graph()
for u, v in edges:
    if u not in nodedata or v not in nodedata:
        continue
    G.add_edge(u, v)

In [31]:
# Node and edge counts of the graph G.
G.number_of_nodes(), G.number_of_edges()

(7624, 27806)

In [32]:
# Write every edge of G as "<u>:<v>", one edge per line.
edges_path = data_dir + "/data/all_graph_edges.txt"
with open(edges_path, 'w') as outfile:
    for u, v in G.edges():
        line = ':'.join((u, v)) + '\n'
        outfile.write(line)