In [1]:
import pandas as pd
import numpy as np

# Loading the nodes file

In [2]:
path_to_nodes_file = "/kaggle/input/wikipedia-graph-dataset/wiki-topcats-page-names.txt"
with open(path_to_nodes_file, 'r') as f:
    lines = f.readlines()

In [3]:
ids = []
names = []
for line in lines:
    line = line.strip(" ").strip("\n")
    _id, name = line.split(" ", 1)
    ids.append(_id)
    names.append(name)

In [4]:
df_names = pd.DataFrame({
    'id':ids,
    'names':names
})

In [5]:
df_names.head()

Unnamed: 0,id,names
0,0,Chiasmal syndrome
1,1,Kleroterion
2,2,Pinakion
3,3,LyndonHochschildSerre spectral sequence
4,4,Zariski's main theorem


In [6]:
#check if id column does not have any characters instead of numbers
df_names['id'] = df_names['id'].astype(np.int64)

In [7]:
#is the same id repeated more than once ?
max(df_names['id'].value_counts())

1

# Loading the edges file

In [8]:
path_to_edges_file = "/kaggle/input/wikipedia-graph-dataset/wiki-topcats.txt"
with open(path_to_edges_file, 'r') as f2:
    lines = f2.readlines()

In [9]:
lines[0]

'0 10772\n'

In [10]:
source = []
target = []
for line in lines:
    line = line.strip(" ").strip("\n")
    _s, _t = line.split(" ")
    source.append(_s)
    target.append(_t)

In [11]:
df_edges = pd.DataFrame({
    'Source':source,
    'Target':target
})

In [12]:
#check if these two columns contain any characters or not
df_edges['Source'] = df_edges['Source'].astype(np.int64)
df_edges['Target'] = df_edges['Target'].astype(np.int64)

In [13]:
df_edges['Type'] = 'Directed'
df_edges['Weight'] = 1.0

In [14]:
df_edges.head()

Unnamed: 0,Source,Target,Type,Weight
0,0,10772,Directed,1.0
1,1,2,Directed,1.0
2,1,170193,Directed,1.0
3,1,598775,Directed,1.0
4,2,1,Directed,1.0


In [15]:
#the choice of removing self referencing edges depends on the problem statement, im choosing to delete them.
#in the context of pagerank, a page adding importance to itself is not meaningful.
df_edges = df_edges[~(df_edges['Source'] == df_edges['Target'])].reset_index(drop = True)

In [16]:
#uncomment the lines below to get the full graph

#df_names.to_csv('./wiki_articles_names.csv', index = False)
#df_edges.to_csv('./wiki_articles_edges.csv', index = False)

# Creating Samples

In [17]:
in_degree = df_edges['Target'].value_counts()
out_degree = df_edges['Source'].value_counts()
degree = in_degree + out_degree

In [38]:
fraction = 0.005
seed = 204
sample_nodes_df = df_names.sample(int(len(df_names)*(fraction)), weights = degree, replace = False, random_state = seed).reset_index(drop = True)

In [39]:
sample_nodes = sample_nodes_df['id'].to_list()

In [40]:
sample_nodes_df = df_names[df_names['id'].isin(sample_nodes)].reset_index(drop = True)
sample_edges_df = df_edges[(df_edges['Source'].isin(sample_nodes)) & (df_edges['Target'].isin(sample_nodes))].reset_index(drop = True)

In [41]:
print(sample_nodes_df.shape)
print(sample_edges_df.shape)

(8957, 2)
(68740, 4)


In [42]:
sample_nodes_df.to_csv(f"./sample_nodes_{fraction}_{seed}.csv", index = False)
sample_edges_df.to_csv(f"./sample_edges_{fraction}_{seed}.csv", index = False)