In [2]:
import networkx as nx
import matplotlib.pyplot as plt

# Location of the input files, relative to the notebook's working directory.
data_path = './data/'

# Individual dataset files; each path is the data directory followed by the file name.
reduced_graph_path = f"{data_path}{'wiki-topcats-reduced.txt'}"
page_names_path = f"{data_path}{'wiki-topcats-page-names.txt'}"
categories_path = f"{data_path}{'wiki-topcats-categories.txt'}"

In [3]:
# Read the reduced wiki-topcats edge list (tab-separated node pairs) into a
# directed graph.
H = nx.read_edgelist(
    reduced_graph_path,
    create_using=nx.DiGraph(),
    delimiter="\t",
)

### RQ1

In [4]:
# RQ1: basic statistics of the graph H.
# Printed in this order: number of edges, number of nodes, whether the graph
# is directed, its density, and its mean node degree.
n_edges = H.number_of_edges()
n_nodes = H.number_of_nodes()

print(n_edges)
print(n_nodes)
print(H.is_directed())

# In mathematics, a dense graph is a graph in which the number of edges is
# close to the maximal number of edges.
print(nx.density(H))

# Mean degree value. H.degree() yields (node, degree) pairs in a single pass,
# so there is no need to query every node separately or to store the degrees.
total_degree = sum(degree for _, degree in H.degree())

# Guard against an empty graph, which would otherwise divide by zero.
print(total_degree / n_nodes if n_nodes else 0.0)

2645247
461193
True
1.2436602635647606e-05
11.471323285479182


### RQ2

We need to preprocess the `wiki-topcats-categories.txt` file, because we should only consider categories that have more than 3500 articles. We can do this with the well-known command-line tool `awk`:

```bash
awk 'NF > 3501' data/wiki-topcats-categories.txt > data/wiki-topcats-categories__3500.txt
```

- `NF` ('Number of Fields') is the number of whitespace-separated items in a row (more generally, its columns)
- The threshold is 3501 rather than 3500 because the first field of each row is the category label (e.g. `Category:Telugu_actors; 581455 581966 582010 582033 582071 ...`), so a row always has one more field than it has articles

Next, we count the lines of the resulting file (using another command-line tool, `wc`):

```bash
wc -l data/wiki-topcats-categories__3500.txt
```

and we notice that the set of categories has been narrowed down from **17364** to **35** items.

We can then list the names of the remaining categories with `cut`:

```bash
cut -f 1 -d ' ' data/wiki-topcats-categories__3500.txt
```

- `-f` selects the field position (the first column, in this case)
- `-d` sets the field delimiter (a space)

These are the categories that satisfy our threshold:

```
Category:English_footballers;
Category:The_Football_League_players;
Category:Association_football_forwards;
Category:Association_football_goalkeepers;
Category:Association_football_midfielders;
Category:Association_football_defenders;
Category:Living_people;
Category:Year_of_birth_unknown;
Category:Harvard_University_alumni;
Category:Major_League_Baseball_pitchers;
Category:Members_of_the_United_Kingdom_Parliament_for_English_constituencies;
Category:Indian_films;
Category:Year_of_death_missing;
Category:English_cricketers;
Category:Year_of_birth_missing_(living_people);
Category:Rivers_of_Romania;
Category:Main_Belt_asteroids;
Category:Asteroids_named_for_people;
Category:English-language_albums;
Category:English_television_actors;
Category:British_films;
Category:English-language_films;
Category:American_films;
Category:Fellows_of_the_Royal_Society;
Category:People_from_New_York_City;
Category:American_Jews;
Category:American_television_actors;
Category:American_film_actors;
Category:Debut_albums;
Category:Black-and-white_films;
Category:Year_of_birth_missing;
Category:Place_of_birth_missing_(living_people);
Category:Article_Feedback_Pilot;
Category:American_military_personnel_of_World_War_II;
Category:Windows_games;
```

In [24]:
# Build two lookup tables from the pre-filtered categories file, whose lines
# have the form "Category:<name>; <article> <article> ...":
#   categ2articles: category name -> set of the articles it contains
#   articles2categ: article       -> set of the category names it belongs to
# block_ranking starts out holding only the name of C0, the first category
# found in the file.
# NOTE(review): block_ranking was originally described as a list of lists of
# articles, but only the category *name* is stored here -- confirm what the
# later cells expect.
category_prefix = 'Category:'

block_ranking = []
categ2articles = {}
articles2categ = {}

with open(data_path + 'wiki-topcats-categories__3500.txt', 'r') as most_linked_categs:
    for line in most_linked_categs:
        # split() with no argument also discards the trailing newline
        fields = line.split()
        if not fields:
            # tolerate blank lines instead of failing on fields[0]
            continue

        categ_name, articles = fields[0], fields[1:]

        # Remove the exact "Category:" prefix. str.lstrip() must not be used
        # for this: it strips a *set of characters*, so it would also eat the
        # leading letters of the name itself (e.g. "Category:Cats" -> "s").
        if categ_name.startswith(category_prefix):
            categ_name = categ_name[len(category_prefix):]
        categ_name = categ_name.rstrip(';')

        # C0 (the first category in the file) has to be added into block_ranking
        if not block_ranking:
            block_ranking.append(categ_name)

        # categ2articles
        categ2articles.setdefault(categ_name, set()).update(articles)

        # articles2categ: an article may belong to several categories, so add
        # to its existing set rather than replacing it.
        for a in articles:
            articles2categ.setdefault(a, set()).add(categ_name)

In [38]:
# Summarise the category table: how many categories were loaded, then the name
# and number of articles of the first two entries (in insertion order).
category_entries = list(categ2articles.items())
print('Categories:', len(category_entries))

first_name, first_articles = category_entries[0]
print('Category in position 0:', first_name, len(first_articles))

second_name, second_articles = category_entries[1]
print('Category in position 1:', second_name, len(second_articles))

Categories: 35
Category in position 0: English_footballers 9237
Category in position 1: The_Football_League_players 9467


Now we need to compute shortest paths in order to find the categories nearest to and furthest from C0.