In [20]:
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import community
import pickle 
import pandas as pd

## Todos

TODO: Print who the members of each community are in the various algorithms

TODO: LSA / Semantic analysis

TODO: Add diagnostics showing how long some tasks take to run

TODO: Recreate a graph, maybe in GEPHI, using whatever communinty structure we end up with



## Load Graph
Read in the edge list, which contains all relevant information: source author -> target author, number of times cited

In [21]:
# Use tiny_edge_list.csv and small_edge_list.csv
# G = nx.read_weighted_edgelist("./tiny_edge_list.csv", delimiter=",")
G_directed = nx.read_weighted_edgelist("./small_edge_list.csv", delimiter=",",create_using= nx.DiGraph())
G = nx.read_weighted_edgelist("./small_edge_list.csv", delimiter=",")

In [18]:
G_large = nx.read_weighted_edgelist("./complete_edge_list.csv", delimiter=",",create_using= nx.DiGraph())

In [320]:
# To convert files to graphml 
# nx.write_graphml(G_directed, "small_edge_list.graphml")

In [19]:
# print(nx.info(G_directed))
print(nx.info(G_large))

Name: 
Type: DiGraph
Number of nodes: 9925
Number of edges: 55279
Average in degree:   5.5697
Average out degree:   5.5697


## Various node orderings

In [22]:
# In degree
degreeTable = pd.DataFrame(G_directed.in_degree, columns = ["Name","In Degree"])
sd = degreeTable.sort_values(by=['In Degree'], ascending=False)
sd.head(10)

Unnamed: 0,Name,In Degree
59,HUSSERL E,1108
224,HEIDEGGER M,795
1186,MERLEAUPONTY M,656
888,KANT I,363
2011,SARTRE J,298
2072,DERRIDA J,294
221,ZAHAVI D,285
2207,LEVINAS E,272
706,RICOEUR P,262
2413,HEGEL G,260


In [323]:
degreeTable = pd.DataFrame(G_directed.out_degree, columns = ["Name","Out Degree"])
sd = degreeTable.sort_values(by=['Out Degree'], ascending=False)
sd.head(10)

Unnamed: 0,Name,Out Degree
23,MORAN D,228
221,ZAHAVI D,187
1265,FUCHS T,167
423,GALLAGHER S,149
417,FROESE T,139
162,HEINAMAA S,129
1016,SCHWITZGEBEL E,116
7,DEPRAZ N,111
118,WALSH P,110
1088,KRIEGEL U,110


In [324]:
# Not sure if results differ between G and G_directed...
h,a=nx.hits(G)

In [325]:
authTable =  pd.DataFrame.from_dict(a,orient='index', columns=["Authority"])
sd = authTable.sort_values(by=['Authority'], ascending=False)
sd.head(20)

Unnamed: 0,Authority
HUSSERL E,0.02071
ZAHAVI D,0.012081
HEIDEGGER M,0.008453
MERLEAUPONTY M,0.007427
DEPRAZ N,0.006937
MORAN D,0.006112
CROWELL S,0.00529
HEINAMAA S,0.004736
GALLAGHER S,0.00464
WALDENFELS B,0.003687


In [24]:
hubTable =  pd.DataFrame.from_dict(h,orient='index', columns=["Hubitude"])
sd = hubTable.sort_values(by=['Hubitude'], ascending=False)
sd.head(20)

Unnamed: 0,Hubitude
FINK E,0.081331
BRUZINA R,0.077549
CAIRNS D,0.024191
DEPRAZ N,0.021869
CHERNAVIN G,0.019397
HIMANKA J,0.018945
DASTUR F,0.018945
SMYTH B,0.018323
BERTOLINI S,0.017619
GIUBILATO G,0.017412


In [23]:
centrality = nx.eigenvector_centrality(G_directed)

In [24]:
ec_table =  pd.DataFrame.from_dict(centrality,orient='index', columns=["Centrality"])
ect = ec_table.sort_values(by=['Centrality'], ascending=False)
ect.head(20)

Unnamed: 0,Centrality
HUSSERL E,0.4052
HEIDEGGER M,0.267859
MERLEAUPONTY M,0.241819
ZAHAVI D,0.157583
KANT I,0.143643
SARTRE J,0.13289
FINK E,0.125441
GALLAGHER S,0.116983
DREYFUS H,0.113294
LEVINAS E,0.11187


## Community Detection

See https://networkx.github.io/documentation/stable/reference/algorithms/community.html

### Helper Methods

In [25]:
# Print out the number of members of each communinty and sub-community.  
# Note that results can a generator, in which case after running this once the generator is "unwound" and must be recreated
def conmunity_stats(results, level = 2):
    for l1 in results:
        print("Level 1:", len(l1), "members")
        print_community(l1)
        if level > 1:
            for l2 in l1:
                print("Level 2:", len(l2),"members")
                print_community(l2, hasmembers=False) # TODO: Not working

In [26]:
def print_community(community, hasmembers=True):
    print(len(community))
    if(hasmembers):
        for comm in community:
            print(comm)
    else:
        print(community)

In [27]:
def get_first_level(generator):
    results = []
    for res in generator:
        results.append(res)
    return results

### Girvan Newman

Takes a long time...

In [246]:
cg_gn = community.girvan_newman(G)

In [179]:
top_level = next(cg_gn)

In [180]:
next_level = next(cg_gn)

In [257]:
res = get_first_level(cg_gn)
for cluster in res:
    print("Cluster", len(cluster))
    dt = G_directed.subgraph([i for i in cluster[1]])
    dt = pd.DataFrame(dt.in_degree, columns = ["Name","In Degree"])
    st = dt.sort_values(by=['In Degree'], ascending=False)
    print(st.head(3))
    print()

Cluster 2
           Name  In Degree
1   DAHLSTROM D         37
0   OVERGAARD S          0
28     ADRIAN J          0

Cluster 3
           Name  In Degree
1   DAHLSTROM D         37
0   OVERGAARD S          0
28     ADRIAN J          0

Cluster 4
         Name  In Degree
25   CAIRNS D         56
31  BRUZINA R         24
0    BARBER M          0

Cluster 5
         Name  In Degree
10  BRUZINA R         24
0    PLOTKA W          0
13    SMYTH B          0

Cluster 6
         Name  In Degree
10  BRUZINA R         24
0    PLOTKA W          0
13    SMYTH B          0

Cluster 7
         Name  In Degree
10  BRUZINA R         24
0    PLOTKA W          0
13    SMYTH B          0

Cluster 8
         Name  In Degree
10  BRUZINA R         24
0    PLOTKA W          0
13    SMYTH B          0

Cluster 9
         Name  In Degree
10  BRUZINA R         24
0    PLOTKA W          0
13    SMYTH B          0

Cluster 10
         Name  In Degree
10  BRUZINA R         24
0    PLOTKA W          0
13    SMYT

          Name  In Degree
4    BRUZINA R         18
0   SHESTOVA E          0
10   ROGGERO J          0

Cluster 106
          Name  In Degree
4    BRUZINA R         18
0   SHESTOVA E          0
10   ROGGERO J          0

Cluster 107
          Name  In Degree
4    BRUZINA R         18
0   SHESTOVA E          0
10   ROGGERO J          0

Cluster 108
         Name  In Degree
4   BRUZINA R         17
0  SHESTOVA E          0
9   ROGGERO J          0

Cluster 109
         Name  In Degree
4   BRUZINA R         17
0  SHESTOVA E          0
9   ROGGERO J          0

Cluster 110
         Name  In Degree
4   BRUZINA R         17
0  SHESTOVA E          0
9   ROGGERO J          0

Cluster 111
         Name  In Degree
4   BRUZINA R         17
0  SHESTOVA E          0
9   ROGGERO J          0

Cluster 112
         Name  In Degree
4   BRUZINA R         16
0  SHESTOVA E          0
1    PLOTKA W          0

Cluster 113
         Name  In Degree
4   BRUZINA R         16
0  SHESTOVA E          0
1    PLOT

In [None]:
# sorted(map(sorted, next_level))

In [None]:
# Save communites
# pickle.dump(next_level, open("gn_communities.obj", "wb"))

In [None]:
# Testing re-open
# cg_gn = pickle.load(open("gn_communities.obj", "rb"))

In [None]:
sample = conmunity_stats(cg_gn,1)

### K Clique Communities

In [28]:
cg_kc = community.k_clique_communities(G,2)

In [29]:
conmunity_stats(cg_kc,1)

KeyboardInterrupt: 

In [317]:
res = get_first_level(cg_kc)

KeyboardInterrupt: 

In [None]:
for cluster in res:
    print("Cluster with", len(cluster), "members")
    dt = G_directed.subgraph([i for i in list(cluster)])
    dt = pd.DataFrame(dt.in_degree, columns = ["Name","In Degree"])
    st = dt.sort_values(by=['In Degree'], ascending=False)
    print(st.head(3))
    print()

### Label propagation

In [293]:
cg_label = community.label_propagation_communities(G)

In [294]:
# conmunity_stats(cg_label,1)

In [295]:
res = get_first_level(cg_label)

In [296]:
for cluster in res:
    print("Cluster with", len(cluster), "members")
    dt = G_directed.subgraph([i for i in list(cluster)])
    dt = pd.DataFrame(dt.in_degree, columns = ["Name","In Degree"])
    st = dt.sort_values(by=['In Degree'], ascending=False)
    print(st.head(3))
    print()

Cluster with 36 members
           Name  In Degree
1   DAHLSTROM D         35
0   OVERGAARD S          0
20         DE L          0

Cluster with 101 members
         Name  In Degree
44     FINK E         90
1   BRUZINA R         27
63  COPILAS E          0

Cluster with 4 members
           Name  In Degree
3  CAPOBIANCO R          3
0       FIELD J          0
1     KENNEDY T          0

Cluster with 38 members
            Name  In Degree
11      CAIRNS D         37
0       PLOTKA W          0
28  SOKOLOWSKI R          0



### Bipartition

In [297]:
cg_kg = community.kernighan_lin_bisection(G)

In [298]:
# conmunity_stats(cg_kg,1)

In [299]:
res = get_first_level(cg_kg)
for cluster in res:
    print("Cluster with", len(cluster), "members")
    dt = G_directed.subgraph([i for i in list(cluster)])
    dt = pd.DataFrame(dt.in_degree, columns = ["Name","In Degree"])
    st = dt.sort_values(by=['In Degree'], ascending=False)
    print(st.head(3))
    print()

Cluster with 89 members
            Name  In Degree
0    DAHLSTROM D         38
37  CAPOBIANCO R          5
56           O L          0

Cluster with 90 members
         Name  In Degree
58     FINK E         60
30   CAIRNS D         40
1   BRUZINA R         29



### Greedy

In [300]:
cg_greedy = community.greedy_modularity_communities(G)

In [301]:
# conmunity_stats(cg_greedy,1)

In [302]:
res = get_first_level(cg_greedy)
for cluster in res:
    print("Cluster with", len(cluster), "members")
    dt = G_directed.subgraph([i for i in list(cluster)])
    dt = pd.DataFrame(dt.in_degree, columns = ["Name","In Degree"])
    st = dt.sort_values(by=['In Degree'], ascending=False)
    print(st.head(3))
    print()

Cluster with 58 members
         Name  In Degree
54     FINK E         57
0   JOHNSON F          0
43  LIEBSCH B          0

Cluster with 43 members
          Name  In Degree
15    CAIRNS D         42
0     BARBER M          0
22  TRAKAKIS N          0

Cluster with 39 members
           Name  In Degree
1   DAHLSTROM D         38
20    MCMANUS D          0
21     TANZER M          0

Cluster with 32 members
         Name  In Degree
16  BRUZINA R         32
1     HENRY M          0
30    ONATE A          0

Cluster with 7 members
           Name  In Degree
3  CAPOBIANCO R          6
0       LOZAR J          0
1       HOMAN C          0



### Fluid

In [303]:
cg_fluid = community.asyn_fluidc(G,5)

In [304]:
# conmunity_stats(cg_fluid,1)

In [305]:
res = get_first_level(cg_fluid)
for cluster in res:
    print("Cluster with", len(cluster), "members")
    dt = G_directed.subgraph([i for i in list(cluster)])
    dt = pd.DataFrame(dt.in_degree, columns = ["Name","In Degree"])
    st = dt.sort_values(by=['In Degree'], ascending=False)
    print(st.head(3))
    print()

Cluster with 68 members
         Name  In Degree
64     FINK E         67
0   JOHNSON F          0
42   GHITTI J          0

Cluster with 64 members
            Name  In Degree
2    DAHLSTROM D         36
29     BRUZINA R         25
7   CAPOBIANCO R          5

Cluster with 1 members
          Name  In Degree
0  THEODOROU P          0

Cluster with 1 members
       Name  In Degree
0  HOLMES R          0

Cluster with 45 members
            Name  In Degree
16      CAIRNS D         44
0       BARBER M          0
33  SOKOLOWSKI R          0



### LFR 
Failed

In [85]:
# n = 250
# tau1 = 3
# tau2 = 1.5
# mu = 0.1
# LFR = community.LFR_benchmark_graph(n, tau1, tau2, mu, average_degree=5, min_community=20, seed=10)
# cg_LFR = {frozenset(LFR.nodes[v]['community']) for v in G}

## Draw graph

In [None]:
# nx.draw(G, node_size = 10, with_labels=True, font_size = 12)