In [2]:
import pandas as pd
import numpy as np
import networkx as nx

# Load the network graph analysed throughout this notebook from its GML file.
GRAPH_PATH = './graph.gml'
G = nx.read_gml(GRAPH_PATH)

### Number of nodes and edges of the network graph

In [7]:
# Ask the graph for its sizes directly rather than building node/edge views
# only to take their length.
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 3056
Number of edges: 92702


### Network Density

In [5]:
# Density of the graph as computed by networkx.
network_density = nx.density(G)
print(network_density)

0.01985887010394084


### Isolates

In [5]:
# Isolates are nodes without any neighbours; networkx counts them directly,
# so no manual counting loop is needed.
count = nx.number_of_isolates(G)

print(count)

306


### Degree Centrality

In [11]:
# Degree centrality for every node, ranked from highest to lowest score.
centrality = nx.degree_centrality(G)
top_degree_centrality = sorted(
    centrality.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

# One row per node, kept in ranked order.
ranked_degree_names = [name for name, _ in top_degree_centrality]
ranked_degree_scores = [score for _, score in top_degree_centrality]
degree_df = pd.DataFrame(
    data={
        "certificate": ranked_degree_names,
        "degree_centrality": ranked_degree_scores,
    }
)

In [12]:
# degree_df is sorted in descending order, so its first rows are the top scorers.
print("Top 5 certs with highest degree centrality")
degree_df.head(5)

Top 5 certs with highest degree centrality


Unnamed: 0,certificate,degree_centrality
0,DATA ANALYSIS FUNDAMENTALS USING MICROSOFT EXC...,0.397054
1,PROJECT MANAGEMENT FOR ALL,0.389525
2,ICAGILE - AGILE TEAM FACILITATION,0.379705
3,BLOCKCHAIN FOR CROSS BORDER PAYMENT,0.361702
4,CYBER SECURITY PROTECTION CERTIFICATE (CSPC),0.345336


In [13]:
# degree_df is sorted in descending order, so its last rows are the lowest scorers.
print("Last 5 certs with lowest degree centrality")
degree_df.tail(5)

Last 5 certs with lowest degree centrality


Unnamed: 0,certificate,degree_centrality
3051,Housekeeping Operation (Outpatient),0.0
3052,Practical Business Finance for Engineering and...,0.0
3053,Advanced Diploma in Logistics and Supply Chain...,0.0
3054,Preparatory Course for Undergraduate Physics,0.0
3055,BCLS + AED Provider,0.0


### Betweenness Centrality

In [10]:
# High betweenness centralities suggest that these courses were important brokers.
print("Nodes with High Betweeness Centrality")
print()

# Raw (un-normalised) betweenness for every node, ranked from highest to lowest.
betweenness = nx.betweenness_centrality(G, normalized=False)
top_betweenness = sorted(
    betweenness.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

# One row per node, kept in ranked order.
ranked_between_names = [name for name, _ in top_betweenness]
ranked_between_scores = [score for _, score in top_betweenness]
between_df = pd.DataFrame(
    data={
        "certificate": ranked_between_names,
        "betweenness": ranked_between_scores,
    }
)
between_df.head(5)

Nodes with High Betweeness Centrality



Unnamed: 0,certificate,betweenness
0,DATA ANALYSIS FUNDAMENTALS USING MICROSOFT EXC...,245646.983764
1,PROJECT MANAGEMENT FOR ALL,199078.896099
2,ICAGILE - AGILE TEAM FACILITATION,185257.545378
3,CYBER SECURITY PROTECTION CERTIFICATE (CSPC),141984.123965
4,BLOCKCHAIN FOR CROSS BORDER PAYMENT,106575.302195


In [14]:
# between_df is sorted in descending order, so its last rows are the nodes with
# the lowest betweenness; the heading must say "Low", not "High".
print("Nodes with Low Betweenness Centrality")
print()

between_df.tail(5)

Nodes with High Betweeness Centrality



Unnamed: 0,certificate,betweenness
3051,Housekeeping Operation (Outpatient),0.0
3052,Practical Business Finance for Engineering and...,0.0
3053,Advanced Diploma in Logistics and Supply Chain...,0.0
3054,Preparatory Course for Undergraduate Physics,0.0
3055,BCLS + AED Provider,0.0


### Certs with 0 betweenness centrality that are still connected to other certs

In [10]:
# Put both centrality measures side by side, matched on the certificate name.
btwn_deg_df = degree_df.merge(between_df, on="certificate")

# Keep certificates whose betweenness is zero although their degree centrality is not.
zero_betweenness = btwn_deg_df["betweenness"] == 0
has_connections = btwn_deg_df["degree_centrality"] != 0
btwn_deg_df[zero_betweenness & has_connections]

Unnamed: 0,certificate,degree_centrality,betweenness
1290,NICF - IDENTITY WITH WINDOWS SERVER (SF) (SYNC...,0.013093,0.0
1515,Where is the Market Heading? Macroeconomics Vi...,0.008838,0.0
1516,SMU-BOS Advanced Certificate in Private Banking,0.008838,0.0
1545,Advanced Certificate in Healthcare Management ...,0.008511,0.0
1562,SGUS APPLIED VR AND GAME SPECIALIST,0.008183,0.0
...,...,...,...
2745,Diploma in Fashion Marketing and Management,0.000327,0.0
2746,Neural Networks and Deep Learning,0.000327,0.0
2747,BUILDING CONSTRUCTION SUPEVISORS SAFETY COURSE,0.000327,0.0
2748,Advanced Diploma in Cyber Security,0.000327,0.0


### Modularity Classes

In [17]:
from networkx.algorithms.community import greedy_modularity_communities

# Detect communities by greedy modularity maximisation.
classes = greedy_modularity_communities(G)

# Order the communities from largest to smallest; the sort arguments belong to
# sorted(), not to the community-detection call.
c = sorted(classes, key=len, reverse=True)

# Number of communities found.
len(c)

### Edges with the highest weights

In [31]:
# How many of the heaviest edges to list.
TOP_N_EDGES = 20

# All edges ordered from heaviest to lightest. Every edge must carry a
# 'weight' attribute, otherwise the sort key raises KeyError.
weights = sorted(G.edges(data=True), key=lambda edge: edge[2]['weight'], reverse=True)

shown = 0
for edge in weights:
    # Stop scanning as soon as enough edges have been printed instead of
    # walking through every remaining edge.
    if shown >= TOP_N_EDGES:
        break
    # Skip self-loops: only edges joining two different nodes are listed.
    if edge[0] != edge[1]:
        print(edge)
        shown += 1

('SAFE MANAGEMENT OFFICER COURSE FOR CONSTRUCTION', 'SAFE MANAGEMENT OFFICER TRAINING', {'weight': 13317})
('SAFE MANAGEMENT OFFICER COURSE FOR CONSTRUCTION', 'UNDERSTANDING & APPLYING WICA & WSHA', {'weight': 6562})
('SAFE MANAGEMENT OFFICER COURSE FOR CONSTRUCTION', 'MAINTENANCE SAFETY COURSE LOCK-OUT PROCEDURES', {'weight': 5018})
('SAFE MANAGEMENT MEASURES - PREPARING WORKPLACE FOR COVID-19', 'SAFE MANAGEMENT OFFICER COURSE FOR CONSTRUCTION', {'weight': 3878})
('CPR(HANDS-ONLY)+AED PROVIDER', 'OCCUPATIONAL FIRST AID COURSE', {'weight': 3596})
('CPR(HANDS-ONLY)+AED PROVIDER', 'SAFE MANAGEMENT OFFICER COURSE FOR CONSTRUCTION', {'weight': 2631})
('OCCUPATIONAL FIRST AID COURSE', 'SAFE MANAGEMENT OFFICER COURSE FOR CONSTRUCTION', {'weight': 2586})
('SAFE MANAGEMENT OFFICER TRAINING', 'UNDERSTANDING & APPLYING WICA & WSHA', {'weight': 2346})
('SAFE MANAGEMENT OFFICER TRAINING', 'MAINTENANCE SAFETY COURSE LOCK-OUT PROCEDURES', {'weight': 1794})
('BLOCKCHAIN FOR CROSS BORDER PAYMENT', 'BL

### Certs connected to Data Analytics Using Power BI

In [3]:
# Certificate whose direct neighbourhood is inspected.
focus_cert = "Data Analytics Using Power BI"
analytics_cluster = list(G.neighbors(focus_cert))

# Map each neighbouring certificate to the weight of the edge linking it to the focus certificate.
neighbours_dict = {
    neighbour: G.get_edge_data(focus_cert, neighbour)['weight']
    for neighbour in analytics_cluster
}

# Rank the neighbours from heaviest to lightest connecting edge.
analytics_sorted = sorted(
    neighbours_dict.items(),
    key=lambda name_weight: name_weight[1],
    reverse=True,
)

In [4]:
# Five most heavily connected neighbours, as (certificate, edge weight) pairs.
analytics_sorted[0:5]

[('Data Analytics @ Work', 58),
 ('Data Analytics Using Advanced Power BI', 43),
 ('Tableau: Unlocking Insights with Analytics', 25),
 ('DATA ANALYSIS FUNDAMENTALS USING MICROSOFT EXCEL (IBF)', 16),
 ('Data Analytics for Managers', 10)]