# Creating a topic map visualization using concepts 
* https://networkx.github.io/documentation/stable/reference/introduction.html
* https://pyvis.readthedocs.io/en/latest/tutorial.html

In [1]:
!pip install pyvis jsonpickle plotly dimcli -U --quiet

In [2]:
import dimcli
from dimcli.shortcuts import *
from dimcli.core.extras import NetworkViz as Network # custom version of pyvis - colab-compatible

import json
import sys
import pandas as pd
import networkx as nx
import itertools

print("==\nLogging in..")
# Authentication options: https://github.com/digital-science/dimcli#authentication
ENDPOINT = "https://app.dimensions.ai"

if 'google.colab' in sys.modules:
    # running inside Google Colab: prompt for the credentials interactively
    import getpass
    USERNAME = getpass.getpass(prompt='Username: ')
    PASSWORD = getpass.getpass(prompt='Password: ')
else:
    # empty credentials; the recorded output of this cell shows dimcli then
    # authenticating through a dsl.ini file
    USERNAME = PASSWORD = ""

# single login call shared by both branches
dimcli.login(USERNAME, PASSWORD, ENDPOINT)
dsl = dimcli.Dsl()

==
Logging in..
[2mDimcli - Dimensions API Client (v0.7.2)[0m
[2mConnected to: https://app.dimensions.ai - DSL v1.26[0m
[2mMethod: dsl.ini file[0m


## Step 1: Get some data 

This query will return a list of documents with related concepts. 

Try changing the query in order to get different results.

In [3]:
# DSL query: publications matching the exact phrase "Semantic Web", returned with
# id, title and scored concepts, sorted by times_cited and capped at 1000 records
q = """search publications 
            for "\\"Semantic Web\\"" 
        return publications[id+title+concepts_scores] 
        sort by times_cited limit 1000"""

data = dsl.query(q)

# flatten the result into a dataframe of concepts (one row per publication/concept pair)
concepts = data.as_dataframe_concepts()

print("Total concepts:", concepts.shape[0])
print("Concepts score average", concepts.score_avg.mean())
concepts.head(5)

Returned Publications: 1000 (total = 142083)
Total concepts: 46017
Concepts score average 0.37944545037703453


Unnamed: 0,id,title,concepts_count,concept,score,frequency,score_avg
0,pub.1007137639,Building better batteries,67,materials science,0.06888,4,0.20473
1,pub.1007137639,Building better batteries,67,new series,0.06877,3,0.05128
2,pub.1007137639,Building better batteries,67,better batteries,0.06703,1,0.06703
3,pub.1007137639,Building better batteries,67,batteries,0.0607,3,0.0325
4,pub.1007137639,Building better batteries,67,Murray-Rust,0.05667,3,0.0217


## Step 2: Build a Network Data Structure 

* for each publication add all nodes with selected features (eg score > MIN, FREQUENCY > MIN)
* also, for each publication create edges among all of them
    * generate all possible combinations for a single pub concepts
    * if edge already exists, do a +1 on its strength
* as a second step, remove the edges whose weight is below MIN EDGE WEIGHT (the nodes themselves are left in the graph)

In [4]:
# Undirected co-occurrence graph: nodes are concepts, edges link concepts that
# appear together in at least one publication.
G = nx.Graph()

# play with these parameters to make a more interesting network
MIN_CONCEPT_SCORE = 0.6
MIN_CONCEPT_FREQUENCY = 4
MIN_EDGE_WEIGHT = 2

# keep only the rows whose concept passes both thresholds
top_concepts = concepts.query(f"score_avg >= {MIN_CONCEPT_SCORE} & frequency >=  {MIN_CONCEPT_FREQUENCY}")


#
# build nodes from concepts, including score_avg and frequency
#
mean_score = top_concepts['score_avg'].mean()
for _idx, row in top_concepts.drop_duplicates("concept").iterrows():
    # bucket 1 = above-average score, bucket 2 = average or below
    score_bucket = 1 if row['score_avg'] > mean_score else 2
    G.add_node(row['concept'], frequency=row['frequency'], score_avg=row['score_avg'], score_bucket=score_bucket)
print("Nodes:", len(G.nodes()), "Edges:", len(G.edges()))

#
# build edges, based on concepts co-occurrence within pubs
# -- calculate a 'weight' based on how often two concepts co-occur
#
pubs_list = top_concepts.drop_duplicates("id")['id'].to_list()

# group the concepts by publication once, instead of re-filtering the whole
# dataframe for every publication
concepts_by_pub = {
    pub_id: pub_concepts.to_list()
    for pub_id, pub_concepts in top_concepts.groupby("id", sort=False)["concept"]
}

for p in pubs_list:
    # every unordered pair (combination) of concepts within this publication
    for a, b in itertools.combinations(concepts_by_pub.get(p, []), 2):
        # explicit membership test instead of a bare `except`, so that
        # unrelated errors are not silently swallowed
        if G.has_edge(a, b):
            G.edges[a, b]['weight'] += 1
        else:
            G.add_edge(a, b, weight=1)

print("Nodes:", len(G.nodes()), "Edges:", len(G.edges()))

#
# this extra step is useful to remove low-weight connections
#

print(f".. cleaning up edges with weight < {MIN_EDGE_WEIGHT}...")

# collect first, then remove: the graph must not change while it is iterated
weak_edges = [(a, b) for a, b, w in G.edges(data='weight') if w < MIN_EDGE_WEIGHT]
G.remove_edges_from(weak_edges)

print("..Done")
print(" => Nodes:", len(G.nodes()), "Edges:", len(G.edges()))


Nodes: 174 Edges: 0
Nodes: 174 Edges: 1063
.. cleaning up edges with weight < 2...
..Done
 => Nodes: 174 Edges: 200


## Step 3: Visualize

* NOTE the `from_nx` method doesn't carry through the WEIGHT or any other value
* so we need to set these values manually in a second pass over the nodes and edges - see https://github.com/WestHealth/pyvis/issues/16

In [5]:
# Interactive network view rendered into an HTML file
viznet = Network(notebook=True, width="100%", height="800px")
viznet.toggle_hide_edges_on_drag(True)
# NOTE(review): in upstream pyvis each of the two calls below selects a physics
# solver, so the later one may supersede the earlier - confirm for NetworkViz
viznet.barnes_hut()
viznet.repulsion(300)

# reuse plotly color palette
import plotly.express as px
palette = px.colors.diverging.Temps  # 7 colors

viznet.from_nx(G)


# update visual features: copy the attributes stored on G onto the rendered nodes

for node in viznet.nodes:
    attrs = G.nodes[node['label']]  # get from original network
    freq = attrs['frequency']
    score_avg = attrs['score_avg']
    score_bucket = attrs['score_bucket']

    node['size'] = freq * 2
    node['color'] = palette[score_bucket*3]  # buckets 1 and 2 select palette entries 3 and 6
    node['borderWidthSelected'] = 5
    # must be a plain string: a trailing comma here would store a 1-element tuple
    node['title'] = f"<h4>Concept: '{node['label']}'</h4><hr>Frequency: {freq}<br>Score avg: {score_avg}"

for edge in viznet.edges:
    # get value from main Network weight
    edge['value'] = G.edges[edge['from'], edge['to']]['weight']

viznet.show("test.html")

