In [36]:
import os
from getpass import getpass

import pandas as pd
from neo4j import GraphDatabase
# predefined functions
from create_coauthor import get_coauthor_matrix
from connect_to_neo4j import create_coauthor_graph, calculate_graph_stats_for_field

## Make Coauthor Matrix from WOS (Web of Science) Data

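- Use the predefined function `get_coauthor_matrix` to create the coauthor matrix.
- The coauthor matrix is a nested dict: each author ID maps to a dict of coauthor IDs and counts.
- Authors are represented by their researcher ID rather than by name, so that different authors who share the same name are not confused.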

In [7]:
# Load the lung cancer spreadsheet into a DataFrame.
df_lung_cancer = pd.read_excel('lung_cancer.xls')

# Build the coauthor matrix from the 'Researcher Ids' column.
lung_cancer_researcher_ids = df_lung_cancer['Researcher Ids']
coauthor_lung_cancer = get_coauthor_matrix(lung_cancer_researcher_ids)

# Last expression of the cell: display the resulting mapping.
coauthor_lung_cancer

defaultdict(<function create_coauthor.get_coauthor_matrix.<locals>.<lambda>()>,
            {'B-1277-2014': defaultdict(int,
                         {'AAC-5192-2020': 1,
                          'L-4554-2015': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          'M-9715-2015': 1}),
             'AAC-5192-2020': defaultdict(int,
                         {'B-1277-2014': 1,
                          'L-4554-2015': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          'M-9715-2015': 1}),
             'L-4554-2015': defaultdict(int,
                         {'B-1277-2014': 1,
                          'AAC-5192-2020': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          '

In [32]:
# Display the coauthor mapping built from the lung cancer data.
coauthor_lung_cancer

defaultdict(<function create_coauthor.get_coauthor_matrix.<locals>.<lambda>()>,
            {'B-1277-2014': defaultdict(int,
                         {'AAC-5192-2020': 1,
                          'L-4554-2015': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          'M-9715-2015': 1}),
             'AAC-5192-2020': defaultdict(int,
                         {'B-1277-2014': 1,
                          'L-4554-2015': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          'M-9715-2015': 1}),
             'L-4554-2015': defaultdict(int,
                         {'B-1277-2014': 1,
                          'AAC-5192-2020': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          '

## Connect to Neo4j Server

- We connect to the Neo4j server using the Neo4j Python driver (`neo4j.GraphDatabase`).

In [8]:
# Connection settings come from the environment so that no credentials are
# stored in the notebook. The URI and user fall back to the previous local
# defaults when the variables are not set.
uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
# Never hardcode the password: read NEO4J_PASSWORD, or prompt for it
# interactively (getpass does not echo the typed value).
password = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')

# NOTE(review): `driver` is not passed to the helper functions called in the
# following cells; presumably connect_to_neo4j manages its own connection.
# Confirm, and call driver.close() once it is no longer needed.
driver = GraphDatabase.driver(uri, auth=(user, password))

In [39]:
# Write the lung cancer coauthor graph to Neo4j with the predefined helper
# 'create_coauthor_graph', then report the value it returns.
nodes_written = create_coauthor_graph(coauthor_lung_cancer, 'lung_cancer')
print(f'Number of Nodes Written : {nodes_written}')

Number of Nodes Written : 183


Running this code creates the nodes in your Neo4j server:

![image](assets/node_count.png)
![image](assets/node_visual.png)


## Calculate statistics for graph

- Calculate statistics for the graph created above,
- using the Neo4j Graph Data Science library,
- via the predefined function `calculate_graph_stats_for_field`.

In [42]:
# Compute the graph statistics for the 'lung_cancer' field with the predefined helper.
stats = calculate_graph_stats_for_field('lung_cancer')

In [45]:
# Print the statistics dict computed for the lung cancer graph.
print(stats)

{'field': 'lung_cancer', 'num_authors': 183, 'num_authors_FastRP': 183, 'num_edges': 111, 'degree_centrality_max': 8.000053405761719, 'degree_centrality_mean': 0.8688546269317794, 'num_community': 103, 'num_authors_in_main_component': 33}


### Repeat the Process for the Superconductor Data

In [48]:
# Label passed to both the graph-creation helper and the stats helper.
# The 'super_conducter' spelling is a runtime value and is kept unchanged.
super_conducter_field = 'super_conducter'

# Step 1: read the spreadsheet for this field.
df_super_conducter = pd.read_excel('super_conducter_science.xls')

# Step 2: build the coauthor matrix from the 'Researcher Ids' column.
super_conducter_researcher_ids = df_super_conducter['Researcher Ids']
coauthor_super_conducter = get_coauthor_matrix(super_conducter_researcher_ids)

# Step 3: create the coauthor graph (the return value is not used here).
create_coauthor_graph(coauthor_super_conducter, super_conducter_field)

# Step 4: compute the statistics for the same field and print them.
stats_super_conductor = calculate_graph_stats_for_field(super_conducter_field)
print(stats_super_conductor)

{'field': 'super_conducter', 'num_authors': 224, 'num_authors_FastRP': 224, 'num_edges': 188, 'degree_centrality_max': 16.00011444091797, 'degree_centrality_mean': 0.8392881665910993, 'num_community': 173, 'num_authors_in_main_component': 30}
