In [64]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import text, inspect, MetaData

from pathlib import Path
import pandas as pd
from itertools import product

import nxviz as nv
import networkx as nx
from nxviz import layouts, plots, lines
from nxviz import nodes, edges, annotate, highlights
from nxviz.plots import despine, rescale, respine, aspect_equal

from nxviz.utils import edge_table, node_table
from nxviz import encodings as aes

nxviz has a new API! Version 0.7.4 onwards, the old class-based API is being
deprecated in favour of a new API focused on advancing a grammar of network
graphics. If your plotting code depends on the old API, please consider
pinning nxviz at version 0.7.4, as the new API will break your old code.

To check out the new API, please head over to the docs at
https://ericmjl.github.io/nxviz/ to learn more. We hope you enjoy using it!

(This deprecation message will go away in version 1.0.)



In [37]:
basefolder = Path.home().joinpath("Documents", "data", "opendatasus")
dbname = "SIM_WAREHOUSE.db"

In [38]:
engine = create_engine(f'sqlite:///'+str(basefolder.joinpath(dbname)))

In [47]:
# -- test

def query_metadata(engine):
    inspector = inspect(engine)
    tables = inspector.get_table_names()
    table_dict = { table_name : inspector.get_columns(table_name) for table_name in tables }
    return table_dict


def query_data(query_str, table_name, engine, batchsize=1000):

    schema_data = {
        'rows': [],
        'columns': [],
    }

    with engine.connect() as conn:
        qres = conn.execute(query_str)
        schema_data['columns'] = list(qres.keys())

        while True:
            rows = qres.fetchmany(batchsize)
            if not rows:
                break
            schema_data["rows"] += [ row for row in rows ]
    
    res_df = pd.DataFrame(schema_data['rows'], columns=schema_data['columns'])
    return res_df

In [48]:
# teste -- SIM --
#db_metadata = query_metadata(engine)
#db_metadata

In [51]:
query_str = f'''
    SELECT * FROM sim LIMIT 5
'''

df = query_data(query_str, 'sim', engine)
print(df.columns)
print(df.info())
df.head(3)

Index(['CHAVE_CONTADOR_FONTE', 'TIPOBITO', 'DTOBITO', 'NATURAL', 'DTNASC',
       'IDADE', 'SEXO', 'RACACOR', 'ESTCIV', 'ESC', 'OCUP', 'CODMUNRES',
       'LOCOCOR', 'CODMUNOCOR', 'IDADEMAE', 'ESCMAE', 'OCUPMAE', 'QTDFILVIVO',
       'QTDFILMORT', 'GRAVIDEZ', 'GESTACAO', 'PARTO', 'OBITOPARTO', 'PESO',
       'OBITOGRAV', 'OBITOPUERP', 'ASSISTMED', 'EXAME', 'CIRURGIA',
       'NECROPSIA', 'CAUSABAS', 'LINHAA', 'LINHAB', 'LINHAC', 'LINHAD',
       'LINHAII', 'CIRCOBITO', 'ACIDTRAB', 'FONTE_DADOS'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CHAVE_CONTADOR_FONTE  5 non-null      object
 1   TIPOBITO              5 non-null      object
 2   DTOBITO               5 non-null      object
 3   NATURAL               5 non-null      object
 4   DTNASC                5 non-null      object
 5   IDADE               

Unnamed: 0,CHAVE_CONTADOR_FONTE,TIPOBITO,DTOBITO,NATURAL,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,...,NECROPSIA,CAUSABAS,LINHAA,LINHAB,LINHAC,LINHAD,LINHAII,CIRCOBITO,ACIDTRAB,FONTE_DADOS
0,1DOAC2000,2,2000-03-23 00:00:00.000000,812,1976-09-11 00:00:00.000000,423,2,1.0,1.0,3.0,...,,V892,*T794,*T07X,*V892,,,1.0,2.0,DOAC2000
1,2DOAC2000,2,2000-03-25 00:00:00.000000,812,1999-10-14 00:00:00.000000,305,2,,1.0,1.0,...,,R95,*R69X,,,,*R69X,,,DOAC2000
2,3DOAC2000,2,2000-02-25 00:00:00.000000,812,2000-02-18 00:00:00.000000,207,1,1.0,,,...,,R98,*R98X,,,,,,,DOAC2000


In [56]:
query_causa = f'''
    SELECT 
        CHAVE_CONTADOR_FONTE, CODMUNRES, DTOBITO, 
        SUBSTR(CAUSABAS, 1, 3) AS CAUSABAS, LINHAA, LINHAB, LINHAC, 
        LINHAD, LINHAII
    FROM
        sim
    WHERE DTOBITO >= '2015-01-01' AND DTOBITO <= '2015-12-31'
'''

df = query_data(query_causa, 'sim', engine)
print(df.columns)
print(df.info())
df.head(3)

Index(['CHAVE_CONTADOR_FONTE', 'CODMUNRES', 'DTOBITO', 'CAUSABAS', 'LINHAA',
       'LINHAB', 'LINHAC', 'LINHAD', 'LINHAII'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260715 entries, 0 to 1260714
Data columns (total 9 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   CHAVE_CONTADOR_FONTE  1260715 non-null  object
 1   CODMUNRES             1260715 non-null  object
 2   DTOBITO               1260715 non-null  object
 3   CAUSABAS              1260715 non-null  object
 4   LINHAA                1219299 non-null  object
 5   LINHAB                979520 non-null   object
 6   LINHAC                618600 non-null   object
 7   LINHAD                267688 non-null   object
 8   LINHAII               423081 non-null   object
dtypes: object(9)
memory usage: 86.6+ MB
None


Unnamed: 0,CHAVE_CONTADOR_FONTE,CODMUNRES,DTOBITO,CAUSABAS,LINHAA,LINHAB,LINHAC,LINHAD,LINHAII
0,1DOAC2015,120060,2015-11-21 00:00:00.000000,R99,*R99X,,,,
1,2DOAC2015,120050,2015-02-02 00:00:00.000000,E14,*I10X,*E149,,,
2,3DOAC2015,120050,2015-02-08 00:00:00.000000,J43,*J439,,,,


In [65]:
def define_edgelist(sim_df):
    '''

    '''
    cols = ["CAUSABAS", "LINHAA", "LINHAB", "LINHAC", "LINHAD", "LINHAII"]
    subset_df = df[cols].copy()

    subset_lst = subset_df.apply(lambda x: list(x), axis=1).tolist()
    aux = []
    for current_list in subset_lst:
        new_el = []
        for element in current_list:
            if pd.notna(element):
                cur = [ el for el in element.strip().split("*") if el!='' ]
                new_el += cur
        aux.append([ el[:3] for el in  new_el ] )

    # -- create list of pairs (order not relevant)
    list_of_pairs = []
    for curr in aux:
        pairs = list(set(tuple(sorted(t)) for t in product(curr, curr) if t[0] != t[1]))
        list_of_pairs += pairs

    edgelist = pd.Series(list_of_pairs).value_counts().reset_index()
    edgelist['source'] = edgelist['index'].apply(lambda x: x[0])
    edgelist['target'] = edgelist['index'].apply(lambda x: x[1])
    edgelist = edgelist.drop('index', axis=1)
    return edgelist

def create_network(edgelist, cutoff=0):
    edgelist1 = edgelist[edgelist['count']>cutoff]

    nodenames = edgelist1['source'].unique().tolist() + edgelist1['target'].unique().tolist()
    nodenames = list(set(nodenames))
    nodelabels = { nodenames[label]: label for label in range(len(nodenames)) }
    
    node_metadata = []
    for label, node in enumerate(nodenames):
        node_metadata.append((
            label, { 'cid10': nodenames[label] }
        ))
    
    edge_metadata = []
    for edge, row in edgelist1.iterrows():
        edge_metadata.append(
            (nodelabels[row['source']], nodelabels[row['target']], {'count': row['count']})
        )
    
    graph = nx.Graph()
    
    graph.add_nodes_from(node_metadata)
    graph.add_edges_from(edge_metadata)
    return graph


In [62]:
edgelist = define_edgelist(df)

In [63]:
edgelist

Unnamed: 0,count,source,target
0,105335,A41,J18
1,55083,E14,I10
2,51303,J18,J96
3,42875,I10,I21
4,35812,A41,I10
...,...,...,...
114341,1,B23,E72
114342,1,B23,R10
114343,1,R09,W22
114344,1,J06,S06


In [66]:
graph = create_network(edgelist, cutoff=5)

In [68]:
print(graph.number_of_nodes(), graph.number_of_edges())

1235 31517
