In [None]:
import pickle
import sqlite3
from contextlib import closing

import geopandas as gpd
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

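Create a DataFrame that contains only the transactions in which both the sending NPI (`from_npi`) and the receiving NPI (`to_npi`) are located in the Nashville CBSA (CBSA code 34980). Only pairs with at least 50 transactions and an average wait of fewer than 50 days are kept.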

In [None]:
# Select every hop edge whose two endpoints are both in CBSA 34980.
# npidata and zip_to_cbsa are each joined twice: aliases a/az resolve the
# from_npi side of the edge and b/bz resolve the to_npi side.
# Edges are kept only when transaction_count >= 50 and average_day_wait < 50.
query = """
SELECT from_npi,
       to_npi,
       transaction_count
FROM hop
INNER JOIN npidata AS a
ON from_npi = a.npi
INNER JOIN npidata AS b
ON to_npi = b.npi
INNER JOIN zip_to_cbsa AS az
ON a.location_address_postal_code = az.zip
INNER JOIN zip_to_cbsa AS bz
ON b.location_address_postal_code = bz.zip
WHERE transaction_count >= 50
AND average_day_wait < 50
AND az.cbsa = 34980
AND bz.cbsa = 34980
"""

# A sqlite3 connection used directly as a context manager only commits or
# rolls back the transaction; it does not close the connection. closing()
# guarantees the connection is closed even if the read raises.
with closing(sqlite3.connect('../data/hop_db.sqlite')) as db:
    hop_nash_cbsa = pd.read_sql(query, db)

# We exported `hop_nash_cbsa` to `../data/hop_nash_cbsa.csv` (see the next cell) and uploaded that file to Neo4j, where we ran the Louvain algorithm to identify communities

In [None]:
# Export of the Nashville edge list to CSV. The call is left commented out,
# presumably so that re-running the notebook does not rewrite the file;
# uncomment it to regenerate the CSV.
#hop_nash_cbsa.to_csv('../data/hop_nash_cbsa.csv')

In [None]:
# Load the CSV that holds the community information for each NPI
# and print a summary of its columns and dtypes.
community_csv_path = '../data/npi_community.csv'
community = pd.read_csv(community_csv_path)
community.info()

In [None]:
# Provider details (name, credential, organization, address, specialty) for
# every NPI whose postal code maps to CBSA 34980. This is the lookup table that
# gets merged onto the community DataFrame.
# taxonomy is LEFT JOINed, so providers without a matching taxonomy_code are
# kept with null specialty / sub_specialty.
query = """
SELECT npi, 
       first_name,
       last_name,
       credential, 
       organization_name,
       location_address_city_name, 
       location_address_state_name, 
       location_address_postal_code, 
       classification AS specialty, 
       display_name AS sub_specialty,
       cbsa
FROM npidata
LEFT JOIN taxonomy
USING(taxonomy_code)
INNER JOIN zip_to_cbsa 
ON location_address_postal_code = zip
WHERE cbsa = 34980
"""

# A sqlite3 connection used directly as a context manager only commits or
# rolls back the transaction; it does not close the connection. closing()
# guarantees the connection is closed even if the read raises.
with closing(sqlite3.connect('../data/hop_db.sqlite')) as db:
    nash_referrers = pd.read_sql(query, db)

In [None]:
# Attach provider details to each community member. A left join keeps every
# row of `community`, including any NPI with no match in `nash_referrers`.
nash_ref_com = community.merge(
    nash_referrers,
    how='left',
    left_on='NPI',
    right_on='npi',
)

In [None]:
# Display the merged community / provider-detail DataFrame for inspection.
nash_ref_com

In [None]:
# Number of rows per provider city, most frequent first.
nash_ref_com['location_address_city_name'].value_counts()


In [None]:
# Summary statistics for every (community, ZIP code) pair.
# NOTE(review): num_orgs counts non-null organization_name values rather than
# distinct names (the other "num_*" columns except num_providers use nunique)
# -- confirm this is intended.
zip_level_aggs = {
    'num_providers': ('npi', 'count'),
    'num_cities': ('location_address_city_name', 'nunique'),
    'num_specialties': ('specialty', 'nunique'),
    'num_subspecialties': ('sub_specialty', 'nunique'),
    'num_orgs': ('organization_name', 'count'),
}

comm_zip_count = (
    nash_ref_com
    .groupby(['communityId', 'location_address_postal_code'])
    .agg(**zip_level_aggs)
    .reset_index()
    # Within each community, list the ZIP codes with the most providers first.
    .sort_values(by=['communityId', 'num_providers'], ascending=[True, False])
)

comm_zip_count.info()

In [None]:
# The five communities with the most providers (non-null npi values).
providers_per_comm = nash_ref_com.groupby('communityId')['npi'].count()

large_comm = (
    providers_per_comm
    .sort_values(ascending=False)
    .head(5)
    .reset_index()
    # Store the ids as strings so they match comm_zip_count['communityId']
    # once that column is converted to str.
    .assign(communityId=lambda frame: frame['communityId'].astype(str))
)
large_comm

In [None]:
# Rank ZIP codes inside each community by provider count (rank 1 = most
# providers; ties receive the default average rank), then convert the
# community id to a string.
comm_zip_count = comm_zip_count.assign(
    rank=lambda frame: (
        frame
        .groupby('communityId')['num_providers']
        .rank(ascending=False)
    ),
    communityId=lambda frame: frame['communityId'].astype(str),
)

comm_zip_count

In [None]:
# Show only the (community, ZIP) rows that belong to the communities in large_comm.
in_large_comm = comm_zip_count['communityId'].isin(large_comm['communityId'])
comm_zip_count.loc[in_large_comm]

In [None]:
# Rows to plot: ZIP codes whose within-community rank is below 4, restricted
# to the communities listed in large_comm, ordered by ZIP code.
is_top_ranked = comm_zip_count['rank'] < 4
is_large_comm = comm_zip_count['communityId'].isin(large_comm['communityId'])

df = (
    comm_zip_count
    .loc[is_top_ranked & is_large_comm]
    .sort_values('location_address_postal_code')
)

# Human-readable names for the axes, legend and hover box.
display_labels = {
    'communityId' : 'Community',
    'location_address_postal_code' : 'Provider ZIP code',
    'num_providers' : 'Total Providers',
    'num_subspecialties' : 'Total Provider Sub-Specialties',
}

# Bar chart of provider counts per ZIP code, colored by community.
fig = px.bar(
    df,
    x='location_address_postal_code',
    y='num_providers',
    color='communityId',
    labels=display_labels,
    color_discrete_sequence=px.colors.qualitative.Bold,
    hover_data={'num_subspecialties' : True},
    template="simple_white",
)

# Optional: uncomment to hide the x-axis title and tick labels.
#fig.update_xaxes(title=None,
#                 showticklabels=False)

fig.show()

In [None]:
# Number of distinct communities with providers in each ZIP code,
# showing the 15 ZIP codes that span the most communities.
(
    nash_ref_com
    .groupby('location_address_postal_code')
    ['communityId'].nunique()
    .sort_values(ascending=False)
    .head(15)
)

In [None]:
# One row of summary statistics per community, largest community first.
# NOTE(review): num_orgs counts non-null organization_name values rather than
# distinct names -- confirm this is intended.
community_level_aggs = {
    'num_providers': ('npi', 'count'),
    'num_zip': ('location_address_postal_code', 'nunique'),
    'num_cities': ('location_address_city_name', 'nunique'),
    'num_specialties': ('specialty', 'nunique'),
    'num_subspecialties': ('sub_specialty', 'nunique'),
    'num_orgs': ('organization_name', 'count'),
}

providers_by_community = nash_ref_com.groupby('communityId')
comm_dems = (
    providers_by_community
    .agg(**community_level_aggs)
    .sort_values('num_providers', ascending=False)
)

comm_dems

In [None]:
# Per-ZIP-code summary for community 3452, which this notebook treats as the
# largest community.
# NOTE(review): the id is hard-coded; if the community detection is re-run the
# ids may change -- verify against comm_dems before relying on this cell.
largest_community_id = 3452
in_largest_community = nash_ref_com['communityId'] == largest_community_id

(
    nash_ref_com
    .loc[in_largest_community]
    .groupby('location_address_postal_code')
    .agg(
        num_providers=('npi', 'count'),
        num_cities=('location_address_city_name', 'nunique'),
        num_specialties=('specialty', 'nunique'),
        num_subspecialties=('sub_specialty', 'nunique'),
        num_orgs=('organization_name', 'count'),
    )
    .reset_index()
)
    