In [1]:
import csv
import json
import os
from collections import Counter

import pandas as pd
import requests
from scipy import stats #For Compressed Sparse Row Format
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Location of the JSONL input file read by through_data().
# The ROAM_JSONL_PATH environment variable, when set, overrides the default so the
# notebook can run on machines that do not have this author-specific absolute path.
json_path = os.environ.get(
    'ROAM_JSONL_PATH',
    'C:/Jayesh/Courses/Intro to Data Science/Project/prescriptionbasedprediction/roam_prescription_based_prediction.jsonl',
)

In [3]:
def through_data(path=None):
    """Lazily yield one record per line of a JSONL file.

    Parameters
    ----------
    path : str, optional
        File to read. When omitted, the module-level ``json_path`` is used, so
        existing zero-argument calls behave as before.

    Yields
    ------
    tuple
        ``(cms_prescription_counts, provider_variables, npi)`` taken from the
        keys of the same names in each decoded JSON object.

    Raises
    ------
    KeyError
        If a record lacks one of the three expected keys.
    ValueError
        If a non-blank line is not valid JSON (raised by ``json.loads``).
    """
    source = json_path if path is None else path
    # Explicit UTF-8: without it the platform default encoding is used, which is
    # not UTF-8 on every system.
    with open(source, 'rt', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            yield (record['cms_prescription_counts'],
                   record['provider_variables'],
                   record['npi'])

In [4]:
def dataset(drug_mincount=20, specialty_mincount=20):
    """Build the filtered feature matrix and companion frames from through_data().

    Parameters
    ----------
    drug_mincount : int
        A record is kept only if its prescription-count dict has at least this
        many entries.
    specialty_mincount : int
        After the first filter, a record is kept only if its ``'specialty'``
        value occurs at least this many times among the remaining records.

    Returns
    -------
    tuple
        ``(X, ys, vectorizer, npi)`` where ``X`` is the TF-IDF-transformed
        sparse output of the DictVectorizer, ``ys`` is a DataFrame built from
        the provider-variable dicts, ``vectorizer`` is the fitted
        DictVectorizer, and ``npi`` is a DataFrame built from the npi values.
    """
    # Pass 1: drop records with too few prescription entries.
    records = []
    for counts, provider, npi_value in through_data():
        if len(counts) >= drug_mincount:
            records.append((counts, provider, npi_value))

    # Pass 2: keep only specialties that are frequent enough among survivors.
    specialty_freq = Counter(provider['specialty'] for _, provider, _ in records)
    kept_specialties = {name for name, count in specialty_freq.items()
                        if count >= specialty_mincount}
    records = [rec for rec in records if rec[1]['specialty'] in kept_specialties]

    # Split the triples into parallel sequences, then vectorize and reweight.
    feats, ys, npi = zip(*records)
    vectorizer = DictVectorizer(sparse=True)
    X = TfidfTransformer().fit_transform(vectorizer.fit_transform(feats))
    return (X, pd.DataFrame(list(ys)), vectorizer, pd.DataFrame(list(npi)))

In [5]:
# Unpack in the order dataset() returns its values: TF-IDF sparse matrix,
# provider-variables DataFrame, fitted DictVectorizer, NPI DataFrame.
# NOTE(review): this rebinds the name `dataset` from the function to a DataFrame,
# so running this cell a second time (without re-running the cell that defines
# the function) fails because a DataFrame is not callable.
sparse_matrix, dataset, feature_drug, npi = dataset()

In [6]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
sparse_matrix

<76415x2305 sparse matrix of type '<class 'numpy.float64'>'
	with 3973363 stored elements in Compressed Sparse Row format>

In [7]:
# NOTE(review): todense() materializes every cell of the matrix. At the
# 76415 x 2305 float64 shape reported for sparse_matrix that is roughly 1.4 GB;
# consider slicing a few rows first (e.g. sparse_matrix[:5].todense()).
sparse_matrix.todense()

matrix([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.26103461,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

In [8]:
# `dataset` here is the provider-variables DataFrame produced by the unpacking
# cell (not the dataset() function); preview its first rows.
dataset.head()

Unnamed: 0,brand_name_rx_count,gender,generic_rx_count,region,settlement_type,specialty,years_practicing
0,384,M,2287,South,non-urban,Nephrology,7
1,316,M,1035,West,non-urban,Nephrology,6
2,374,M,2452,Northeast,urban,Gastroenterology,5
3,683,M,3462,Midwest,urban,Psychiatry,7
4,143,M,2300,Northeast,urban,Psychiatry,7


In [9]:
# Preview the NPI DataFrame; its single column is labelled 0.
npi.head()

Unnamed: 0,0
0,1295763035
1,1437366804
2,1316196462
3,1215979554
4,1174564165


In [10]:
# Column labels of sparse_matrix, in column order, as a plain Python list.
# Newer scikit-learn releases removed get_feature_names() in favour of
# get_feature_names_out(), so prefer the new method and fall back to the old
# one only when the installed version lacks it.
if hasattr(feature_drug, "get_feature_names_out"):
    # get_feature_names_out() returns an ndarray; tolist() keeps drug_names a
    # list of str, matching what get_feature_names() used to return.
    drug_names = feature_drug.get_feature_names_out().tolist()
else:
    drug_names = feature_drug.get_feature_names()

In [11]:
# Show the first five feature (drug) names.
drug_names[0:5]

['1ST TIER UNIFINE PENTIPS',
 'ABACAVIR',
 'ABELCET',
 'ABILIFY',
 'ABILIFY DISCMELT']

In [12]:
# Add a "state" column to npi, pre-filled with the literal string "NULL" as a
# placeholder, then preview the first five rows.
npi["state"] = "NULL"
npi.head()

Unnamed: 0,0,state
0,1295763035,
1,1437366804,
2,1316196462,
3,1215979554,
4,1174564165,


In [13]:
def get_account_info(url=None, timeout=10):
    """Fetch a JSON record over HTTP and return the state of its first license.

    Parameters
    ----------
    url : str, optional
        Endpoint to query. When omitted, the module-level ``api_url`` is used,
        so existing zero-argument calls behave as before.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout ``requests.get`` can
        block indefinitely on an unresponsive host.

    Returns
    -------
    The value at ``["data"]["licenses"][0]["state"]`` of the decoded response
    body when the HTTP status is 200; ``None`` for any other status.

    Raises
    ------
    KeyError
        If the payload lacks the ``data``, ``licenses`` or ``state`` key.
    IndexError
        If the ``licenses`` list is empty.
    """
    target = api_url if url is None else url
    response = requests.get(target, timeout=timeout)
    if response.status_code != 200:
        return None
    payload = json.loads(response.content.decode('utf-8'))
    # Deliberately unguarded: callers distinguish "no licenses" (IndexError)
    # from "unexpected payload shape" (KeyError).
    return payload["data"]["licenses"][0]["state"]

In [14]:
# API documentation: https://developer.betterdoctor.com/documentation15
# N_LOOKUPS is the number of leading rows of npi to look up; raise it to cover more rows.
# Supply the API key through the BETTERDOCTOR_API_KEY environment variable
# (or replace the 'put_key_here' fallback) rather than committing a real key.
N_LOOKUPS = 30
API_KEY = os.environ.get('BETTERDOCTOR_API_KEY', 'put_key_here')
API_URL_TEMPLATE = 'https://api.betterdoctor.com/2016-03-01/doctors/npi/{}?user_key={}'

for i in range(N_LOOKUPS):
    # get_account_info() reads the module-level api_url when called without arguments.
    api_url = API_URL_TEMPLATE.format(npi.loc[i, 0], API_KEY)
    try:
        state = get_account_info()
    except IndexError:
        # The record has an empty licenses list.
        state = 'List Index Error'
    except KeyError:
        # The payload lacks one of the expected keys.
        state = 'Key Error'
    # Single .loc assignment instead of chained npi["state"][i] = ..., which may
    # write to a temporary copy and leave npi unchanged.
    npi.loc[i, "state"] = state

In [15]:
# Show the first 30 rows of npi, i.e. the rows the lookup loop iterates over.
npi[0:30]

Unnamed: 0,0,state
0,1295763035,LA
1,1437366804,CA
2,1316196462,NY
3,1215979554,IA
4,1174564165,PA
5,1275533044,TX
6,1912043050,
7,1992021687,
8,1043561624,
9,1639165319,FL


In [16]:
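# npi is a DataFrame that stores each provider's NPI number (column 0) and the looked-up license state.
# sparse_matrix is a TF-IDF-weighted sparse matrix with one row per observation and one column per drug; an entry is 0 when the observation has no count for that drug. Keeping it separate from the other frames helps in processing the data.
# dataset is a DataFrame with the rest of the provider details.
# feature_drug is the fitted DictVectorizer; its feature names are the drug names, which are the column labels of sparse_matrix.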