In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import json

__Obs.: The justification for a recommender system-like other than matrix 
factorization is due to the fact that attendance to physicians
are of a different nature than those of a Netflix consumer behavior.
Besides, characterizing what would constitute similar doctors
is a challenge in itself.__

## Data preprocessing

In [2]:
def read_json_dataset(archive):
    '''
    Reading the JSON file refering to datasets.
    '''
    openfile=open('{}'.format(archive))
    jsondata=json.load(openfile)
    dt= pd.DataFrame(jsondata)
    openfile.close()
    return dt

def query(doctors,specialty):
    '''
    Querying the json file for the specialty required by the patient.
    '''
    return doctors[doctors['Specialty'] == specialty]

def encoding(data):
    '''
    Encoding categorical features as numerical features.
    '''
    df = data.copy()
    le = LabelEncoder()
    var_to_encode = ['Health Insurance - Plan','Gender'] #only these features need to be encoded.
    for key in var_to_encode:
        df[key] = le.fit_transform(df[key])
    return df

'''[TO DO] Apply feature scalling after data encoding.'''

def calc_knn(data_, patient_characteristics):
    '''
    Calculating the 5-NN consultations to predict the suggested doctor for the given patient profile. 
    '''
    
    def prepare(data_to_prep):
        '''
        Separating columns used to predicte and encode.
        '''
        to_pred = ['Health Insurance - Plan','Latitude','Longitude','Patient Age','Gender']
        dff = encoding(data_to_prep[to_pred]) 
        return dff
    
    dfx = prepare(data_)
    doctor_label = 'Doctor'
    dfy = data_[doctor_label]
    
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(dfx,dfy)    

    doctor = knn.predict(prepare(patient_characteristics))
    return doctor

## Loading datasets

In [3]:
doctors_data = read_json_dataset('../doctors.json')
# knowing that the consultation dataset will be large enough, droping na's won't affect the model.
consultation_data = read_json_dataset('../consultation_data_.json').dropna()

## Patient looks for a doctor inputing his data and doctor specialty

In [4]:
patient_x = pd.DataFrame(pd.Series({
 'Gender': 'F',
 'Health Insurance - Plan': 'DOC300',
 'Latitude': -22.880488,
 'Longitude': -46.984614,
 'Patient Age': 54.0,
 'Specialty': 'DERMATOLOGISTA',
 'State': 'SP'})).T

## Get the suggested doctor

In [5]:
def find_doctor(patient):
    doctor = calc_knn(consultation_data,patient)
    return doctors_data[doctors_data['Doctor Code'] == doctor[0]]

In [6]:
find_doctor(patient_x)

Unnamed: 0,Doctor Code,Doctor Name,Doctor Telephone,Latitude,Longitude,Rating,Specialty
2,53-CRM-11813,POLIANA GARCIA VIEIRA,+55 (61) 33311563,-22.90558,-47.06085,2,CIRURGIA
