In [99]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [100]:
def fsave(df,df_name):
    writer = pd.ExcelWriter(
        f'{df_name}.xlsx', engine='xlsxwriter')
    df.to_excel(writer,sheet_name='ft')
    writer.save()
    print(f'{df_name}.xlsx has been saved sucessfully.')
    
def check_miss(df):
    vars_with_missing = []
    vars_50_plus = []
    for f in df.columns:
        missings = df[f].isnull().sum()
        if missings > 0:
            vars_with_missing.append(f)
            missings_perc = missings/df.shape[0]
            
            if missings_perc > 0.5:
                vars_50_plus.append(f)
                
            #print('Variable {} has {} records ({:.2%}) with missing values'.format(f, missings, missings_perc))
    
    print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))
    print('In total, there are {} variables with more than 50% of missing values'.format(len(vars_50_plus)))
    return vars_50_plus

def imp(df):
    print(f'Importing {df} dataframe')
    df = pd.read_excel(df)
    print(f'Imported {df} dataframe')
    return df

def check_distribution(df):
    return ('\n',df.info(),'\n-Skewness:\n',df.skew(),'\n-Kurtosis\n',df.kurtosis())


In [125]:
data = pd.read_csv('hakadata2.csv')
data.dropna(inplace=True)

In [148]:
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

def encoding(df_e):
    '''Encoding categorical Features as numerical'''
    dfa = df_e.copy()
    le = LabelEncoder()
    var_to_encode = ['Health Insurance - Plan','Gender']
    for col in var_to_encode:
        dfa[col] = le.fit_transform(df_e[col])
        
    return dfa

def scale(df):
    '''Selecting columns used for the classifier, and scale the features.'''
    df_s = df[['Health Insurance - Plan','Latitude','Longitude','Patient Age','Gender']].copy()
    for a in df_s.columns:
         df_s[a] = (df_s[a]-min(df_s[a]))/(max(df_s[a])-min(df_s[a]))
    return df_s
    
def calc_knn(df_c,pat_char):
    '''Calculating 5-NN and predicting the prefered doctor'''
    
    def prepare(df_to_prep):
        to_pred = ['Health Insurance - Plan','Latitude','Longitude','Patient Age','Gender']
        dff = encoding(df_to_prep[to_pred]) 
        dfx = scale(dff)
        return dfx
    
    dfx = prepare(df_c)
    doctor = 'Doctor'
    dfy = df_c[doctor]
    
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(dfx,dfy)    

    #encoding(pat_char)
    d = knn.predict()
    
    return d

In [218]:
calc_knn(data,data.iloc[30:50])

array(['53-CRM-10616', '53-CRM-18391', '53-CRM-12418', '53-CRM-13234',
       '53-CRM-13062', '53-CRM-14665', '53-CRM-8381', '53-CRM-22433',
       '53-CRM-9736', '53-CRM-18949', '53-CRM-13823', '53-CRM-16864',
       '53-CRM-20310', '53-CRM-15545', '53-CRM-13233', '53-CRM-14665',
       '53-CRM-15652', '53-CRM-15548', '53-CRM-17073', '53-CRM-13187'],
      dtype=object)

### Problemática:
    -The patients inputs are being scaled between themselves everytime leading to != scales
    -Besides that we manage to read json files so the app can access it
    

In [219]:
import json

openfile=open('convertcsv (4).json')
jsondata=json.load(openfile)
dd = pd.DataFrame(jsondata)
openfile.close()
dd

Unnamed: 0,Doctor Code,Doctor Name,Doctor Telephone,Latitude,Longitude,Rating,Specialty
0,53-CRM-15495,LENIRA SOUZA VALADAO,+55 (11) 32642128,-2290556,-4706083,3,ENDOCRINOLOGISTA
1,53-CRM-7911,GEOVANNA LEA BARBOSA DE MENDON,+55 (61) 777738010,-2290556,-4706083,5,CLINICO GERAL
2,53-CRM-11813,POLIANA GARCIA VIEIRA,+55 (61) 33311563,-2290556,-4706083,2,CIRURGIA
3,53-CRM-18271,GABRIEL TAVEIRA,+55 (11) 786113605,-2290556,-4706083,3,CLINICO GERAL
4,53-CRM-13823,GUSTAVO FRANCKLIN MILWARD DE A,+55 (61) 37772124,-2290556,-4706083,3,CARDIOLOGISTA
5,53-CRM-16677,ADEWALE OKEOWO ADENIYI,+55 (61) 76760726,-2290556,-4706083,3,DERMATOLOGISTA
6,RJ-CRM-21492,NAYARA CRISTINA PEREIRA,+55 (61) 34040707,-2290556,-4706083,4,CIRURGIA
7,53-CRM-20651,LUCIANO NERY LOURENCO,+55 (61) 34040708,-2290556,-4706083,1,NEUROLOGIA
8,53-CRM-15027,ELYSE CARVALHO BORGES DOS S FI,+55 (61) 34040709,-2290556,-4706083,4,NEUROLOGIA
9,53-CRM-23266,REBECCA BARBOSA LIMA DITZEL,+55 (61) 34040710,-2290556,-4706083,3,NEUROLOGIA
