## Prediction
- Load data for one surgery (from file / or Dataframe via Streamlit)
- Add Weather via API
- Load Disease Register and merge with appointment data

In [1]:
from showupforhealth.params import *
from showupforhealth.utils import *
from showupforhealth.ml_functions.preprocessor import *
from showupforhealth.ml_functions.disease_register import *
from showupforhealth.ml_functions.data import *
from showupforhealth.interface import *

import pandas as pd 

In [2]:
# "magic commands" to enable autoreload of your imported packages
%load_ext autoreload
%autoreload 2

pd.options.display.max_rows = 5000
pd.options.display.max_columns = 5000

In [178]:
def prediction_add_weather(surgery_prefix):
    print(f'🌤️ Prediction data: {surgery_prefix} - preparing appointment data')
    data = pd.read_csv(f'{PREDICT_DATA}/{surgery_prefix}_predict.csv')
    data['weather_time'] = data['Appointment time'].str.split('-').str[0]
    data['weather_datetime'] = data['Appointment date'] + " " + data['weather_time']
    data['Appointment date'] = pd.to_datetime(data['Appointment date'])
    data['weather_datetime'] = pd.to_datetime(data['weather_datetime'])
    start_date = data['Appointment date'].dt.strftime('%Y-%m-%d').min()
    end_date = data['Appointment date'].dt.strftime('%Y-%m-%d').max()
    
    # Getting API Data
    # Define the base URL of the API
    base_url = 'https://api.open-meteo.com/v1/forecast'

    # Define the parameters as a dictionary
    params = {
        'latitude': '51.5085',
        'longitude': '0.1257',
        'hourly': 'temperature_2m,precipitation',
        'start_date': start_date,
        'end_date': end_date
        # Add more parameters as needed
    }

    # Make the API call using the requests library
    response = requests.get(base_url, params=params)
    print(f'🛜 Requesting forcast from Open-Meteo Weather API {start_date} - {end_date}')
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse and work with the API response, which is typically in JSON format
        api_data = response.json()['hourly']
        # Now you can work with the data returned by the API
        api_data = pd.DataFrame(api_data)
    else:
        # If the request was not successful, handle the error
        print(f"Error: {response.status_code}")
        print(response.text)
        
    api_data = api_data.rename(columns={'time': 'weather_datetime', 'temperature_2m': 'temp', 'precipitation': 'precipitation'})
    api_data['weather_datetime'] = pd.to_datetime(api_data['weather_datetime'])
    
    # Merging dataframes
    print(f'🔂 Merge weather + appointment data')
    df = data.merge(api_data, how='left', on='weather_datetime')
    nanindf = df.isna().sum()
    
    if nanindf.sum() > 0:
        print(f'❌ NaN values in df - ERROR: {nanindf}')
    else:
        print('✅ Done')
        return df
    

In [181]:
surgery_prefix = input('Input Surgery Prefix: ')

Input Surgery Prefix: HPVM


In [182]:
prediction_add_weather(surgery_prefix)

🌤️ Prediction data: HPVM - preparing appointment data
🛜 Requesting forcast from Open-Meteo Weather API 2023-08-21 - 2023-09-11
🔂 Merge weather + appointment data
✅ Done


Unnamed: 0,Appointment booked date,Appointment date,Appointment time,Booked by,Clinician,Rota type,Patient ID,weather_time,weather_datetime,temp,precipitation
0,08-Aug-23,2023-08-21,15:00 - 15:59,"HAYNES, Madeleine (Miss)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,47490525,15:00,2023-08-21 15:00:00,24.4,0.0
1,09-Aug-23,2023-08-22,14:00 - 14:59,"HAYNES, Madeleine (Miss)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,47483285,14:00,2023-08-22 14:00:00,24.4,0.0
2,10-Aug-23,2023-08-22,16:00 - 16:59,"EDUVIE, Savannah (Miss)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,47489000,16:00,2023-08-22 16:00:00,25.3,0.0
3,11-Aug-23,2023-09-05,14:00 - 14:59,"RAHMAN, Saidur (IT/ Informatics)","SAFI, Khushhal (Mr)",Health Partners at VM HCA,48462081,14:00,2023-09-05 14:00:00,28.4,0.0
4,14-Aug-23,2023-08-21,09:00 - 09:59,"PARMAR, Priti (Reception)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,55204923,09:00,2023-08-21 09:00:00,17.2,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2727,11-Sep-23,2023-09-11,17:00 - 17:59,"PARMAR, Priti (Reception)","SQUIER, William (Dr)",Health Partners at VM GP,51076247,17:00,2023-09-11 17:00:00,24.1,0.0
2728,11-Sep-23,2023-09-11,17:00 - 17:59,"SAFI, Khushhal (Mr)","GRIMSTONE, Anna (Dr)",Health Partners at VM GP,47482443,17:00,2023-09-11 17:00:00,24.1,0.0
2729,11-Sep-23,2023-09-11,17:00 - 17:59,"GRIMSTONE, Anna (Dr)","GRIMSTONE, Anna (Dr)",Health Partners at VM GP,47488807,17:00,2023-09-11 17:00:00,24.1,0.0
2730,11-Sep-23,2023-09-11,17:00 - 17:59,"GRIMSTONE, Anna (Dr)","GRIMSTONE, Anna (Dr)",Health Partners at VM GP,25144018,17:00,2023-09-11 17:00:00,24.1,0.0


### Prepare input frame

### Get weather prediction - call API

In [132]:
import requests

# Define the base URL of the API
base_url = 'https://api.open-meteo.com/v1/forecast'

# Define the parameters as a dictionary
params = {
    'latitude': '51.5085',
    'longitude': '0.1257',
    'hourly': 'temperature_2m,precipitation',
    'start_date': start_date,
    'end_date': end_date
    # Add more parameters as needed
}

# Make the API call using the requests library
response = requests.get(base_url, params=params)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse and work with the API response, which is typically in JSON format
    api_data = response.json()['hourly']
    # Now you can work with the data returned by the API
    api_data = pd.DataFrame(api_data)
else:
    # If the request was not successful, handle the error
    print(f"Error: {response.status_code}")
    print(response.text)


In [133]:
api_data = api_data.rename(columns={'time': 'weather_datetime', 'temperature_2m': 'temp', 'precipitation': 'precipitation'})

In [134]:
api_data['weather_datetime'] = pd.to_datetime(api_data['weather_datetime'])

In [135]:
api_data.head()

Unnamed: 0,weather_datetime,temp,precipitation
0,2023-08-31 00:00:00,14.1,0.0
1,2023-08-31 01:00:00,13.4,0.0
2,2023-08-31 02:00:00,12.8,0.0
3,2023-08-31 03:00:00,12.1,0.0
4,2023-08-31 04:00:00,11.7,0.0


### add Weather data to input frame

In [136]:
data = data.merge(api_data, how='left', on='weather_datetime')

In [137]:
data.head()

Unnamed: 0,Appointment booked date,Appointment date,Appointment time,Booked by,Clinician,Rota type,Patient ID,weather_time,weather_datetime,temp,precipitation
0,09-Aug-23,2023-09-17,09:00 - 09:59,"HIPPOLYTE, Christine (Ms)","SAS OMILIANI, Locum",Earls Court Surgery HCA,19581268,09:00,2023-09-17 09:00:00,20.6,0.0
1,09-Aug-23,2023-09-17,10:00 - 10:59,"HIPPOLYTE, Christine (Ms)","SAS OMILIANI, Locum",Earls Court Surgery HCA,17412911,10:00,2023-09-17 10:00:00,20.6,0.6
2,09-Aug-23,2023-09-17,09:00 - 09:59,"JAYATISSA, Rekha (Mrs)","SAS OMILIANI, Locum",Earls Court Surgery HCA,19579948,09:00,2023-09-17 09:00:00,20.6,0.0
3,09-Aug-23,2023-09-17,09:00 - 09:59,"HIPPOLYTE, Christine (Ms)","SAS OMILIANI, Locum",Earls Court Surgery HCA,19579479,09:00,2023-09-17 09:00:00,20.6,0.0
4,09-Aug-23,2023-09-17,10:00 - 10:59,"JAYATISSA, Rekha (Mrs)","SAS OMILIANI, Locum",Earls Court Surgery HCA,17412397,10:00,2023-09-17 10:00:00,20.6,0.6


In [138]:
data.isna().sum()

Appointment booked date    0
Appointment date           0
Appointment time           0
Booked by                  0
Clinician                  0
Rota type                  0
Patient ID                 0
weather_time               0
weather_datetime           0
temp                       0
precipitation              0
dtype: int64

### Add Disease Register to input frame

In [None]:
def predict_add_global_register(df):
    print(f'Appointment DF in: {df.shape}')
    register = pd.read_csv(f'{OUTPUT_DATA}/global_disease_register.csv', dtype='str')
    predict_df = df.merge(regiser, how='left', on='Patient ID')
    print('🔀 Disease Register merged with prediction data')
    
    return predict_df

Unnamed: 0,NHS number,Patient ID,Age in years,Postcode,Sex,Registration date,Ethnicity category,Language,Registration status,FRAILTY,...,IHD,DM,HPT,NDHG,SMI,Latitude,Longitude,IMD2023,dist_to_station,distance_to_surg
0,3369850591,54375936,30,SW5 9UJ,Male,2021-04-22 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Deducted,0.0,...,0,0,0,0,0,51.48914,-0.19286,16808.0,0.266031,0.0827083024043814
1,3380959626,39934971,29,SW10 9ED,Female,2021-12-15 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Deducted,0.0,...,0,0,0,0,0,51.486157,-0.189536,12935.0,0.430928,0.3278224523044783
2,3487638886,19581387,43,HA3 5DS,Male,2008-04-30 00:00:00,Irish,(XaG5t) Main spoken language English,Current,0.03,...,0,0,0,0,0,51.59856,-0.338791,16117.0,0.733226,15.8869684852339
3,3662712466,21999599,33,SW10 9JT,Male,2016-09-08 00:00:00,Irish,(XaG5t) Main spoken language English,Deducted,0.0,...,0,0,0,0,0,51.487298,-0.187128,24135.0,0.58198,0.3646556786251257
4,3691463207,57302966,24,SW5 0EN,Female,2023-05-25 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Current,0.0,...,0,0,0,0,0,51.492328,-0.191409,16808.0,0.228139,0.4023645871127948


In [185]:
predict_df = df.merge(regiser, how='left', on='Patient ID')

NameError: name 'df' is not defined

In [197]:
from showupforhealth.ml_functions.predict import *
data = predict_add_weather('HPVM')
df = predict_add_global_register(data)

🌤️ Prediction: HPVM - preparing appointment data for weather
🛜 Requesting forcast from Open-Meteo Weather API 2023-08-21 - 2023-09-11
🔂 Merge weather + appointment data
✅ Successful: return df
👉 Appointment DF in: (2732, 11)
🔀 Disease Register merged with prediction data - (4020, 32)
❌ Drop NaN = 297


In [195]:
df.isna().sum()

Appointment booked date      0
Appointment date             0
Appointment time             0
Booked by                    0
Clinician                    0
Rota type                    0
Patient ID                   0
weather_time                 0
weather_datetime             0
temp                         0
precipitation                0
NHS number                 297
Age in years               297
Postcode                   297
Sex                        297
Registration date          297
Ethnicity category         297
Language                   297
Registration status        297
FRAILTY                    297
DEPRESSION                 297
OBESITY                    297
IHD                        297
DM                         297
HPT                        297
NDHG                       297
SMI                        297
Latitude                   297
Longitude                  297
IMD2023                    297
dist_to_station            297
distance_to_surg           297
dtype: i

In [196]:
df.duplicated().sum()

0

# Extract No shows from Training set 

In [18]:
data = pd.read_csv(f'{OUTPUT_DATA}/full_train_data.csv')

In [19]:
data.head()

Unnamed: 0,Appointment_status,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,No_shows,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
0,1.0,19580589.0,17.7,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22698.0,0.519419,0.816996,54.0,1.0,17.0,-0.120537,-0.992709,0.707107,-0.707107,-0.5,-0.866025,0.781831,0.62349,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,47551135.0,18.5,0.0,70.0,1.0,0.39,0.0,1.0,0.0,1.0,1.0,0.0,0.0,10169.0,0.540792,1.656309,49.0,0.0,12.0,-0.120537,-0.992709,0.5,-0.866025,-0.5,-0.866025,0.781831,0.62349,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,1.0,19579593.0,19.4,0.0,66.0,1.0,0.03,1.0,0.0,0.0,0.0,1.0,0.0,0.0,8243.0,0.332691,0.600495,37.0,0.0,70.0,-0.120537,-0.992709,0.258819,-0.965926,-0.5,-0.866025,0.433884,-0.900969,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,19581430.0,18.3,0.0,58.0,0.0,0.11,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7626.0,0.155802,0.215224,27.0,0.0,28.0,-0.120537,-0.992709,0.707107,-0.707107,-0.5,-0.866025,0.433884,-0.900969,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,17407665.0,27.7,0.0,61.0,0.0,0.06,0.0,0.0,0.0,1.0,0.0,0.0,0.0,8087.0,0.451039,4.237312,38.0,0.0,30.0,-0.354605,-0.935016,0.707107,-0.707107,-0.5,-0.866025,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
data.shape

(595267, 38)

In [24]:
noshows = data[['Patient ID', 'No_shows']].astype('int')

In [25]:
noshows.head()

Unnamed: 0,Patient ID,No_shows
0,19580589,2
1,47551135,1
2,19579593,10
3,19581430,0
4,17407665,1


In [23]:
noshows.shape

(595267, 2)

In [26]:
noshows.nunique()

Patient ID    34891
No_shows         29
dtype: int64

In [27]:
# Method 2:  
unique_ids = noshows.drop_duplicates(keep='first')

In [28]:
unique_ids.shape

(34891, 2)

In [29]:
unique_ids.to_csv(f'{OUTPUT_DATA}/no_shows_db.csv', index=False)

In [216]:
noshows = pd.read_csv(f'{OUTPUT_DATA}/no_shows_db.csv')
merged = df.merge(noshows, on='Patient ID', how='left')
countin = merged.shape[0]
merged.dropna()
countout = merged.shapehape[0]
print('❌ Dropped NaN = {countin - countout}')
return merged

In [221]:
merged.head()

Unnamed: 0,Appointment booked date,Appointment date,Appointment time,Booked by,Clinician,Rota type,Patient ID,weather_time,weather_datetime,temp,precipitation,NHS number,Age in years,Postcode,Sex,Registration date,Ethnicity category,Language,Registration status,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,Latitude,Longitude,IMD2023,dist_to_station,distance_to_surg,No_shows
0,08-Aug-23,2023-08-21,15:00 - 15:59,"HAYNES, Madeleine (Miss)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,47490525,15:00,2023-08-21 15:00:00,24.4,0.0,4467149767,75,SW5 0SN,Male,2012-05-02 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Current,0.19,1,0,0,0,0,0,0,51.494989,-0.19368,15885.0,0.406034,1.9009797780653097,4.0
1,09-Aug-23,2023-08-22,14:00 - 14:59,"HAYNES, Madeleine (Miss)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,47483285,14:00,2023-08-22 14:00:00,24.4,0.0,4502331678,71,SW10 9AP,Female,2007-06-13 00:00:00,Other White,(XaG5t) Main spoken language English,Current,0.06,0,0,0,0,0,0,0,51.487884,-0.19134,14414.0,0.29701,1.3958649272472958,0.0
2,09-Aug-23,2023-08-22,14:00 - 14:59,"HAYNES, Madeleine (Miss)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,47483285,14:00,2023-08-22 14:00:00,24.4,0.0,4502331678,71,SW10 9AP,Female,2007-06-13 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Current,0.06,0,0,0,0,0,0,0,51.487884,-0.19134,14414.0,0.29701,1.3958649272472958,0.0
3,10-Aug-23,2023-08-22,16:00 - 16:59,"EDUVIE, Savannah (Miss)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,47489000,16:00,2023-08-22 16:00:00,25.3,0.0,4349776621,42,SW15 2QT,Female,2009-02-05 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Current,0.03,0,0,0,0,0,0,0,51.458232,-0.205098,20104.0,0.414799,3.725790888229243,0.0
4,10-Aug-23,2023-08-22,16:00 - 16:59,"EDUVIE, Savannah (Miss)","JALILOVA, Gulshan (Nurse)",Health Partners at VM Nurse,47489000,16:00,2023-08-22 16:00:00,25.3,0.0,4349776621,42,SW15 2QT,Female,2009-02-05 00:00:00,Other White,(XaG5t) Main spoken language English,Current,0.03,0,0,0,0,0,0,0,51.458232,-0.205098,20104.0,0.414799,3.725790888229243,0.0


In [222]:
merged.dropna(inp)

Appointment booked date     0
Appointment date            0
Appointment time            0
Booked by                   0
Clinician                   0
Rota type                   0
Patient ID                  0
weather_time                0
weather_datetime            0
temp                        0
precipitation               0
NHS number                  0
Age in years                0
Postcode                    0
Sex                         0
Registration date           0
Ethnicity category          0
Language                    0
Registration status         0
FRAILTY                     0
DEPRESSION                  0
OBESITY                     0
IHD                         0
DM                          0
HPT                         0
NDHG                        0
SMI                         0
Latitude                    0
Longitude                   0
IMD2023                     0
dist_to_station             0
distance_to_surg            0
No_shows                   82
dtype: int

In [265]:
from showupforhealth.ml_functions.predict import *

In [273]:
surgery_prefix = input('Surgery Prefix: ')
df = predict_add_weather(surgery_prefix=surgery_prefix)
predict_df = predict_add_global_register(df)
df = add_noshows(predict_df)
df = predict_feature_engineering(df)

Surgery Prefix: TGP

🌤️ Prediction: TGP - preparing appointment data for weather
🛜 Requesting forcast from Open-Meteo Weather API 2023-08-21 - 2023-09-11
🔂 Merge weather + appointment data
✅ Successful: return df
👉 Appointment DF in: (2192, 11)
🔀 Disease Register merged with prediction data - (3144, 32)
❌ Drop NaN - Disease Register Merge = 120
❌ Drop NaN - No Shows Merge = 71

🔂 Rename Columns
🔂 Drop deseased and deducted
🔂 Columns to Datetime
🔂 Fix Appointment Time
🔂 book_to_app_days
🔂 booked_by_clinician
🔂 Extract Rota Types
🔂 registered_for_months
🔂 Week
🔂 Month
🔂 Day of week
🔂 Convert Cyclical data
🔂 Drop Column no longer needed
🔂 Rows dropped from Rotas other than spec: 14
🔂 Rows from with Negative book_to_app_days: 0
🔂 Drop rows with Sex Unknonw & Indeterminate
🔂 Labelencode Column Sex
🔂 OneHotEncode Rota types
🔂 Extract Ethnicity Category
🔂 OneHotEncode Ethnicity
🔂 Drop NaN
✅ Done in 0.04 sec (2742, 37)


In [274]:
df.head()

Unnamed: 0,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,No_shows,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
0,56724742,18.9,0.0,0,0,0.0,0,0,0,0,0,0,0,18372.0,1.14939,2.9084504794527537,0.0,39.0,0,1.0,-0.822984,-0.568065,0.258819,-0.965926,-0.866025,-0.5,-0.433884,-0.900969,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,33178428,20.7,0.0,62,1,0.17,0,0,0,0,1,0,0,13213.0,0.830984,0.940781257685243,0.0,28.0,0,2.0,-0.822984,-0.568065,-0.5,-0.866025,-0.866025,-0.5,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,62165767,18.9,0.0,36,0,0.06,0,0,0,0,0,0,0,27528.0,0.66354,1.179844670384917,1.0,33.0,0,2.0,-0.885456,-0.464723,-0.258819,-0.965926,-0.866025,-0.5,0.974928,-0.222521,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,57230466,18.9,0.0,0,0,0.0,0,0,0,0,0,0,0,27528.0,0.66354,1.179844670384917,0.0,33.0,0,1.0,-0.885456,-0.464723,-0.258819,-0.965926,-0.866025,-0.5,0.974928,-0.222521,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,83773,24.5,0.0,26,1,0.06,0,0,0,0,0,0,0,4981.0,0.676391,0.2816016605548502,0.0,21.0,0,5.0,-0.822984,-0.568065,-0.5,-0.866025,-0.866025,-0.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [277]:
train = pd.read_csv(f'{OUTPUT_DATA}/full_train_data.csv')

In [278]:
train.head()

Unnamed: 0,Appointment_status,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,No_shows,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
0,1.0,19580589.0,17.7,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22698.0,0.519419,0.816996,54.0,1.0,17.0,-0.120537,-0.992709,0.707107,-0.707107,-0.5,-0.866025,0.781831,0.62349,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,47551135.0,18.5,0.0,70.0,1.0,0.39,0.0,1.0,0.0,1.0,1.0,0.0,0.0,10169.0,0.540792,1.656309,49.0,0.0,12.0,-0.120537,-0.992709,0.5,-0.866025,-0.5,-0.866025,0.781831,0.62349,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,1.0,47551135.0,18.5,0.0,70.0,1.0,0.39,0.0,1.0,0.0,1.0,1.0,0.0,0.0,10169.0,0.540792,1.656309,49.0,0.0,12.0,-0.120537,-0.992709,0.5,-0.866025,-0.5,-0.866025,0.781831,0.62349,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,1.0,19579593.0,19.4,0.0,66.0,1.0,0.03,1.0,0.0,0.0,0.0,1.0,0.0,0.0,8243.0,0.332691,0.600495,37.0,0.0,70.0,-0.120537,-0.992709,0.258819,-0.965926,-0.5,-0.866025,0.433884,-0.900969,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,19581430.0,18.3,0.0,58.0,0.0,0.11,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7626.0,0.155802,0.215224,27.0,0.0,28.0,-0.120537,-0.992709,0.707107,-0.707107,-0.5,-0.866025,0.433884,-0.900969,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [405]:
predict = pd.read_csv(f'{PREDICT_DATA}/TGP_predict.csv')

In [406]:
predict.head()

Unnamed: 0,Appointment booked date,Appointment date,Appointment time,Booked by,Clinician,Rota type,Patient ID
0,17-Jul-23,25-Aug-23,11:00 - 11:59,Ivan Petkov,Ms Maria Amparo D Belda Arribas,The Good Practice Nurse,56724742
1,17-Jul-23,11-Sep-23,10:00 - 10:59,Mrs Myra Cosio,Mrs Myra Cosio,The Good Practice Nurse,50183782
2,24-Jul-23,04-Sep-23,10:00 - 10:59,Mrs Myra Cosio,Mrs Myra Cosio,The Good Practice Nurse,49711751
3,28-Jul-23,25-Aug-23,14:00 - 14:59,Miss Abigail Kimbati,Dr Caroline Sinclair,The Good Practice GP,33178428
4,28-Jul-23,30-Aug-23,13:00 - 13:59,Ivan Petkov,Dr Maria Lazari,The Good Practice GP,62165767


In [407]:
predict['Patient ID'].shape

(2192,)

In [408]:
sorted_df = predict.sort_values(by="Patient ID")
sorted_df

Unnamed: 0,Appointment booked date,Appointment date,Appointment time,Booked by,Clinician,Rota type,Patient ID
2024,08-Sep-23,08-Sep-23,09:00 - 09:59,Dr Justin Hammond,Dr Justin Hammond,The Good Practice GP,14704
740,25-Aug-23,25-Aug-23,10:00 - 10:59,Miss Abigail Kimbati,Dr Caroline Sinclair,The Good Practice GP,14704
540,23-Aug-23,23-Aug-23,15:00 - 15:59,Ivan Petkov,Mrs Myra Cosio,The Good Practice Nurse,14704
334,21-Aug-23,04-Sep-23,16:00 - 16:59,Mrs Myra Cosio,Ms Maria Amparo D Belda Arribas,The Good Practice Nurse,14704
44,11-Aug-23,21-Aug-23,14:00 - 14:59,Miss Abigail Kimbati,Mrs Myra Cosio,The Good Practice Nurse,14704
...,...,...,...,...,...,...,...
316,21-Aug-23,24-Aug-23,15:00 - 15:59,Unknown,Dr Tom Thwaites,The Good Practice GP,62886363
944,29-Aug-23,29-Aug-23,17:00 - 17:59,Dr Justin Hammond,Dr Justin Hammond,The Good Practice GP,62886367
1421,01-Sep-23,07-Sep-23,13:00 - 13:59,Unknown,Dr Rajni Aldridge,The Good Practice GP,62886442
204,21-Aug-23,21-Aug-23,09:00 - 09:59,Ivan Petkov,Locum Gp One Elizabeth Sas Sinclair,The Good Practice GP,62907926


In [358]:
from showupforhealth.ml_functions.predict import *

In [359]:
ecs = patient_list("SMW")

Duplicates: 0
Number of appointments in df 2807
Unique Patient List length: 1665


In [369]:
data = streamlit_predict('ECS')


🌤️ Prediction: ECS - preparing appointment data for weather
🛜 Requesting forcast from Open-Meteo Weather API 2023-08-31 - 2023-09-20
🔂 Merge weather + appointment data
✅ Successful: return df
👉 Appointment DF in: (1209, 11)
🔀 Disease Register merged with prediction data - (1916, 32)
❌ Drop NaN - Disease Register Merge = 63
❌ Drop NaN - No Shows Merge = 56

🔂 Rename Columns
🔂 Drop deseased and deducted
🔂 Columns to Datetime
🔂 Fix Appointment Time
🔂 book_to_app_days
🔂 booked_by_clinician
🔂 Extract Rota Types
🔂 registered_for_months
🔂 Week
🔂 Month
🔂 Day of week
🔂 Convert Cyclical data
🔂 Drop Column no longer needed
🔂 Rows dropped from Rotas other than spec: 0
🔂 Rows from with Negative book_to_app_days: 0
🔂 Drop rows with Sex Unknonw & Indeterminate
🔂 Labelencode Column Sex
🔂 OneHotEncode Rota types
🔂 Extract Ethnicity Category
🔂 OneHotEncode Ethnicity
🔂 Drop NaN
✅ Done in 0.04 sec (1778, 37)
✅❗️TEST PASSED - df has 37 columns!


In [370]:
data.head()

Unnamed: 0,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,No_shows,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
0,19581268,19.4,0.0,67,0,0.14,0,0,0,0,0,1,0,7626.0,0.120761,0.2334414054531315,9.0,39.0,0,19.0,-0.970942,-0.239316,0.707107,-0.707107,-1.0,-1.83697e-16,-0.781831,0.62349,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,17412911,19.7,0.0,73,0,0.28,0,1,1,1,1,0,0,6478.0,0.0722999,3.914816137524795,2.0,39.0,0,52.0,-0.970942,-0.239316,0.5,-0.866025,-1.0,-1.83697e-16,-0.781831,0.62349,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,19579948,19.4,0.0,66,1,0.08,0,0,0,0,1,0,0,8243.0,0.376227,0.6330692363906465,0.0,39.0,0,38.0,-0.970942,-0.239316,0.707107,-0.707107,-1.0,-1.83697e-16,-0.781831,0.62349,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,19579479,19.4,0.0,58,1,0.03,1,0,0,1,0,0,0,1794.0,0.818463,4.513094283753243,2.0,39.0,0,45.0,-0.970942,-0.239316,0.707107,-0.707107,-1.0,-1.83697e-16,-0.781831,0.62349,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,17412397,19.7,0.0,55,1,0.0,0,0,0,0,0,0,0,12092.0,0.299485,4.87361314050634,0.0,39.0,0,1.0,-0.970942,-0.239316,0.5,-0.866025,-1.0,-1.83697e-16,-0.781831,0.62349,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [384]:
s



1143

In [379]:
styled_df

Unnamed: 0,Patient ID,Other Data
0,1,A
1,2,B
2,3,C
3,4,D
4,5,E
5,2,F
6,3,G
7,6,H
8,7,I
9,3,J


In [35]:
from showupforhealth.ml_functions.predict import *

In [38]:
df = streamlit_predict('TGP')


👩🏻‍🦰 Appointments: 2192 🧑🏻‍🦰 Unique Patient IDs; 1254
🛜 Requesting forcast from Open-Meteo Weather API 2023-08-21 - 2023-09-11
🔂 Merge weather + appointment data
✅ Successful: return df
👉 Appointment DF in: (2192, 11)
🔀 Disease Register merged with prediction data - (2192, 32)
❌ Drop NaN - Disease Register Merge = 120
🔀 Merge No_shows from csv file
❌ Drop NaN - No Shows Merge = 0

🔂 Rename Columns
🔂 Drop deseased and deducted
🔂 Columns to Datetime
🔂 Fix Appointment Time
🔂 book_to_app_days
🔂 booked_by_clinician
🔂 Extract Rota Types
🔂 registered_for_months
🔂 Week
🔂 Month
🔂 Day of week
🔂 Convert Cyclical data
🔂 Drop Column no longer needed
🔂 Rows dropped from Rotas other than spec: 0
🔂 Rows from with Negative book_to_app_days: 0
🔂 Drop rows with Sex Unknonw & Indeterminate
🔂 Labelencode Column Sex
🔂 OneHotEncode Rota types
🔂 Extract Ethnicity Category
🔂 OneHotEncode Ethnicity
🔂 Drop NaN
✅ Done in 0.04 sec (2032, 37)
❗️TEST PASSED - df has 37 columns!


In [40]:
df.head()

Unnamed: 0,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,No_shows,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
0,56724742,18.9,0.0,0,0,0.0,0,0,0,0,0,0,0,18372.0,1.14939,2.9084504794527537,0.0,39.0,0,1.0,-0.822984,-0.568065,0.258819,-0.965926,-0.866025,-0.5,-0.433884,-0.900969,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,33178428,20.7,0.0,62,1,0.17,0,0,0,0,1,0,0,13213.0,0.830984,0.940781257685243,0.0,28.0,0,2.0,-0.822984,-0.568065,-0.5,-0.866025,-0.866025,-0.5,-0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,62165767,18.9,0.0,36,0,0.06,0,0,0,0,0,0,0,27528.0,0.66354,1.179844670384917,1.0,33.0,0,2.0,-0.885456,-0.464723,-0.258819,-0.965926,-0.866025,-0.5,0.974928,-0.222521,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,57230466,18.9,0.0,0,0,0.0,0,0,0,0,0,0,0,27528.0,0.66354,1.179844670384917,0.0,33.0,0,1.0,-0.885456,-0.464723,-0.258819,-0.965926,-0.866025,-0.5,0.974928,-0.222521,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,83773,24.5,0.0,26,1,0.06,0,0,0,0,0,0,0,4981.0,0.676391,0.2816016605548502,0.0,21.0,0,5.0,-0.822984,-0.568065,-0.5,-0.866025,-0.866025,-0.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [418]:
duplicates = df[df.duplicated(subset='Patient ID', keep=False)]
duplicates


Unnamed: 0,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,No_shows,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
0,49849663,18.1,0.0,64,0,0.19,0,0,0,0,0,0,0,8812.0,0.311416,0.8409160591027163,0.0,84.0,1,22.0,-0.822984,-0.568065,0.866025,-5.000000e-01,-0.866025,-5.000000e-01,0.781831,0.623490,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,49849663,18.1,0.0,64,0,0.19,0,0,0,0,0,0,0,8812.0,0.311416,0.8409160591027163,0.0,84.0,1,22.0,-0.822984,-0.568065,0.866025,-5.000000e-01,-0.866025,-5.000000e-01,0.781831,0.623490,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,49855842,21.5,0.0,78,1,0.17,0,0,0,0,1,0,0,11573.0,0.336278,0.8291633532415561,4.0,84.0,1,34.0,-0.885456,-0.464723,-0.707107,-7.071068e-01,-1.000000,-1.836970e-16,-0.433884,-0.900969,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,49855842,21.5,0.0,78,1,0.17,0,0,0,0,1,0,0,11573.0,0.336278,0.8291633532415561,4.0,84.0,1,34.0,-0.885456,-0.464723,-0.707107,-7.071068e-01,-1.000000,-1.836970e-16,-0.433884,-0.900969,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6,49835050,17.5,0.0,76,0,0.08,0,0,0,1,1,0,0,10169.0,0.62029,1.1297748286613705,1.0,69.0,1,29.0,-0.822984,-0.568065,0.866025,-5.000000e-01,-0.866025,-5.000000e-01,0.974928,-0.222521,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2602,49832299,24.7,0.0,84,0,0.42,0,1,0,0,1,1,0,21201.0,0.259543,0.5758971472756363,4.0,0.0,0,55.0,-0.970942,-0.239316,-0.866025,-5.000000e-01,-1.000000,-1.836970e-16,0.000000,1.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2604,49834100,24.1,0.0,79,0,0.42,0,0,1,1,1,0,0,23245.0,0.578405,1.0811399520093972,3.0,0.0,0,63.0,-0.970942,-0.239316,-0.965926,-2.588190e-01,-1.000000,-1.836970e-16,0.000000,1.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2605,49834100,24.1,0.0,79,0,0.42,0,0,1,1,1,0,0,23245.0,0.578405,1.0811399520093972,3.0,0.0,0,63.0,-0.970942,-0.239316,-0.965926,-2.588190e-01,-1.000000,-1.836970e-16,0.000000,1.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2606,49834100,24.1,0.0,79,0,0.42,0,0,1,1,1,0,0,23245.0,0.578405,1.0811399520093972,3.0,0.0,0,63.0,-0.970942,-0.239316,-0.965926,-2.588190e-01,-1.000000,-1.836970e-16,0.000000,1.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [403]:
sorted_df = df.sort_values(by="Patient ID")
sorted_df

Unnamed: 0,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,No_shows,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
2334,14704,30.5,0.0,35,0,0.11,0,0,0,0,0,0,0,16091.0,0.77766,0.2509768123433546,12.0,2.0,0,5.0,-0.935016,-0.354605,-0.707107,-0.707107,-1.000000,-1.836970e-16,-0.433884,-0.900969,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
51,14704,24.5,0.0,35,0,0.11,0,0,0,0,0,0,0,16091.0,0.77766,0.2509768123433546,12.0,10.0,0,5.0,-0.822984,-0.568065,-0.500000,-0.866025,-0.866025,-5.000000e-01,0.000000,1.000000,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2333,14704,30.5,0.0,35,0,0.11,0,0,0,0,0,0,0,16091.0,0.77766,0.2509768123433546,12.0,2.0,0,5.0,-0.935016,-0.354605,-0.707107,-0.707107,-1.000000,-1.836970e-16,-0.433884,-0.900969,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
421,14704,25.2,0.0,35,0,0.11,0,0,0,0,0,0,0,16091.0,0.77766,0.2509768123433546,12.0,14.0,0,5.0,-0.935016,-0.354605,-0.866025,-0.500000,-1.000000,-1.836970e-16,0.000000,1.000000,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2521,14704,23.9,0.0,35,0,0.11,0,0,0,0,0,0,0,16091.0,0.77766,0.2509768123433546,12.0,0.0,1,5.0,-0.935016,-0.354605,0.707107,-0.707107,-1.000000,-1.836970e-16,-0.433884,-0.900969,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,62843566,21.8,0.0,22,1,0.00,0,0,0,0,0,0,0,13544.0,1.11934,0.2164411990155491,0.0,0.0,0,2.0,-0.822984,-0.568065,0.258819,-0.965926,-0.866025,-5.000000e-01,0.781831,0.623490,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1182,62886367,19.1,0.0,54,0,0.06,0,0,0,0,0,0,0,21872.0,0.715519,0.5871905797320917,0.0,0.0,1,2.0,-0.885456,-0.464723,-0.965926,-0.258819,-0.866025,-5.000000e-01,0.781831,0.623490,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1183,62886367,19.1,0.0,54,0,0.06,0,0,0,0,0,0,0,21872.0,0.715519,0.5871905797320917,0.0,0.0,1,2.0,-0.885456,-0.464723,-0.965926,-0.258819,-0.866025,-5.000000e-01,0.781831,0.623490,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1791,62886442,29.2,0.0,33,0,0.00,0,0,0,0,0,0,0,18277.0,0.786117,0.5306882041114966,0.0,6.0,0,2.0,-0.935016,-0.354605,-0.258819,-0.965926,-1.000000,-1.836970e-16,0.433884,-0.900969,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [3]:
data1 = {
    'Patient ID': [1, 2, 3, 4, 5, 2, 3, 6, 7, 3],
    'Other Data': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
}
df1 = pd.DataFrame(data1)
df1

Unnamed: 0,Patient ID,Other Data
0,1,A
1,2,B
2,3,C
3,4,D
4,5,E
5,2,F
6,3,G
7,6,H
8,7,I
9,3,J


In [4]:
data2 = {
    'Patient ID': [1, 2, 3, 4, 5, 6, 7, 7],
    'Other Data': ['1A', '2B', '3C', '4D', '5E', '6F', '7G', '77G']
}
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,Patient ID,Other Data
0,1,1A
1,2,2B
2,3,3C
3,4,4D
4,5,5E
5,6,6F
6,7,7G
7,7,77G


In [5]:
merged = df1.merge(df2, how='left', on='Patient ID')
merged

Unnamed: 0,Patient ID,Other Data_x,Other Data_y
0,1,A,1A
1,2,B,2B
2,3,C,3C
3,4,D,4D
4,5,E,5E
5,2,F,2B
6,3,G,3C
7,6,H,6F
8,7,I,7G
9,7,I,77G


In [3]:
all = pd.read_csv(f'{OUTPUT_DATA}/global_disease_register.csv')

  all = pd.read_csv(f'{OUTPUT_DATA}/global_disease_register.csv')


In [4]:
len(all['Patient ID'].unique().tolist())

57786

In [5]:
all.shape

(74420, 22)

In [6]:
all.duplicated().sum()

0

In [7]:
all.sort_values(by='Patient ID')

Unnamed: 0,NHS number,Patient ID,Age in years,Postcode,Sex,Registration date,Ethnicity category,Language,Registration status,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,Latitude,Longitude,IMD2023,dist_to_station,distance_to_surg
21923,4244939954,14704,35,SW10 0LD,Female,2021-03-12 00:00:00,Indian or British Indian,(XaG5t) Main spoken language English,Current,0.11,0,0,0,0,0,0,0,51.481896,-0.181481,16091.0,0.777660,0.250977
21922,4244939954,14704,35,SW10 0LD,Female,2021-03-12 00:00:00,Pakistani or British Pakistani,(XaG5t) Main spoken language English,Current,0.11,0,0,0,0,0,0,0,51.481896,-0.181481,16091.0,0.777660,0.250977
26218,6240260134,52153,57,SW10 0NH,Male,2021-01-21 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Current,0.03,0,0,0,0,0,0,0,51.480563,-0.180756,9898.0,0.640157,0.297752
18645,6376613101,62965,22,SW10 0NH,Male,2021-01-21 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Current,0.06,0,0,0,0,0,0,0,51.480563,-0.180756,9898.0,0.640157,0.297752
29493,4122528445,64081,34,SW5 0NJ,Male,2020-04-09 00:00:00,Other White,(XaG5t) Main spoken language English,Current,0.00,0,0,0,0,0,0,0,51.492893,-0.186661,18346.0,0.279953,0.371664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8800,6481552672,62908567,20,SW3 5AT,Male,2023-03-27 00:00:00,British or Mixed British,(XaG5t) Main spoken language English,Current,0.00,0,0,0,0,0,0,0,51.483433,-0.172792,13544.0,1.170870,0.143103
57258,7244954956,62908806,22,SW1X 8HU,Male,2023-02-21 00:00:00,Other White,(XaG5t) Main spoken language English,Current,0.00,0,0,0,0,0,0,0,51.498256,-0.155156,23381.0,0.513568,0.330289
21642,7282071160,62910975,33,SW10 0JZ,Female,2022-11-30 00:00:00,Other White,(XaG5t) Main spoken language English,Current,0.00,0,0,0,0,0,0,0,51.483631,-0.183169,16091.0,0.934690,0.369751
6900,7250684094,62911566,29,SW10 9AW,Male,2023-08-22 00:00:00,Other White,(XaG5t) Main spoken language English,Current,0.00,0,0,0,0,0,0,0,51.487373,-0.190813,14414.0,0.327030,0.166894


In [8]:
all.drop_duplicates(subset='Patient ID', keep='first', inplace=True)


In [9]:
all.shape

(57786, 22)

In [14]:
noshows = pd.read_csv(f'{OUTPUT_DATA}/no_shows_db.csv', dtype='int')

In [15]:
noshows.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34323 entries, 0 to 34322
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Patient ID  34323 non-null  int64
 1   No_shows    34323 non-null  int64
dtypes: int64(2)
memory usage: 536.4 KB


In [16]:
noshows


Unnamed: 0,Patient ID,No_shows
0,19580589,2
1,47551135,2
2,19579593,10
3,19581430,0
4,17407665,2
...,...,...
34318,47561613,1
34319,31744430,0
34320,53146052,0
34321,47486790,0


In [17]:
noshows['Patient ID'].duplicated().sum()

0

In [31]:
from showupforhealth.ml_functions.predict import *

In [34]:
data = streamlit_predict('HPVM')


👩🏻‍🦰 Appointments: 2732 🧑🏻‍🦰 Unique Patient IDs; 1295
🛜 Requesting forcast from Open-Meteo Weather API 2023-08-21 - 2023-09-11
🔂 Merge weather + appointment data
✅ Successful: return df
👉 Appointment DF in: (2732, 11)
🔀 Disease Register merged with prediction data - (2732, 32)
❌ Drop NaN - Disease Register Merge = 297
🔀 Merge No_shows from csv file
❌ Drop NaN - No Shows Merge = 0

🔂 Rename Columns
🔂 Drop deseased and deducted
🔂 Columns to Datetime
🔂 Fix Appointment Time
🔂 book_to_app_days
🔂 booked_by_clinician
🔂 Extract Rota Types
🔂 registered_for_months
🔂 Week
🔂 Month
🔂 Day of week
🔂 Convert Cyclical data
🔂 Drop Column no longer needed
🔂 Rows dropped from Rotas other than spec: 0
🔂 Rows from with Negative book_to_app_days: 0
🔂 Drop rows with Sex Unknonw & Indeterminate
🔂 Labelencode Column Sex
🔂 OneHotEncode Rota types
🔂 Extract Ethnicity Category
🔂 OneHotEncode Ethnicity
🔂 Drop NaN
✅ Done in 0.04 sec (2366, 37)
❗️TEST PASSED - df has 37 columns!
