In [1]:
# Importing default Libraries
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns
import warnings

# Hi-resolution Plots and Matplotlib inline
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Set the maximum number of rows and columns to be displayed
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
warnings.filterwarnings('ignore')

# "magic commands" to enable autoreload of your imported packages
%load_ext autoreload
%autoreload 2

In [2]:
RAW_DATA = '~/code/janduplessis883/data-showup/data/raw-data/'
WEATHER_DATA = '~/code/janduplessis883/data-showup/data/weather/weather.csv'
IMD_DATA = '~/code/janduplessis883/data-showup/data/imd-master/imd_master.csv'

OUTPUT_DATA = '~/code/janduplessis883/data-showup/data/output-data/'

In [66]:
def combine_clinical(surgery_list = ['TCP', 'TGP', 'SMW', 'KMC', 'HPVM', 'ECS']):
    df_list = []
    for surgery_prefix in surgery_list:
        print(surgery_prefix)
        clinical = pd.read_csv(f'{RAW_DATA}{surgery_prefix}/{surgery_prefix}_CLINICAL.csv')
        df_list.append(clinical)
        
    full_clinical = pd.concat(df_list, axis=0, ignore_index=True)
    print(f'Full Clinical List - {full_clinical.shape}')  
    print('Drop Duplicates')
    full_clinical.drop_duplicates()
    full_clinical.to_csv(f'{OUTPUT_DATA}full_clinical.csv', index=False)
    return full_clinical

In [67]:
full_clinical = combine_clinical()

TCP
TGP
SMW
KMC
HPVM
ECS
Full Clinical List - (127634, 12)
Drop Duplicates


In [68]:
disease = pd.read_csv(f'{OUTPUT_DATA}global_disease_register.csv')

In [69]:
disease.head()

Unnamed: 0,NHS number,Patient ID,Age in years,Postcode,Sex,Registration date,Ethnicity category,Language,Registration status,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,Latitude,Longitude,IMD2023,dist_to_station,distance_from_surg
0,3369850591,54375936,30,SW5 9UJ,Male,22/04/2021 00:00,British or Mixed British,(XaG5t) Main spoken language English,Deducted,0.0,0,0,0,0,0,0,0,51.48914,-0.19286,16808,0.266031,0.082708
1,3380959626,39934971,29,SW10 9ED,Female,15/12/2021 00:00,British or Mixed British,(XaG5t) Main spoken language English,Deducted,0.0,0,0,0,0,0,0,0,51.486157,-0.189536,12935,0.430928,0.327822
2,3487638886,19581387,43,HA3 5DS,Male,30/04/2008 00:00,Irish,(XaG5t) Main spoken language English,Current,0.03,0,0,0,0,0,0,0,51.59856,-0.338791,16117,0.733226,15.886968
3,3662712466,21999599,33,SW10 9JT,Male,08/09/2016 00:00,Irish,(XaG5t) Main spoken language English,Deducted,0.0,0,0,0,0,0,0,0,51.487298,-0.187128,24135,0.58198,0.364656
4,3691463207,57302966,24,SW5 0EN,Female,25/05/2023 00:00,British or Mixed British,(XaG5t) Main spoken language English,Current,0.0,0,0,0,0,0,0,0,51.492328,-0.191409,16808,0.228139,0.402365


In [70]:
disease.shape

(7338, 22)

In [71]:
link = disease[['NHS number', 'Patient ID', 'Registration status']]
link

Unnamed: 0,NHS number,Patient ID,Registration status
0,3369850591,54375936,Deducted
1,3380959626,39934971,Deducted
2,3487638886,19581387,Current
3,3662712466,21999599,Deducted
4,3691463207,57302966,Current
...,...,...,...
7333,7314609810,57474657,Current
7334,7316268751,56951414,Current
7335,7316327731,57128187,Current
7336,7316476796,57437675,Current


In [74]:
link.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7338 entries, 0 to 7337
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   NHS number           7338 non-null   int64 
 1   Patient ID           7338 non-null   int64 
 2   Registration status  7338 non-null   object
dtypes: int64(2), object(1)
memory usage: 172.1+ KB


In [75]:
full_clinical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127634 entries, 0 to 127633
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   NHS number  126325 non-null  object
 1   WEIGHT      75942 non-null   object
 2   HEIGHT      98632 non-null   object
 3   DEPRESSION  117773 non-null  object
 4   SMI         127634 non-null  object
 5   BP          89772 non-null   object
 6   HBA1C       49390 non-null   object
 7   DM          127634 non-null  object
 8   IHD         127634 non-null  object
 9   WEIGHT      27862 non-null   object
 10  Depression  9861 non-null    object
 11  TYPE REG    9861 non-null    object
dtypes: object(12)
memory usage: 11.7+ MB


In [76]:
full_clinical['NHS number'] = full_clinical['NHS number'].astype('int')

ValueError: invalid literal for int() with base 10: '712 531 0184'

In [73]:
final = link.merge(full_clinical, on="NHS number", how='left')

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [64]:
final.tail(100)

Unnamed: 0,NHS number,WEIGHT,HEIGHT,DEPRESSION,SMI,BP,HBA1C,DM,IHD,WEIGHT.1,Depression,TYPE REG,Patient ID,Registration status
0,?748433,95 Kg,1.85 m,,,127 / 88,,,,,,GMS,19581607,Current
1,?510243,59 Kg,1.62 m,,,130 / 80,,,,,,GMS,17407912,Current
2,?384557,69 Kg,1.79 m,,,127 / 91,,,,,,GMS,17412852,Current
3,?493228,70 Kg,1.73 m,,,120 / 70,,,,,,GMS,17407922,Current
4,?521958,52 Kg,1.68 m,,,123 / 78,,,,,,GMS,19579898,Current
5,?491533,30 Kg,1.35 m,,,,,,,,,GMS,17407925,Current


In [33]:
final.to_csv(f'{OUTPUT_DATA}full_clinical.csv', index=False)