This file contains some functions for the QC steps. Functions are tested on an APC table.
I assume the tables from different years are merged.

# Connection

In [171]:
import pandas as pd
import pyodbc
import sqlalchemy
import numpy as np

In [172]:
def connect():
    return pyodbc.connect(
        'DRIVER={/usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so}; SERVER=192.168.5.78;'
        'DATABASE=IMS-NHSDigital;UID=sa;PWD=2HsxpmkDLSlHlT;port=1433;'
        'TDS_Version=8.0;')
engine = sqlalchemy.create_engine('mssql+pyodbc://', creator=connect)
conn = engine.connect()

  "No driver name specified; "


# Check on positive patients

Check that positive patients in IMS-NHSDigital database corresponds to the positive patients in Shieffield database (table from Flora) in terms of:

* HES ID
* birth year
* gender
* whether or not they had right heart catherer at Sheffield

## Check that the HES IDs are the same

In [173]:
#collect unique HES_IDs from NHSDigital
#Tablename will be something like APC_positive
tablename='dbo.NIC58999_APC_Linkage_201299'
label_ID= 'ENCRYPTED_HESID'

query= 'SELECT ' + label_ID +' from ' + tablename

positive_IDs= pd.read_sql(query, conn)['ENCRYPTED_HESID'].unique()

In [174]:
#read ID columns from reference file
Sheffdb_tablename= '/home/vsalvatelli/SharedData/Patient_group_file_SQL_v2.xlsx'
#there will be a 'ENCRYPTED_HESID' column
Sheffdb_label_ID= 'STHFT_Study_ID'

#these should be already unique IDs, but just in case
Sheffield_pos_IDs=pd.read_excel(Sheffdb_tablename)[Sheffdb_label_ID].unique()

In [175]:
#basic test: is the number of patients the same?
len(positive_IDs)==len(Sheffield_pos_IDs)

#second test: are the IDs the same?
#if the two lists are identical this returns True
positive_IDs == Sheffield_pos_IDs

  


False

In [176]:
#if the previous test fails, we can check at least if Sheffield_pos_IDs is a subset of positive_IDs (or the opposite)
set(positive_IDs).issubset(set(Sheffield_pos_IDs))

False

## Check that birth date, gender and RHC@Sheffield are the same

In [177]:
#collect info about positives from NHSDigital
#Tablename will be something like APC_positive
tablename='dbo.NIC58999_APC_Linkage_201299'
label_ID= 'ENCRYPTED_HESID'
date_birth= 'MYDOB'
gender='SEX'
hospital= 'PROCODE3'
opertn= ["OPERTN_0" + str(i+1) for i in range(9)] + ["OPERTN_" + str(i+10) for i in range(15)]

cols= [label_ID] + [date_birth] + [gender] + [hospital] + opertn
query= 'SELECT ' + ','.join(cols) +' from ' + tablename 

df_NHS= pd.read_sql(query, conn)

In [178]:
print(df_NHS.shape)
df_NHS.head(5)

(13266, 28)


Unnamed: 0,ENCRYPTED_HESID,MYDOB,SEX,PROCODE3,OPERTN_01,OPERTN_02,OPERTN_03,OPERTN_04,OPERTN_05,OPERTN_06,...,OPERTN_15,OPERTN_16,OPERTN_17,OPERTN_18,OPERTN_19,OPERTN_20,OPERTN_21,OPERTN_22,OPERTN_23,OPERTN_24
0,18BAC692C0C58B5EE7E7DFD8E6C843CC,121938,1,RWP,-,,,,,,...,,,,,,,,,,
1,F6C2FA7F10D852FD1B9308FF1F0437A9,61945,2,RFS,M479,,,,,,...,,,,,,,,,,
2,E2D507B62FFCEFBD2537D328D6539683,61934,2,RR8,-,,,,,,...,,,,,,,,,,
3,950075F6906AA9E50AFAADD0023C818A,31959,1,REM,X352,,,,,,...,,,,,,,,,,
4,F714475A05E6B6CD6ECE53EFCDD1A8C7,81960,2,RRF,X823,,,,,,...,,,,,,,,,,


In [179]:
# RHQ is the procode3 for Sheffield Teaching Hospitals, 
#is this the right one? Or we need something more specific at 5 digit level?
SHF_procode= 'RHQ'
print(len(df_NHS['ENCRYPTED_HESID'].unique()))
print(sum(df_NHS['PROCODE3']== SHF_procode))

3376
4082


In [180]:
df_NHS['PROCODE3'].value_counts().head(5)

RHQ    4082
RR8     595
RW3     571
RR1     413
REM     406
Name: PROCODE3, dtype: int64

In [181]:
#K65 corresponds to catherisation of hearth
#K65.2 (dot not included in the field) is "catherisation of right side of heart NEC"
RHC_code= 'K652'
#checking that this code appears
res=list()
for name in opertn:
    res.append(sum(df_NHS[name]== RHC_code))
print(res)
print('Total number of patients that did a RHC in {}: {}'.format(tablename,sum(res)))

[612, 25, 3, 3, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Total number of patients that did a RHC in dbo.NIC58999_APC_Linkage_201299: 653


In [188]:
#add column that says if there is RHC in the opertn cols
#tried to automatically create list of conditions ... to much time to code
#conditions=list()
#for name in opertn:
#    conditions.append(''.join(['df_NHS["', name,'"]', '== "'  ,RHC_code,'"']))

#seems reasonable to check only the first 8 cols (they are in order of priority)
cond1= df_NHS["OPERTN_01"]== RHC_code
cond2= df_NHS["OPERTN_02"]== RHC_code
cond3= df_NHS["OPERTN_03"]== RHC_code
cond4= df_NHS["OPERTN_04"]== RHC_code
cond5= df_NHS["OPERTN_05"]== RHC_code
cond6= df_NHS["OPERTN_06"]== RHC_code
cond7= df_NHS["OPERTN_07"]== RHC_code
cond8= df_NHS["OPERTN_08"]== RHC_code

#df_NHS['RHC']=df_NHS[(cond1) | (cond2)]
#df_NHS['RHC']=np.where(df_NHS[(cond1) | (cond2)| (cond3) |(cond4) | (cond5) | (cond6)| (cond7) | (cond8)], 1,0)
df_NHS['RHC']=cond1 | cond2 | cond3 | cond4 | cond5 | cond6 |cond7 |cond8
df_NHS['RHC'].value_counts()

False    12613
True       653
Name: RHC, dtype: int64

In [201]:
#positive patients are those withRHC ==True
#there are patients that did more than one RHC --> which one we have to keep?
len(df_NHS[df_NHS['RHC']==True].ENCRYPTED_HESID)

653

In [203]:
#finding atients that did a RHC at Sheffield
cond1= df_NHS['RHC'] == True
cond2= df_NHS['PROCODE3']==SHF_procode
df_NHS['RHC_at_SHF']= cond1 & cond2
df_NHS['RHC_at_SHF'].value_counts()

False    12664
True       602
Name: RHC_at_SHF, dtype: int64

In [228]:
df_NHS_final=df_NHS.drop(opertn,axis=1)
df_NHS_final= df_NHS_final.drop('PROCODE3',axis=1)
print(df_NHS_final.shape)
print(len(df_NHS_final['ENCRYPTED_HESID'].unique()))
df_NHS_final.head(2)

(13266, 5)
3376


Unnamed: 0,ENCRYPTED_HESID,MYDOB,SEX,RHC,RHC_at_SHF
0,18BAC692C0C58B5EE7E7DFD8E6C843CC,121938,1,False,False
1,F6C2FA7F10D852FD1B9308FF1F0437A9,61945,2,False,False


### This doesn't work!!!

In [229]:
#manipulation on the date of birth to extract the year
df_NHS_final['MYDOB']=pd.to_datetime(df_NHS_final['MYDOB'],dayfirst=True)
                                     #,format='%m%d%Y', errors='ignore')
df_NHS_final['birth_year']=df_NHS_final['MYDOB'].apply(lambda x: x.year)

In [230]:
df_NHS_final['birth_year'].head(2)

0    2038
1    2045
Name: birth_year, dtype: int64

### Collect info about positives in Sheffield database

In [8]:
Sheffdb_tablename= '/home/vsalvatelli/SharedData/Patient_group_file_SQL_v2.xlsx'
#there will be a 'ENCRYPTED_HESID' column
Sheffdb_label_ID= 'STHFT_Study_ID'
date_birth='Date_of_birth'
gender='Sex$Item'
opertn= 'Flag_RHC'
hospital= 'District'

#these should be already unique IDs, but just in case
Sheffield_pos_IDs=pd.read_excel(Sheffdb_tablename)[Sheffdb_label_ID].unique()

In [9]:
list(Sheffield_pos_IDs)

['STHFT_Study_ID',
 'Infoflex',
 'Sphinx',
 'PhResearchDatabase',
 'Infoflex_patient_primarykey',
 'Sthnumber$Text',
 'Disease',
 'PH_subgroup',
 'PH_Cohort',
 'Cohort_diagnosis_group',
 'Pathway_Group',
 'Age_group',
 'Sex$Item',
 'Whofunctionalclass$Item',
 'Dx_pre_visit',
 'GP_postcode',
 'Region',
 'District_Code',
 'District',
 'Flag_RHC',
 'CI_category',
 'CH_post_2009_referral',
 'CH_post_2009_visit',
 'CH_post_2013_referral',
 'CH_post_2013_visit',
 'Date_of_birth',
 'Date_of_death',
 'Date_1st_referral',
 'Date_1st_visit',
 'Date_1st_RHC',
 'Date_1st_diagnosis']

In [7]:
#first check
#Are the IDs the same?


3376