$15^{th}$ Jul 2021

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import glob, os, re
import warnings
import matplotlib.pyplot as plt
from datetime import  datetime
from ipywidgets import widgets, interactive

In [2]:
dt = datetime.today().strftime(format='%d-%m-%Y')

### **Functions**

In [3]:
def replace(string, substitutions):
    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)

In [4]:
def rename_cols_subset_df(df, cols):
    df_new = df.copy()
    df_new.columns = list(map(lambda x: x.split('(')[0].strip(' _'), [col.replace(' ', '_').replace('\n', '_').upper() for col in df.columns]))
    df_cln = df_new[df_new['SAMPLE_NUMBER'].notna()]
    return df_cln[cols]

In [5]:
def cov_names(x):
    x = x.upper().replace(' ', '')
    if len(x) == 8:
        x = x.replace('COVC', 'COVC0')
    elif len(x) == 7:
        x = x.replace('COVC', 'COVC00')
    elif len(x) == 6:
        x = x.replace('COVC', 'COVC000')
    elif len(x) == 5:
        x = x.replace('COVC', 'COVC0000')
    return x

In [6]:
def my_date_parser(x):
    try:
        if type(x) == pd.Timestamp:
            return x#.strftime('%d-%m-%Y')
        if type(x) != pd.Timestamp and type(x) != (int, float, pd.NaT):
            return pd.to_datetime(x, errors='coerce', dayfirst=True)#.strftime('%d-%m-%Y')
        if x in ['None indicated', 'NIL', 'Not indicated on form ',
                    'Nil', 'None indicated', 'Leaked Sample - Empty', 'nan']:
            return pd.NaT
    except (ValueError, AttributeError):
        return pd.NaT

In [7]:
def cln_results(x):
    if isinstance(x, str):
        x = x.strip().lower()
        if 'neg' in x:
            x = x.replace(x, 'Negative')
        elif 'pos' in x:
            x = x.replace(x, 'Positive')
        elif x == 'inconclusive':
            x = x.replace(x, 'Negative')
    return x
        

### **Establish parent directory**

In [8]:
# uniqueness in directory and file names is assumed for all analyses
sars_dir = "SARS-CoV-2"
home_dir = os.getenv('HOME')
parent_dir = glob.glob(f'{home_dir}/**/{sars_dir}', recursive=True)[0]

In [9]:
cols_of_interest = ['CASE_ID','SAMPLE_NUMBER', 'NAME', 'AGE', 'AGE_UNIT', 'GENDER', 'OCCUPATION', 'NATIONALITY'
           , 'COUNTY_OF_RESIDENCE', 'SUB_COUNTY_OF_RESIDENCE', 'HAS_TRAVEL_HOSTORY', 'TRAVEL_FROM'
           , 'QUARANTINE_FACILITY/HOSPITAL/HOMESTEAD', 'SYMTOMS_SHOWN', 'SAMPLE_TYPE', 'DATE_OF_SAMPLE_COLLECTION'
           , 'DATE_SAMPLE_RECEIVED_IN_THE_LAB', 'RESULT', 'LAB_CONFIRMATION_DATE', 'VACCINATION_STATUS_Y/N'
           , 'DOSAGE_C/NC']

In [10]:
files = os.listdir(glob.glob(f'{parent_dir}/**/ResultsDBCurated', recursive=True)[0])
files = sorted(files)

In [11]:
# keyerrors = []
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, 'DATE_SAMPLE_RECEIVED_IN_THE_LAB')
# #         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [12]:
# files = ['COVID-19-Test_result_Reporting_01_07_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_01_07_2020-B.xlsx',
#  'COVID-19-Test_result_Reporting_05_09_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_05_09_2020-B.xlsx',
#  'COVID-19-Test_result_Reporting_08_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_15_07_2020.xlsx',
#  'COVID-19-Test_result_Reporting_18_06_2020.xlsx',
#  'COVID-19-Test_result_Reporting_31_07_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_31_07_2020-B.xlsx']

In [13]:
# for file in files:
#     df = pd.read_excel(f'{data_dir}/{file}', header=1)
#     try:
#         df.insert(27, 'DATE SAMPLE RECEIVED IN THE LAB', pd.NaT)
#         df.to_excel(f'/home/douso/Documents/TrendData/Results/add_col/{file}', index=False, startrow=1)
#     except:
#         raise
#         print (f'Error: {file}')
#     finally: pass

In [14]:
# keyerrors = []
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, cols_of_interest)
#         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [15]:
headers = ['TESTING LAB', 'CASE ID', 'TYPE OF CASE'
       , 'SAMPLE NUMBER', 'NAME', 'ID/PASSPORT NUMBER', 'AGE'
       , 'AGE UNIT', 'GENDER', 'PHONE NUMBER'
       , 'OCCUPATION', 'NATIONALITY', 'COUNTY OF RESIDENCE'
       , 'SUB COUNTY OF RESIDENCE', 'VILLAGE/ESTATE OF RESIDENCE', 'WARD'
       , 'COUNTY OF DIAGNOSIS', 'HAS TRAVEL HOSTORY'
       , 'TRAVEL FROM', 'CONTACT WITH CASE Y/N', 'CONFIRMED CASE NAME'
       , 'QUARANTINE FACILITY/HOSPITAL/HOMESTEAD', 'HAVE SYMPTOMS Y/N'
       , 'DATE OF ONSET OF SYMPTOMS', 'SYMTOMS SHOWN'
       , 'SAMPLE TYPE', 'DATE OF SAMPLE COLLECTION', 'DATE SAMPLE RECEIVED IN THE LAB'
       , 'RESULT', 'LAB CONFIRMATION DATE', 'VACCINATION STATUS Y/N', 'DOSAGE C/NC']

In [16]:
id_headers = ['SAMPLE_NUMBER', 'NAME', 'ID/PASSPORT_NUMBER', 'AGE',
       'AGE_UNIT', 'GENDER', 'PHONE_NUMBER', 'RESULT',
       'LAB_CONFIRMATION_DATE']

In [17]:
# for file in keyerrors[:-1]:
#     df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#     df_file.columns = headers
#     df_file.to_excel(f'{data_dir}/add_col2/{file}', startrow=1)

The following 60 sheets, besides some June sheets (manually fixed earlier), lacked the `DATE_SAMPLE_RECEIVED_IN_THE_LAB` column, it was added for uniformity:

- 'COVID-19-Test_result_Reporting_01_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_01_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_02_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_06_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_06_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_08_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_08_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_09_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_09_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_10_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_11_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_12_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_13_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_13_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_14_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_14_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_15_08_2020-A.xlsx'
- 'COVID-19-Test_result_Reporting_15_08_2020-B.xlsx'
- 'COVID-19-Test_result_Reporting_16_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_16_10_2020 (1).xlsx'
- 'COVID-19-Test_result_Reporting_17_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_17_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_18_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_18_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_19_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_19_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_20_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_20_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_22_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_23_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_08_2020-A.xlsx'
- 'COVID-19-Test_result_Reporting_24_08_2020-B.xlsx'
- 'COVID-19-Test_result_Reporting_25_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_25_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_08_2020.xlsx'

In [18]:
# for file in files:
#     df = pd.read_excel(f'{data_dir}/{file}', header=1)
#     try:
#         df.insert(27, 'DATE SAMPLE RECEIVED IN THE LAB', np.nan)
#         df.to_excel(f'{data_dir}/add_col/{file}', index=False, startrow=1)
#     except:
#         raise
#         print (f'Error: {file}')
#     finally: pass

The follwing sheets have missing values for the `RESULT` column:
- 'COVID-19-Test_result_Reporting_20_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_16_06_2020.xlsx'

Updated column names of some of the Jun files ['GENDER_(M/F)', 'LAB_CONFIRMATION_DATE', 'RESULT']

In [19]:
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     if file.endswith('.xlsx'):
#         try:
#             df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#             df_data = rename_cols_subset_df(df_file, cols_of_interest)
#             df_raw = df_raw.append(df_data)
#             count += 1
#         except: 
#             if KeyError:
# #                 raise
#                 print(f'KeyError: {file}')
#             elif ValueError:
#                 print(f'ValueError: {file}')
#         finally: pass
#     pass
# print(f'Files processed: {count}')
# print(f'Files in the folder: {len(files)}')

### Travel History Missing - the HOSTORY misspelling

- COVID-19-Test_result_Reporting_09_09_2020.xlsx
- COVID-19-Test_result_Reporting_11_09_2020.xlsx
- COVID-19-Test_result_Reporting_12_09_2020.xlsx
- COVID-19-Test_result_Reporting_14_09_2020.xlsx
- COVID-19-Test_result_Reporting_15_09_2020.xlsx
- COVID-19-Test_result_Reporting_18_10_2020.xlsx
- Files processed: 249
- Files in the folder: 256

In [20]:
# keyerrors = []
# # df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, cols_of_interest)
# #         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [21]:
# files = ['COVID-19-Test_result_Reporting_09_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_11_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_12_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_14_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_15_09-2020.xlsx',
#  'COVID-19-Test_result_Reporting_18_10_2020.xlsx']

In [22]:
# for file in files:
#     df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#     (df_file.rename(columns={'HAS TRAVEL HISTORY(LAST 14 DAYS) Y/N': 'HAS TRAVEL HOSTORY(LAST 14 DAYS) Y/N'}, errors='raise').
#     to_excel(f'{data_dir}/rename_col/{file}', index=False, startrow=1))

### Start of find repeat tests

### End of find repeat tests

In [23]:
id_headers = ['SAMPLE_NUMBER', 'NAME', 'AGE',
       'AGE_UNIT', 'GENDER', 'RESULT',
       'LAB_CONFIRMATION_DATE']

In [24]:
warnings.filterwarnings('ignore', module='openpyxl')

In [25]:
df_raw = pd.DataFrame()
count = 0
for file in files:
    if file.endswith('.xlsx'):
        try:
            df_file = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/ResultsDBCurated', recursive=True)[0]}/{file}", header=1)
            if 'VACCINATION STATUS Y/N' not in df_file.columns:
                df_file[['VACCINATION_STATUS_Y/N', 'DOSAGE_C/NC']] = [pd.NA, pd.NA]
            df_data = rename_cols_subset_df(df_file, cols_of_interest)
            df_raw = pd.concat([df_raw,df_data])
            count += 1
        except:
            if KeyError:
                print(f'KeyError: {file}')
                raise
                print(f'KeyError: {file}')
            if ValueError:
                print(f'ValueError: {file}')
#                 raise
        finally: pass
    pass
print(f'Files processed: {count}')
print(f'Files in the folder: {len(files)}')

Files processed: 356
Files in the folder: 357


In [26]:
# df_raw.head()

### **Clean Headers**

In [27]:
cols_rename = ['CASE_ID', 'S_NUM', 'AGE', 'AGE_UNIT', 'GEND', 'OCCU', 'NAT', 'COUNT_RES', 'S_COUNT_RES', 'TRAV_HIST', 
                    'TRAV_FROM', 'QUAR_PLACE', 'SYMPS', 'SAMP_TYPE', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'RESULT', 'DT_CONF', 'VACC_STATUS', 'VACC_DOS']

In [28]:
# df_raw.columns = cols_rename

### **Clean Sample Numbers**

In [29]:
cols_of_interest2 = ['CASE_ID', 'S_NUM', 'NAME', 'AGE', 'AGE_UNIT', 'GEND', 'OCCU', 'NAT', 'COUNT_RES', 'S_COUNT_RES', 'TRAV_HIST', 
                    'TRAV_FROM', 'QUAR_PLACE', 'SYMPS', 'SAMP_TYPE', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'RESULT', 'DT_CONF', 'VACC_STATUS', 'VACC_DOS']

In [30]:
df_raw.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged.xlsx", 
              index=False, header=cols_of_interest2)

In [31]:
df_mrg_mtdata = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged.xlsx", usecols=cols_rename)

### **Clean Sample Name**

In [32]:
df_name = df_mrg_mtdata.assign(S_NUM=df_mrg_mtdata['S_NUM'].apply(lambda x: cov_names(x)).fillna('NA'))

In [33]:
#df_name[df_name['S_NUM'].str.len() < 8]

In [34]:
#df_name[df_name['S_NUM'].str.contains('COVC') == False]

### **Clean Age**

In [35]:
#df_name[df_name['AGE_UNIT'] == 'Days']

In [36]:
df_months = df_name[df_name['AGE_UNIT'] == 'Months']

In [37]:
#df_months.head()

In [38]:
df_mn2yrs = df_months.assign(AGE=df_months['AGE'].map(lambda x: round(x / 12, 1))).replace('Months', 'Years')

In [39]:
#df_mn2yrs.head()

In [40]:
#df_name[df_name['AGE_UNIT'] == 'Days']#.AGE_UNIT.unique()

In [41]:
df_not_months = df_name[df_name['AGE_UNIT'] != 'Months']

In [42]:
df_not_months.shape

(30131, 20)

In [43]:
df_mn2yrs.shape

(78, 20)

In [44]:
df_years = pd.concat([df_not_months, df_mn2yrs])

In [45]:
df_years.shape

(30209, 20)

In [46]:
df_years['AGE_UNIT'].unique()

array(['Years', nan, 'YEARS', 'years', 'Nil', 'Days'], dtype=object)

In [47]:
#df_years[df_years['AGE_UNIT'] == 'NO PATIENT DATASHEET']

In [48]:
#df_years[df_years['AGE_UNIT'].str.contains('M', 'F') == True]

In [49]:
#df_years[df_years['AGE_UNIT'] == 'Nil']

In [50]:
df_years_drop_age_unit = df_years.drop('AGE_UNIT', axis=1)

In [51]:
df_years_rencol = df_years_drop_age_unit.rename(columns={'AGE': 'AGE_YRS'})

In [52]:
#df_years_rencol.head()

In [53]:
df_years_rencol['AGE_YRS'].unique()

array([80, 43, 69, 82, 29, 78, 2, 48, 77, 87, 66, 36, 25, 63, 65, 70, 19,
       22, 23, 81, 42, 68, 54, 73, 28, 13, 72, 76, 89, 75, 85, 60, 37, 11,
       64, 38, 9, 27, 40, 24, 30, 4, 26, 20, 61, 46, 58, 21, 34, 14, 39,
       33, 52, nan, 55, 50, 41, 1, 47, 31, 45, 6, 5, 15, 10, 35, 67, 44,
       17, 51, 57, 49, 32, 16, 59, 56, 18, 79, 83, 12, 74, 53, 71, 86, 62,
       7, 8, 84, 312, 88, 3, 3.6, 1.5, 2.42, 'None indicated', 6.5, 2.5,
       97, 'Nil', 96, 'Not indicated', 743, 'None', 9.5, 'nil', 98,
       'none indicated', 90, 91, 100, 101, 92, 'not_indicated', 5.5,
       'Not Indicated', '37', '36', '40', '38', '29', 94, 105, 0.5,
       'not Indicated', '30', '44', '33', '27', '26', '51', '28',
       'None Indicated', 0.8, 46.7, 0, 0.9, 0.1, 1.6, 1.1, 2.9, 3.7, 0.2,
       1.7, 0.3, 2.8, 0.7, 1.2, 2.1, 0.4, 4.5, 1.8, 0.6], dtype=object)

In [54]:
# [x for x in df_years_rencol['AGE_YRS'].unique() if isinstance(x, str)]

In [55]:
to_rep = ['None indicated',
 'Nil',
 'Not indicated',
 'None',
 'nil',
 'none indicated',
 'not_indicated',
 'Not Indicated',
 'not Indicated',
 'None Indicated',
 'NO PATIENT DATASHEET',
 'ad', 'AD'
]

In [56]:
df_years_repnan2float = df_years_rencol.assign(AGE_YRS=df_years_rencol['AGE_YRS'].replace(to_rep, np.nan).astype(float).fillna(np.nan))

In [57]:
#df_years_repnan2float

### **Clean Gender**

In [58]:
df_years_repnan2float['GEND'].unique()

array(['M', 'F', nan, 'f', 'Female', 'Male', 'F`', 'MALE ', 'MALE', 'Nil',
       'T', 'm', 'FF', 'MM', 'F '], dtype=object)

In [59]:
#df_years_repnan2float[df_years_repnan2float['GEND'].isna()].sort_values('S_NUM')

In [60]:
#df_years_repnan2float[df_years_repnan2float['GEND'] == 'Nil']

COVEs:
- COVC13844
- COVC13845
- COVC13846
- COVC13847
- COVC13848
- COVC13849
- COVC13850
- COVC13851
- COVC13852
- COVC13853
- COVC16256
- COVC16257

In [61]:
df_years_repgender = df_years_repnan2float.assign(GEND=df_years_repnan2float['GEND'].replace(['f', 'Female', 'F`', 'T'], 'F')
                                                 .replace(['Male', 'MALE ', 'MALE', 'm'], 'M'))

In [62]:
df_years_repgender['GEND'].unique()

array(['M', 'F', nan, 'Nil', 'FF', 'MM', 'F '], dtype=object)

In [63]:
df_years_repgender2 = df_years_repgender.assign(GEND=df_years_repgender['GEND'].replace('Nil', pd.NA).fillna('NA'))

In [64]:
# df_years_repgender2['GEND'].unique()

### **Clean Occupation**

In [65]:
# df_years_repgender2['OCCU'].unique()

In [66]:
# df_years_repgender2['OCCU'] = df_years_repgender2['OCCU'].replace(['Nil','nil'], pd.NA)

In [67]:
df_years_repgender2['OCCU'].str.lower().str.strip().nunique()

1359

In [68]:
occu = (df_years_repgender2['OCCU'].str.strip().
     apply(lambda x: str(x).replace(str(x), 'NA') if ('nil' == str(x).lower()) else x).
     replace('nan', 'NA').fillna('NA'))
occu.nunique()

1581

In [69]:
df_occu = df_years_repgender2.assign(OCCU=occu)

### **Clean Nationality**

In [70]:
df_occu['NAT'].unique()

array(['Kenya', 'Zimbabwe', 'Tanzania', 'South Africa',
       'United Kingdom (UK)', 'Botswana',
       'United States of America (USA)', 'Uganda', 'Netherlands',
       'Congo, Democratic Republic of the', 'Burundi', 'Kenyan',
       'South Sudan', nan, 'India', 'Canada', 'France', 'Ethiopia',
       'Somalia', 'Zambia', 'Germany', 'Sudan', 'Lebanon', 'Sweden',
       'Nigeria', 'Benin', 'Cuba', 'Indian', 'Ghana', 'Rwanda',
       'Mauritania', 'Denmark', 'Pakistan', 'Brazil', 'Tanzanian',
       'Italy', 'Australia', 'Colombia', 'Antigua and Barbuda', 'Hungary',
       'Djibouti', 'Finland', 'Senegal', 'kenya', 'Yemen', 'China',
       'Turkey', 'Philippines', 'Japan', 'Ireland', 'Tunisia',
       'Switzerland', 'Togo', 'Cameroon', 'Spain', 'Burkina Faso',
       'Portugal', 'Bangladesh', 'Madagascar', "Cote d'Ivoire", 'Egypt',
       'KENYAN', 'Greece', 'Bulgaria', 'Nil', 'Mozambique', 'Mali',
       'Eritrea', 'Austria', 'Malawi', 'Namibia',
       'Congo, Republic of the', 'Sri L

In [71]:
df_years_repnat = df_occu.assign(NAT=df_occu['NAT'].replace('Nil', 'Unknown')
                                            .replace(['United States', 'Indian'
                                                      , 'Congo, Democratic Republic of the'
                                                     , 'Congo, Republic of the'] 
                                                     , ['United States of America (USA)','India'
                                                        , 'Democratic Republic of the Congo',
                                                       'Republic of the Congo'])
                                            .apply(lambda x: str(x).replace(x, 'Kenya') if ('enya' in str(x).lower()) else x).fillna('NA'))

In [72]:
# df_years_repnat['NAT'].unique()

### **Clean County of Res**

In [73]:
#df_years_repnat.head()

In [74]:
counties = (df_years_repnat['COUNT_RES']
                   .replace(['Nil', 'nan', '', 'Nan', 'Muranga ', 'Murang`a ', 'Niarobi']
                            , [pd.NA, pd.NA, pd.NA, pd.NA, "Murang'a", "Murang'a", 'Nairobi'])
                   .apply(lambda x: str(x).strip().capitalize() if (isinstance(x, str)) else x)).fillna('NA')

In [75]:
df_years_county = (df_years_repnat.assign(COUNT_RES=counties))#.assign(S_COUNT_RES=s_counties)

In [76]:
# df_years_county['COUNT_RES'].unique()

### **Clean Subcounty of Res**

In [77]:
df_years_repnat['S_COUNT_RES'].str.lower().nunique()

1057

In [78]:
len((df_years_repnat['S_COUNT_RES'].fillna('NA')
     .replace(['Nil', 'Nil ', 'Muranga ', 'Murang`a ']
                            , [pd.NA, pd.NA, "Murang'a", "Murang'a"])
                   .apply(lambda x: str(x).strip().capitalize())
     .unique()))

975

In [79]:
s_counties = (df_years_repnat['S_COUNT_RES'].str.strip().str.capitalize().
    replace(['Nil', 'Muranga', 'Murang`a'],
    ['NA', "Murang'a", "Murang'a"]).
    apply(lambda x: str(x).replace(str(x), 'NA') if ('indicate' in str(x)) else x))

In [80]:
# sorted([s for s in s_counties.unique()])

In [81]:
df_years_scounty = df_years_county.assign(S_COUNT_RES=s_counties)

### **Clean Travel History**

In [82]:
#df_years_scounty.head()

In [83]:
df_years_scounty['TRAV_HIST'].unique()

array(['No', 'Yes', nan, 'Y', 'N', 'JH', 'no', 'NO ', 'Nil', 'YES',
       'NONE ', 'NONE', 'y', 'yes', 15.93], dtype=object)

In [84]:
df_travl = df_years_scounty.assign(TRAV_HIST=df_years_scounty['TRAV_HIST'].replace([pd.NA, 'N', 'no', 'NO', 'Nil', 'NONE ', 'NONE'], 'No')
           .replace(['YES', 'Y', 'y', 'yes'], 'Yes'))


In [85]:
df_travl['TRAV_FROM'].unique()

array([nan, 'Nairobi', 'South Africa', 'Eldoret', 'Busia',
       'Kisumu;Garissa', 'Kisii', 'Kitale', 'Homabay', 'Eldama-Ravine',
       'KITUI;NAIROBI', 'KISUMU', 'UGANDA', 'KISUMU;KITUI', 'NAIROBI',
       'NAKURU;TURKANA', 'CANADA', 'FRANCE', 'SAUDI ARABIA',
       'ITEN-KENDUR', 'BUNGOMA', 'EMBU', 'cuba', 'ELDORET', 'NANDI HILLS',
       'KERICHO', 'KAKAMEGA', 'KAPENGURIA', 'NAIROBI;NAKURU;ELDORET',
       'BUSIA;NAKURU', 'SIAYA', 'TANZANIA', 'Nyamira',
       'Kendu-Bay;Kisii;Kisumu;Homabay;Oyugis', 'Mumias, Oyugis, Homabay',
       'NAIVASHA', 'NAIROBI;NYERI;NANYUKI', 'NIL', 'Ruaka', 'Ngoingwa',
       'Limuru', 'Thika', 'Embakasi', 'Uganda', 'Dubai', 'Nil',
       'Not indicated on form ', 'TORORO UGANDA', 'AWASI', 'KIGALI',
       'ENTEBE', 'JINJA', 'KAMPALA', 'KASESE', 'KILIFI COUNTY',
       'KWALE COUNTY', 'Mwanza', 'Mombasa', 'Geita', 'Isebania',
       'Kilimanjaro', 'Migori', 'KENDU BAY', 'KISUMU, KITALE', 'HOMABAY',
       'KISII, KISUMU', 'HOMABAY, KISII', 'NYAHURURU; 

In [86]:
df_travl2 = (df_travl.assign(TRAV_FROM=df_travl['TRAV_FROM'].str.strip().replace(['nan','NIL', 'Nil', 'Not indicated on form '
                  , 'NOT APPLICABLE ', 'N', 'N', 'No', 'not_indicated', 'Not applicable', 'Not indicated'],pd.NA)
                    .replace(['Mombasa (stayed in mtwapa for 4 days)', 'KINANGO MSAMBWENI', 'Nan'
                              , 'Mombasa (Stayed in Mtwapa for 4 days)'], 
                    ['Mtwapa', 'Kinangop;Msambweni', pd.NA, 'Mtwapa'])
                    .apply(lambda x: str(x).replace('/', ';')
                    .replace(' and ', ';')
                    .replace('-', ';')
                    .replace(',', ';')
                    .replace(' ; ', ';')
                    .replace('; ', ';')
                    .strip().capitalize() if (isinstance(x, str)) else x)).fillna('NA'))

In [87]:
# df_travl2['TRAV_FROM'].unique()



In [88]:
df_travl2.head(1)

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,SAMP_TYPE,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,VACC_STATUS,VACC_DOS
0,SIAYA/COV1911621/2021,COVM02985,80.0,M,,Kenya,Siaya,,No,,,CO,NP Swab,18/08/2021,09/03/2022,,2022-03-16 00:00:00,N,NC


### **Clean Quarentine Place**

In [89]:
df_travl2['QUAR_PLACE'].str.lower().str.strip().nunique()

150

In [90]:
quar_pl = (df_travl2['QUAR_PLACE'].str.strip().
     apply(lambda x: str(x).replace(str(x), 'NA') if ('nil' == str(x).lower()) else x).
     apply(lambda x: str(x).replace(str(x), 'NA') if ('indicate' in str(x).lower()) else x).
     apply(lambda x: str(x).replace(str(x), 'NA') if ('applic' in str(x).lower()) else x).
     apply(lambda x: str(x).replace(str(x), 'NA') if ('n/a' in str(x).lower()) else x).      
     fillna('NA'))
quar_pl.nunique()

154

In [91]:
df_quar = df_travl2.assign(QUAR_PLACE=quar_pl)

### **Clean Symptoms**

Symptoms legend
- GW: General (body) weakness/fatigue/malaise
- FC: Fever/Chills
- CO: Cough
- ST: Sore throat
- RN: Runny nose
- SB: Shortness of breath/Difficulty in breathing
- D: Diarrhoea
- NV: Nausea/Vomiting
- H: Headache
- IC: Irritability/Confusion
- P: Pain
- LS: Loss of smell
- LT: Loss of taste/appetite
- P-M: Muscular pain
- P-A: Abdominal pain
- P-B: Back pain
- P-C: Chest pain
- P-J: Joint pain
- BA: Body aches
- SZ: Sneezing
- HP: Hypertension
- TN: Tonsillitis
- DZ: Dizziness
- NS: Night sweats
- SG: Swollen gland

**Symptoms table**

In [92]:
abbr = ['GW','FC','CO','ST','RN','SB','D','NV','H',
        'IC','P','LS','LT','P-M','P-A','P-B','P-C',
        'P-J','BA','SZ','HP','TN','DZ','NS', 'SG']
symp = ['General (body) weakness/fatigue/malaise','Fever/Chills',
        'Cough','Sore throat','Runny nose','Shortness of breath/Difficulty in breathing',
        'Diarrhoea','Nausea/Vomiting','Headache','Irritability/Confusion','Pain','Loss of smell',
        'Loss of taste/appetite','Muscular pain','Abdominal pain','Back pain','Chest pain',
        'Joint pain','Body aches','Sneezing','Hypertension','Tonsillitis','Dizziness','Night sweats', 'Swollen glands']
symp_header = pd.MultiIndex.from_arrays([['SYMP_CODE', 'SC_MEANING'], 
             ['Symptom code', 'Code meaning']], names=['abbr', 'desc'])
symptoms = pd.DataFrame({'a': abbr, 'b': symp})
symptoms.columns=symp_header
symptoms.head()

abbr,SYMP_CODE,SC_MEANING
desc,Symptom code,Code meaning
0,GW,General (body) weakness/fatigue/malaise
1,FC,Fever/Chills
2,CO,Cough
3,ST,Sore throat
4,RN,Runny nose


In [93]:
df_quar['SYMPS'] = (df_quar['SYMPS'].apply(lambda x: str(x).replace(str(x), 'NA') if ('symtom' in str(x)) else str(x).replace('F/C', 'FC'))
                      .replace(['[=]','nan'], 'NA'))
pd.Series(df_quar['SYMPS']).to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-symptoms.xlsx", index=False)
df_symp = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-symptoms_curation.xlsx")
symp_dict = {str(k): str(v) for k,v in zip(df_symp['STRING'], df_symp['REPLACE'])}

symp_cln = (df_quar['SYMPS'].replace((symp_dict)).replace(['nan'], ['NA']).str.strip().
replace(['; ', ',', '.'], ';').replace(';;', ';').apply(lambda x: ';'.join(sorted(x.split(';')))))

df_symps = df_quar.assign(SYMPS=symp_cln.fillna('NA'))

### **Clean Sample Type**

In [94]:
df_symps.SAMP_TYPE.unique()
df_samp_ty = df_symps.assign(SAMP_TYPE=df_symps['SAMP_TYPE'].replace(['NP and OP', 'NP-OP +AB2:AL2Swab',
       'NP-OP Swab', 'nP Swab', 'NP OP Swab', 'NP&OP'], ['NP & OP Swab', 'NP & OP Swab', 'NP & OP Swab', 'NP Swab', 'NP & OP Swab', 'NP & OP Swab']).fillna('NA'))

### **Clean Dates**

In [95]:
df_dt1 = df_samp_ty.assign(DT_SAM_COLL=df_samp_ty['DT_SAM_COLL'].apply(lambda x: my_date_parser(x)))

In [96]:
df_dt2 = df_dt1.assign(DT_SAM_RECEP=df_dt1['DT_SAM_RECEP'].apply(lambda x: my_date_parser(x)))

In [97]:
df_dts_cln = df_dt2.assign(DT_CONF=df_dt2['DT_CONF'].apply(lambda x: my_date_parser(x)))

In [98]:
#df_dts_cln.head()

### **Clean Results**

In [99]:
df_dts_cln['RESULT'].unique()

array(['NA', 'Positive', 'Ag Positive', 'POSITIVE', 'Negative',
       'positive', 'neg', 'pos', 'ms', 'negative', 'Neg', 'Pos', 'MS',
       'Positve', 'Positive ', 'M2000'], dtype=object)

In [100]:
df_dts_cln['RESULT'].map(lambda x: x.strip().lower() if (isinstance(x, str)) else x).unique()

array(['na', 'positive', 'ag positive', 'negative', 'neg', 'pos', 'ms',
       'positve', 'm2000'], dtype=object)

In [101]:
df_dts_cln['RESULT'] = df_dts_cln['RESULT'].map(lambda x: cln_results(x)).fillna('NA').replace('na', 'NA')

In [102]:
df_dts_cln['RESULT'].unique()

array(['NA', 'Positive', 'Negative', 'ms', 'm2000'], dtype=object)

In [103]:
df_dts_cln[df_dts_cln.duplicated(keep=False) == True].sort_values('S_NUM')

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,SAMP_TYPE,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,VACC_STATUS,VACC_DOS
13132,,COVC00708,35.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,NP & OP Swab,2020-06-09,2020-06-11,Negative,2020-06-12,,
11500,,COVC00708,35.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,NP & OP Swab,2020-06-09,2020-06-11,Negative,2020-06-12,,
11501,,COVC00709,32.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,NP & OP Swab,2020-06-09,2020-06-11,Negative,2020-06-12,,
13133,,COVC00709,32.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,NP & OP Swab,2020-06-09,2020-06-11,Negative,2020-06-12,,
11502,,COVC00710,45.0,M,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,NP & OP Swab,2020-06-09,2020-06-11,Negative,2020-06-12,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29800,NPHL04/35613/2021,COVM01632,32.0,F,Consultant,Kenya,Nairobi,,Yes,,,,NP & OP Swab,2021-11-29,2021-12-01,,2021-12-01,Y,C
29801,NPHL04/35614/2021,COVM01633,36.0,M,Engineer,South Africa,Nairobi,,Yes,,,,NP & OP Swab,2021-11-29,2021-12-01,,2021-12-01,Y,C
473,NPHL04/35614/2021,COVM01633,36.0,M,Engineer,South Africa,Nairobi,,Yes,,,,NP & OP Swab,2021-11-29,2021-12-01,,2021-12-01,Y,C
474,NPHL04/JKIA35615-0843/2021,COVM01634,33.0,M,,Kenya,Nairobi,,Yes,,,,NP & OP Swab,2021-11-29,2021-12-01,,2021-12-01,,


### **Remove/QC Duplicates**

In [104]:
df_cln_srt = df_dts_cln.sort_values(['S_NUM', 'DT_CONF'])

In [105]:
#df_cln_srt[df_cln_srt.duplicated(['S_NUM', 'DT_CONF'], keep=False) == True]

In [106]:
#df_cln_srt[df_cln_srt.duplicated(keep=False) == True]

In [107]:
df_dedud0 = df_cln_srt.drop_duplicates(['S_NUM'], keep='last')#['COUNT_RES'].fillna('Unknown', inplace=True)
df_dedud = df_dedud0.assign(CASE_ID=df_dedud0.CASE_ID.apply(lambda x: str(x)))

In [108]:
mask1 = df_dedud.duplicated(['CASE_ID'], keep=False)
mask2 = df_dedud.CASE_ID != 'NA'
df_dedud.sort_values(['DT_CONF','CASE_ID'])[mask1 == True & mask2].tail(10)

  df_dedud.sort_values(['DT_CONF','CASE_ID'])[mask1 == True & mask2].tail(10)


Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,SAMP_TYPE,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,VACC_STATUS,VACC_DOS
172,MCRH/MIG19579/2021,COVM02366,27.0,F,CHW,Kenya,Migori,,No,,,,NP Swab,NaT,NaT,Positive,2022-01-05,Y,
541,BCRH/16/2021,COVM02669,16.0,M,,Kenya,Bungoma,,No,,,,NP & OP Swab,2021-12-30,2022-01-13,Positive,2022-01-18,N,
28303,ETH/S03271221059/2021,COVM02887,31.0,F,,Ethiopia,Addis ababa,Bole,No,,,,OP Swab,2021-12-27,2022-03-08,Positive,2022-03-09,,
28304,ETH/S03271221059/2021,COVM02888,39.0,M,,Ethiopia,Addis ababa,Gulele,No,,,,OP Swab,2021-12-27,2022-03-08,Positive,2022-03-09,,
326,MCRH/MIG19736/2021,COVM02520,37.0,F,CHW,Kenya,Migori,Suna west,No,,Homestead,,NP & OP Swab,2021-12-27,NaT,Positive,NaT,Y,C
364,MCRH/MIG19736/2021,COVM02558,37.0,F,CHW,Kenya,Migori,Suna west,No,,Homestead,,NP & OP Swab,2021-12-27,NaT,Positive,NaT,Y,C
29521,NPHL03/KPC7/32,COVM01152,,,,,Nairobi,,No,,,,NP & OP Swab,NaT,2021-09-15,Positive,NaT,,
29529,NPHL03/KPC7/32,COVM01160,,,,,Nairobi,,No,,,,NP & OP Swab,NaT,2021-09-15,Positive,NaT,,
30044,SWZ/23287/2022,COVM03273,28.0,M,,Eswatini,Hhohho,Mbabane,No,,,,NP Swab,2022-04-29,2022-05-18,,NaT,,
30053,SWZ/23287/2022,COVM03282,27.0,M,,Eswatini,Hhohho,Mbabane,No,,,,NP Swab,2022-04-29,2022-05-18,,NaT,,


In [109]:
df_dedud[df_dedud.duplicated(['S_NUM'], keep=False) == True]

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,SAMP_TYPE,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,VACC_STATUS,VACC_DOS


### **QC Dates**

In [110]:
sr_date_diff = df_dedud['DT_CONF'] - df_dedud['DT_SAM_COLL']

In [111]:
mask3 = sr_date_diff >= '60D'
mask4 = sr_date_diff != pd.NaT
mask5 = df_dedud['DT_SAM_COLL'] > df_dedud['DT_CONF']
mask6 = df_dedud['DT_SAM_COLL'] > dt
mask7 = df_dedud['DT_CONF'] > dt
# mask3 = df_dedud['DT_CONF']

  return self._cmp_method(other, operator.gt)


In [112]:
df_dedud[mask5 | mask6 | mask7 == True]

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,SAMP_TYPE,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,VACC_STATUS,VACC_DOS
28566,KEM-21-02-93781,COVM00193,46.0,M,,Kenya,Nairobi,Roysambu,No,,,,NP & OP Swab,2021-02-26,2021-07-30,Positive,2021-02-03,,
28569,KEM-21-02-93550,COVM00196,14.0,F,,Kenya,Kiambu,Limuru,No,,,CO;GW;H,NP & OP Swab,2021-02-23,2021-07-30,Positive,2021-02-03,,
28602,KEM-21-03-93782,COVM00229,28.0,M,,Kenya,Makueni,Kilome,Yes,,,GW;LS;LT,NP & OP Swab,2021-02-25,2021-07-30,Positive,2021-02-03,,
28603,KEM-21-02-93544,COVM00230,15.0,F,,Kenya,Kiambu,Limuru,No,,,,NP & OP Swab,2021-02-23,2021-07-30,Positive,2021-02-03,,
28627,KEM-21-02-93645,COVM00254,37.0,F,,Kenya,Kiambu,Githunguri,No,,,CO;FC;LS;LT;P-C;RN;ST,NP & OP Swab,2021-02-25,2021-07-30,Positive,2021-02-03,,
28644,KEM-21-02-92432,COVM00271,22.0,M,,Kenya,Murang'a,Kangema,No,,,,NP & OP Swab,2021-11-02,2021-07-30,Positive,2021-02-16,,
545,BCRH/20/2022,COVM02673,62.0,M,,Kenya,Bungoma,Bumula,No,,,CO;GW;H;P-J;SB,NP & OP Swab,2022-10-03,2022-01-13,Positive,2022-01-18,Y,
560,BCRH/35/2022,COVM02688,24.0,F,Education Academic,Kenya,Bungoma,Bungoma central,Yes,Homabay,,CO;FC;H;P-C;ST,NP & OP Swab,2022-12-06,2022-01-13,Positive,2022-01-18,Y,
561,BCRH/36/2022,COVM02689,23.0,F,Education Academic,Kenya,Bungoma,Bungoma central,Yes,Eldama;ravine,,,NP & OP Swab,2022-12-06,2022-01-13,Positive,2022-01-18,Y,
29816,SDN/NP4087/2021,COVM03045,45.0,F,,Sudan,Khartoum,,No,,,,NP Swab,2121-10-10,2022-04-07,Positive,NaT,,


### **Export Cleaned Data**

In [113]:
header_des1 = pd.MultiIndex.from_arrays([['CASE_ID', 'S_NUM', 'AGE_YRS', 
               'GEND', 'OCCU', 'NAT', 'COUNT_RES', 'S_COUNT_RES', 'TRAV_HIST', 
               'TRAV_FROM', 'QUAR_PLACE', 'SYMPS', 'SAMP_TYPE', 'DT_SAM_COLL', 
               'DT_SAM_RECEP', 'RESULT', 'DT_CONF', 'VACC_STATUS', 'VACC_DOS'], 
               ['Source sample ID', 'Unique lab ID', 'Age in years', 'Gender', 
               'Occupation', 'Nationality', 'County of residence', 'Sub-county of residence', 
               'Travel history', 'Place travelled', 'Place of quarantine', 
               'Symptoms', 'Sample type', 'Date of sample collection', 
               'Date of sample lab reception', 'RT-PCR results', 'Date of RT-PCR', 
               'Vaccination status', 'Vaccine dosage']], names=('abbr', 'desc'))

In [114]:
df_dedud.columns = header_des1

In [115]:
df_dedud.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-cln.xlsx", index=True)

In [116]:
df_dedud[df_dedud['RESULT'] == 'Positive'].to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-cln-pos.xlsx", header=header_des1, index=True)

In [117]:
#df_dedud[df_dedud['S_NUM'].str.contains('COVC') == False]

In [118]:
#df_dedud[df_dedud['S_NUM'].str.contains('COVC') == False]

df_symps11-09-2020

COVC9774
COVC9775
COVE20
COVE21
COVC9776
COVC9777


### **ID Misssing Data**

In [119]:
start = 0
start2 = 0
df_missing = pd.DataFrame(columns=cols_rename)
df_missing2 = pd.DataFrame(columns=cols_rename)
for num in df_dedud[('S_NUM','Unique lab ID') ].sort_values():
#     curr_row = df_dts_cln[df_dts_cln['S_NUM']]
    try:
        curr = int(num.lstrip('COVC').lstrip('0'))
        curr2 = int(num.lstrip('COVM').lstrip('0'))
        
        if curr - start > 1:
            start_id = f'COVC{start}'
            start_id2 = f'COVM{start}'
            cov_id = f'COVC{curr}'
            cov_id2 = f'COVM{curr2}'
            df_missing = pd.concat([df_missing, df_dts_cln[df_dts_cln['S_NUM'] == cov_names(start_id)]])
            df_missing2 = pd.concat([df_missing, df_dts_cln[df_dts_cln['S_NUM'] == start_id2]])
            df_missing = pd.concat([df_missing, df_dts_cln[df_dts_cln['S_NUM'] == cov_names(cov_id)]])
            df_missing2 = pd.concat([df_missing, df_dts_cln[df_dts_cln['S_NUM'] == cov_id2]])
        start = curr
        start2 = curr2
    except ValueError:
#         print(f'ValueError: {num}')
        pass

In [120]:
#df_missing.head()

In [121]:
(df_missing[['S_NUM', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'DT_CONF']]
 .sort_values(['S_NUM', 'DT_CONF'])
 .drop_duplicates('S_NUM', keep='last')
 .to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-missing.xlsx", index=False))

### **ID Missing Results**

In [122]:
(df_dts_cln[df_dts_cln['RESULT'].isin(['Positive', 'Negative']) == 
            False][['S_NUM', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'DT_CONF', 'RESULT']]
.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-results-missing.xlsx", index=False))

In [123]:
# plt.style.available

### **Metadata + Cts**

In [124]:
df_Cts = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/runsheet-cts.xlsx")[['Sample_Name', 'Well_Position', 'Ct_Mean', 'Dt_Run']]

In [125]:
header_des2 = pd.MultiIndex.from_arrays([['S_NUM', 'RTPCR_POS', 'AVG_Ct', 'DT_RUN'], 
               ['Unique lab ID', 'RT-PCR Well position', 'Detection genes average Ct', 'RT-PCR run date']])

In [126]:
df_Cts.columns = header_des2

In [127]:
df_metCts = df_dedud.set_index(('S_NUM', 'Unique lab ID')).merge(df_Cts.set_index(('S_NUM', 'Unique lab ID')), how='left', left_index=True, right_index=True).reset_index().fillna('NA')#.rename(columns={'index': 'S_NUM', 'Ct_Mean': 'AVG_Ct'})

In [128]:
writer = pd.ExcelWriter(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-resultsCts-merged-cln.xlsx")
df_metCts.to_excel(writer, sheet_name='Metadata', index=True, na_rep='NA', float_format='%.1f')
symptoms.to_excel(writer, sheet_name='symptomsDictionary', index=True, na_rep='NA', float_format='%.1f')
writer.close()

In [129]:
df_metCts.shape

(29809, 22)

In [130]:
df_metCts[df_metCts['RESULT'] == 'Positive'].to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-resultsCts-merged-cln-pos.xlsx", index=True)

#### **Confirm updated file in the following cell**

In [131]:
df_sequenced_samples = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-sequenced-samples-IDs_20-04-2022.xlsx")
seqd_list = list(df_sequenced_samples['SAMPLE'])

In [132]:
df_dedud_pos = df_metCts[df_metCts['RESULT'] == 'Positive']
df_dedud_pos_unseq = df_dedud_pos[df_dedud_pos['S_NUM'].isin(seqd_list) == False]
df_dedud_pos_unseq.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-unsequenced-pos-samples_{dt}.xlsx", index=True)

In [133]:
#df_dedud_pos_unseq

In [134]:
df_dedud_pos = df_metCts[df_metCts['RESULT'] == 'Positive']
df_dedud_pos_seq = df_dedud_pos[df_dedud_pos['S_NUM'].isin(seqd_list) == True]
df_dedud_pos_seq.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-sequenced-pos-samples_{dt}.xlsx", index=True)

### Positives 2021 - Unsequenced

In [135]:
df_dedud_pos_unseq_21 = df_dedud_pos_unseq[df_dedud_pos_unseq['DT_CONF'] >= pd.to_datetime('2021-01-01 00:00:00')]
df_dedud_pos_unseq_21.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-unsequenced-pos-samples-2021_{dt}.xlsx", index=True)