$15^{th}$ Jul 2021

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import glob, os, re
import matplotlib.pyplot as plt
from datetime import  datetime
from ipywidgets import widgets, interactive

In [2]:
dt = datetime.today().strftime(format='%d-%m-%Y')

### *Functions*

In [3]:
def replace(string, substitutions):

    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)

### **Establish parent directory**

In [4]:
#uniqueness in directory and file names is assumed for all analyses
sars_dir = "SARS-CoV-2"
home_dir = os.getenv('HOME')
parent_dir = glob.glob(f'{home_dir}/**/{sars_dir}', recursive=True)[0]

In [5]:
cols_of_interest = ['CASE_ID','SAMPLE_NUMBER', 'NAME', 'AGE', 'AGE_UNIT', 'GENDER', 'OCCUPATION', 'NATIONALITY', 
           'COUNTY_OF_RESIDENCE', 'SUB_COUNTY_OF_RESIDENCE', 'HAS_TRAVEL_HOSTORY', 'TRAVEL_FROM', 
           'QUARANTINE_FACILITY/HOSPITAL/HOMESTEAD', 'SYMTOMS_SHOWN', 'DATE_OF_SAMPLE_COLLECTION', 
           'DATE_SAMPLE_RECEIVED_IN_THE_LAB', 'RESULT', 'LAB_CONFIRMATION_DATE']

In [6]:
files = os.listdir(glob.glob(f'{parent_dir}/**/ResultsDBCurated', recursive=True)[0])
files = sorted(files)

In [7]:
def rename_cols_subset_df(df, cols):
    df_new = df.copy()
    df_new.columns = list(map(lambda x: x.split('(')[0].strip(' _'), [col.replace(' ', '_').replace('\n', '_').upper() for col in df.columns]))
    df_cln = df_new[df_new['SAMPLE_NUMBER'].notna()]
    return df_cln[cols]

In [8]:
# keyerrors = []
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, 'DATE_SAMPLE_RECEIVED_IN_THE_LAB')
# #         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [9]:
# files = ['COVID-19-Test_result_Reporting_01_07_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_01_07_2020-B.xlsx',
#  'COVID-19-Test_result_Reporting_05_09_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_05_09_2020-B.xlsx',
#  'COVID-19-Test_result_Reporting_08_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_15_07_2020.xlsx',
#  'COVID-19-Test_result_Reporting_18_06_2020.xlsx',
#  'COVID-19-Test_result_Reporting_31_07_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_31_07_2020-B.xlsx']

In [10]:
# for file in files:
#     df = pd.read_excel(f'{data_dir}/{file}', header=1)
#     try:
#         df.insert(27, 'DATE SAMPLE RECEIVED IN THE LAB', pd.NaT)
#         df.to_excel(f'/home/douso/Documents/TrendData/Results/add_col/{file}', index=False, startrow=1)
#     except:
#         raise
#         print (f'Error: {file}')
#     finally: pass

In [11]:
# keyerrors = []
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, cols_of_interest)
#         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [12]:
headers = ['TESTING LAB', 'CASE ID', 'TYPE OF CASE',
       'SAMPLE NUMBER', 'NAME', 'ID/PASSPORT NUMBER', 'AGE',
       'AGE UNIT', 'GENDER', 'PHONE NUMBER',
       'OCCUPATION', 'NATIONALITY', 'COUNTY OF RESIDENCE',
       'SUB COUNTY OF RESIDENCE', 'VILLAGE/ESTATE OF RESIDENCE', 'WARD',
       'COUNTY OF DIAGNOSIS', 'HAS TRAVEL HOSTORY',
       'TRAVEL FROM', 'CONTACT WITH CASE Y/N', 'CONFIRMED CASE NAME',
       'QUARANTINE FACILITY/HOSPITAL/HOMESTEAD', 'HAVE SYMPTOMS Y/N',
       'DATE OF ONSET OF SYMPTOMS', 'SYMTOMS SHOWN',
       'SAMPLE TYPE',
       'DATE OF SAMPLE COLLECTION', 'DATE SAMPLE RECEIVED IN THE LAB',
       'RESULT', 'LAB CONFIRMATION DATE']

In [13]:
id_headers = ['SAMPLE_NUMBER', 'NAME', 'ID/PASSPORT_NUMBER', 'AGE',
       'AGE_UNIT', 'GENDER', 'PHONE_NUMBER', 'RESULT',
       'LAB_CONFIRMATION_DATE']

In [14]:
# for file in keyerrors[:-1]:
#     df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#     df_file.columns = headers
#     df_file.to_excel(f'{data_dir}/add_col2/{file}', startrow=1)

The following 60 sheets, besides some June sheets (manually fixed earlier), lacked the `DATE_SAMPLE_RECEIVED_IN_THE_LAB` column, it was added for uniformity:

- 'COVID-19-Test_result_Reporting_01_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_01_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_02_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_06_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_06_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_08_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_08_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_09_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_09_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_10_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_11_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_12_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_13_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_13_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_14_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_14_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_15_08_2020-A.xlsx'
- 'COVID-19-Test_result_Reporting_15_08_2020-B.xlsx'
- 'COVID-19-Test_result_Reporting_16_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_16_10_2020 (1).xlsx'
- 'COVID-19-Test_result_Reporting_17_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_17_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_18_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_18_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_19_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_19_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_20_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_20_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_22_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_23_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_08_2020-A.xlsx'
- 'COVID-19-Test_result_Reporting_24_08_2020-B.xlsx'
- 'COVID-19-Test_result_Reporting_25_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_25_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_08_2020.xlsx'

In [15]:
# for file in files:
#     df = pd.read_excel(f'{data_dir}/{file}', header=1)
#     try:
#         df.insert(27, 'DATE SAMPLE RECEIVED IN THE LAB', np.nan)
#         df.to_excel(f'{data_dir}/add_col/{file}', index=False, startrow=1)
#     except:
#         raise
#         print (f'Error: {file}')
#     finally: pass

The follwing sheets have missing values for the `RESULT` column:
- 'COVID-19-Test_result_Reporting_20_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_16_06_2020.xlsx'

Updated column names of some of the Jun files ['GENDER_(M/F)', 'LAB_CONFIRMATION_DATE', 'RESULT']

In [16]:
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     if file.endswith('.xlsx'):
#         try:
#             df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#             df_data = rename_cols_subset_df(df_file, cols_of_interest)
#             df_raw = df_raw.append(df_data)
#             count += 1
#         except: 
#             if KeyError:
# #                 raise
#                 print(f'KeyError: {file}')
#             elif ValueError:
#                 print(f'ValueError: {file}')
#         finally: pass
#     pass
# print(f'Files processed: {count}')
# print(f'Files in the folder: {len(files)}')

### Travel History Missing - the HOSTORY misspelling

- COVID-19-Test_result_Reporting_09_09_2020.xlsx
- COVID-19-Test_result_Reporting_11_09_2020.xlsx
- COVID-19-Test_result_Reporting_12_09_2020.xlsx
- COVID-19-Test_result_Reporting_14_09_2020.xlsx
- COVID-19-Test_result_Reporting_15_09_2020.xlsx
- COVID-19-Test_result_Reporting_18_10_2020.xlsx
- Files processed: 249
- Files in the folder: 256

In [17]:
# keyerrors = []
# # df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, cols_of_interest)
# #         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [18]:
# files = ['COVID-19-Test_result_Reporting_09_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_11_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_12_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_14_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_15_09-2020.xlsx',
#  'COVID-19-Test_result_Reporting_18_10_2020.xlsx']

In [19]:
# for file in files:
#     df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#     (df_file.rename(columns={'HAS TRAVEL HISTORY(LAST 14 DAYS) Y/N': 'HAS TRAVEL HOSTORY(LAST 14 DAYS) Y/N'}, errors='raise').
#     to_excel(f'{data_dir}/rename_col/{file}', index=False, startrow=1))

### Start of find repeat tests

### End of find repeat tests

In [20]:
id_headers = ['SAMPLE_NUMBER', 'NAME', 'AGE',
       'AGE_UNIT', 'GENDER', 'RESULT',
       'LAB_CONFIRMATION_DATE']

In [21]:
df_raw = pd.DataFrame()
count = 0
for file in files:
    if file.endswith('.xlsx'):
        try:
            df_file = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/ResultsDBCurated', recursive=True)[0]}/{file}", header=1)
            df_data = rename_cols_subset_df(df_file, cols_of_interest)
            df_raw = df_raw.append(df_data)
            count += 1
        except:
            if KeyError:
                print(f'KeyError: {file}')
#                 raise
                print(f'KeyError: {file}')
            if ValueError:
                print(f'ValueError: {file}')
#                 raise
        finally: pass
    pass
print(f'Files processed: {count}')
print(f'Files in the folder: {len(files)}')

  warn(msg)


KeyError: COVID-19-Test_result_Reporting_15_06_2020.xlsx
KeyError: COVID-19-Test_result_Reporting_15_06_2020.xlsx
ValueError: COVID-19-Test_result_Reporting_15_06_2020.xlsx
Files processed: 310
Files in the folder: 311


In [22]:
# df_raw.head()

### **Clean Headers**

In [23]:
cols_rename = ['CASE_ID', 'S_NUM', 'AGE', 'AGE_UNIT', 'GEND', 'OCCU', 'NAT', 'COUNT_RES', 'S_COUNT_RES', 'TRAV_HIST', 
                    'TRAV_FROM', 'QUAR_PLACE', 'SYMPS', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'RESULT', 'DT_CONF']

In [24]:
# df_raw.columns = cols_rename

### **Clean Sample Numbers**

In [25]:
def cov_names(x):
    x = x.upper().replace(' ', '')
    if len(x) == 8:
        x = x.replace('COVC', 'COVC0')
    elif len(x) == 7:
        x = x.replace('COVC', 'COVC00')
    elif len(x) == 6:
        x = x.replace('COVC', 'COVC000')
    elif len(x) == 5:
        x = x.replace('COVC', 'COVC0000')
    return x

In [26]:
cols_of_interest2 = ['CASE_ID', 'S_NUM', 'NAME', 'AGE', 'AGE_UNIT', 'GEND', 'OCCU', 'NAT', 'COUNT_RES', 'S_COUNT_RES', 'TRAV_HIST', 
                    'TRAV_FROM', 'QUAR_PLACE', 'SYMPS', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'RESULT', 'DT_CONF']

In [27]:
df_raw.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged.xlsx", 
              index=False, header=cols_of_interest2)

In [28]:
df_mrg_mtdata = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged.xlsx", usecols=cols_rename)

### **Clean Sample Name**

In [29]:
df_name = df_mrg_mtdata.assign(S_NUM=df_mrg_mtdata['S_NUM'].apply(lambda x: cov_names(x)))

In [30]:
df_name[df_name['S_NUM'].str.len() < 8]

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
9936,COVE20,COVE20,39.0,Years,M,SECURITY GUARD,Kenya,Machakos,ATHI RIVER,No,,,,2020-09-10 00:00:00,2020-09-10 00:00:00,Negative,2020-09-11 00:00:00
9937,COVE21,COVE21,50.0,Years,M,SECURITY GUARD,Kenya,Machakos,ATHI RIVER,No,,,,2020-09-10 00:00:00,2020-09-10 00:00:00,Negative,2020-09-11 00:00:00
18352,11,NIL,40.0,Years,M,Nil,Kenya,Laikipia,Laikipia East,No,,NTRH,,2020-06-18 00:00:00,,Negative,2020-06-20 00:00:00
26726,KEMRI-1,KEMRI-1,,Years,,,Kenya,,,No,,,,,,Positive,19-04-2021
26730,KEMRI-2,KEMRI-2,,Years,,,Kenya,,,No,,,,,,Positive,19-04-2021
26731,KEMRI-3,KEMRI-3,,Years,,,Kenya,,,No,,,,,,Positive,19-04-2021
26732,KEMRI-4,KEMRI-4,,Years,,,Kenya,,,No,,,,,,Positive,19-04-2021
26733,KEMRI-5,KEMRI-5,,Years,,,Kenya,,,No,,,,,,Positive,19-04-2021
26734,KEMRI-6,KEMRI-6,,Years,,,Kenya,,,No,,,,,,Positive,19-04-2021
26735,KEMRI-7,KEMRI-7,,Years,,,Kenya,,,No,,,,,,Positive,19-04-2021


In [31]:
df_name[df_name['S_NUM'].str.contains('COVC') == False]

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
9936,COVE20,COVE20,39,Years,M,SECURITY GUARD,Kenya,Machakos,ATHI RIVER,No,,,,2020-09-10 00:00:00,2020-09-10 00:00:00,Negative,2020-09-11 00:00:00
9937,COVE21,COVE21,50,Years,M,SECURITY GUARD,Kenya,Machakos,ATHI RIVER,No,,,,2020-09-10 00:00:00,2020-09-10 00:00:00,Negative,2020-09-11 00:00:00
18352,11,NIL,40,Years,M,Nil,Kenya,Laikipia,Laikipia East,No,,NTRH,,2020-06-18 00:00:00,,Negative,2020-06-20 00:00:00
26142,723,DRC-BKV-723,56,Years,M,,Democratic Republic of the Congo,Bukavu,,,,,CO;RN;LT;GW,,02-08-2021,Positive,
26143,212,DRC-BKV-212,,Years,,,Democratic Republic of the Congo,Bukavu,,,,,,,02-08-2021,Positive,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27766,NPHL04/JKIA35537/2021,COVM1625,33,Years,F,Entertainer,Kenya,Nairobi,,No,,,,2021-11-28 00:00:00,2021-11-29 00:00:00,,2021-11-29 00:00:00
27767,NPHL04/JKIA35538/2021,COVM1626,36,Years,M,Consultant,United Kingdom (UK),Nairobi,,No,,,,2021-11-28 00:00:00,2021-11-29 00:00:00,,2021-11-29 00:00:00
27768,NPHL04/JKIA35539/2021,COVM1627,29,Years,M,,South Africa,Nairobi,,No,,,,,2021-11-29 00:00:00,,2021-11-29 00:00:00
27769,NPHL04/JKIA35540/2021,COVM1628,52,Years,M,,United Kingdom (UK),Nairobi,,No,,,,,2021-11-29 00:00:00,,2021-11-29 00:00:00


### **Clean Age**

In [32]:
df_name[df_name['AGE_UNIT'] == 'Days']

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
19700,KSCRH20/9/240,COVC24532,6,Days,M,,Kenya,Kisii,,No,,,FC,2021-09-21 00:00:00,2021-09-23 00:00:00,Positive,2021-09-23 00:00:00


In [33]:
df_months = df_name[df_name['AGE_UNIT'] == 'Months']

In [34]:
df_months.head()

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
733,NMS/EM/100-138,COVC09555,11,Months,F,,Kenya,Narok,RONGAI,No,,,,2020-09-01 00:00:00,,Negative,2020-09-02 00:00:00
1358,99/Thika Women Prison,COVC02581,1,Months,M,Inmate,Uganda,Kiambu,Thika,N,NIL,Prison,NIL,2020-07-01 00:00:00,,Negative,2020-07-03 00:00:00
1786,LUS/07,COVC17631,10,Months,F,,Kenya,Kiambu,KIKUYU,No,,,,2020-10-27 00:00:00,2020-10-03 00:00:00,Negative,2020-11-03 00:00:00
2637,TKA/TL5H/033,COVC18026,6,Months,M,,Kenya,Kiambu,THIKA,No,,,CO,2020-11-03 00:00:00,2020-11-04 00:00:00,Positive,2020-11-04 00:00:00
2794,TIG/COV/918,COVC18183,19,Months,M,Child,Kenya,Kiambu,LIMURU,No,,,,2020-11-03 00:00:00,2020-11-04 00:00:00,Negative,2020-11-04 00:00:00


In [35]:
df_mn2yrs = df_months.assign(AGE=df_months['AGE'].map(lambda x: round(x / 12, 1))).replace('Months', 'Years')

In [36]:
df_mn2yrs.head()

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
733,NMS/EM/100-138,COVC09555,0.9,Years,F,,Kenya,Narok,RONGAI,No,,,,2020-09-01 00:00:00,,Negative,2020-09-02
1358,99/Thika Women Prison,COVC02581,0.1,Years,M,Inmate,Uganda,Kiambu,Thika,N,NIL,Prison,NIL,2020-07-01 00:00:00,,Negative,2020-07-03
1786,LUS/07,COVC17631,0.8,Years,F,,Kenya,Kiambu,KIKUYU,No,,,,2020-10-27 00:00:00,2020-10-03 00:00:00,Negative,2020-11-03
2637,TKA/TL5H/033,COVC18026,0.5,Years,M,,Kenya,Kiambu,THIKA,No,,,CO,2020-11-03 00:00:00,2020-11-04 00:00:00,Positive,2020-11-04
2794,TIG/COV/918,COVC18183,1.6,Years,M,Child,Kenya,Kiambu,LIMURU,No,,,,2020-11-03 00:00:00,2020-11-04 00:00:00,Negative,2020-11-04


In [37]:
df_name[df_name['AGE_UNIT'] == 'Days']#.AGE_UNIT.unique()

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
19700,KSCRH20/9/240,COVC24532,6,Days,M,,Kenya,Kisii,,No,,,FC,2021-09-21 00:00:00,2021-09-23 00:00:00,Positive,2021-09-23 00:00:00


In [38]:
df_not_months = df_name[df_name['AGE_UNIT'] != 'Months']

In [39]:
df_not_months.shape

(27694, 17)

In [40]:
df_mn2yrs.shape

(77, 17)

In [41]:
df_years = df_not_months.append(df_mn2yrs)

In [42]:
df_years.shape

(27771, 17)

In [43]:
df_years['AGE_UNIT'].unique()

array(['Years', 'YEARS', nan, 'years', 'Nil', 'Days'], dtype=object)

In [44]:
df_years[df_years['AGE_UNIT'] == 'NO PATIENT DATASHEET']

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF


In [45]:
df_years[df_years['AGE_UNIT'].str.contains('M', 'F') == True]

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF


In [46]:
df_years[df_years['AGE_UNIT'] == 'Nil']

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
15615,15/07/RD01,COVC04083,Nil,Nil,M,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00
15616,15/07/RD02,COVC04084,Nil,Nil,M,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00
15617,15/07/RD03,COVC04085,Nil,Nil,M,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Positive,2020-07-17 00:00:00
15618,15/07/04,COVC04086,Nil,Nil,M,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00
15619,15/07/06,COVC04087,Nil,Nil,M,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00
15620,15/07/05,COVC04088,Nil,Nil,F,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00
15621,15/07/R07,COVC04089,Nil,Nil,M,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00
15622,15/07/R08,COVC04090,Nil,Nil,F,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00
15623,15/07/R09,COVC04091,Nil,Nil,M,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00
15624,15/07/R10,COVC04092,Nil,Nil,F,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,Nil,,Negative,2020-07-17 00:00:00


In [47]:
df_years_drop_age_unit = df_years.drop('AGE_UNIT', axis=1)

In [48]:
df_years_rencol = df_years_drop_age_unit.rename(columns={'AGE': 'AGE_YRS'})

In [49]:
df_years_rencol.head()

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
0,NMS/EM/01F,COVC22771,58,M,HCW,Kenya,Nairobi,LAVINGTON,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
1,,COVC22772,63,M,CEC,Kenya,Kiambu,THIKA,No,,,FLU,2020-12-31 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
2,,COVC22773,53,F,Psychologist,Kenya,Kiambu,THIKA,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
3,TL5H/TKA/001,COVC22774,52,F,Business,Kenya,Kiambu,THIKA,No,,,CO;DIB,2021-01-02 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
4,,COVC22775,41,F,HCW,Kenya,Kiambu,GATUNDU NORTH,No,,,CO;DIB,2021-01-03 00:00:00,2021-01-01 00:00:00,Positive,2021-01-01 00:00:00


In [50]:
df_years_rencol['AGE_YRS'].unique()

array([58, 63, 53, 52, 41, 9, 6, 39, 23, 36, 33, 42, 34, 32, 57, 56, 27,
       20, nan, 44, 26, 31, 35, 28, 37, 54, 29, 43, 24, 38, 47, 49, 40,
       51, 50, 25, 30, 45, 46, 22, 21, 48, 55, 65, 64, 60, 18, 16, 10, 17,
       15, 8, 13, 5, 12, 84, 59, 62, 68, 312, 61, 19, 69, 79, 70, 66, 2,
       86, 80, 72, 11, 88, 3, 4, 7, 74, 67, 87, 3.6, 1, 14, 1.5, 2.42,
       'None indicated', 73, 6.5, 2.5, 75, 97, 'Nil', 71, 89, 96, 82, 77,
       'Not indicated', 743, 78, 76, 'None', 9.5, 83, 'nil', 81, 98,
       'none indicated', 90, 85, 91, 100, 101, 92, 'not_indicated', 5.5,
       'Not Indicated', '37', '36', '40', '38', '29', 94, 105, 0.5,
       'not Indicated', '30', '44', '33', '27', '26', '51', '28',
       'None Indicated', 542, 0.8, 0.9, 0.1, 1.6, 1.1, 2.9, 3.7, 0.2, 1.7,
       0.3, 2.8, 0.7, 1.2, 2.1, 0.4, 4.5, 1.8, 0.6], dtype=object)

In [51]:
[x for x in df_years_rencol['AGE_YRS'].unique() if isinstance(x, str)]

['None indicated',
 'Nil',
 'Not indicated',
 'None',
 'nil',
 'none indicated',
 'not_indicated',
 'Not Indicated',
 '37',
 '36',
 '40',
 '38',
 '29',
 'not Indicated',
 '30',
 '44',
 '33',
 '27',
 '26',
 '51',
 '28',
 'None Indicated']

In [52]:
to_rep = ['None indicated',
 'Nil',
 'Not indicated',
 'None',
 'nil',
 'none indicated',
 'not_indicated',
 'Not Indicated',
 'not Indicated',
 'None Indicated',
 'NO PATIENT DATASHEET'
]

In [53]:
df_years_repnan2float = df_years_rencol.assign(AGE_YRS=df_years_rencol['AGE_YRS'].replace(to_rep, np.nan).astype(float))

In [54]:
df_years_repnan2float

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
0,NMS/EM/01F,COVC22771,58.0,M,HCW,Kenya,Nairobi,LAVINGTON,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
1,,COVC22772,63.0,M,CEC,Kenya,Kiambu,THIKA,No,,,FLU,2020-12-31 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
2,,COVC22773,53.0,F,Psychologist,Kenya,Kiambu,THIKA,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
3,TL5H/TKA/001,COVC22774,52.0,F,Business,Kenya,Kiambu,THIKA,No,,,CO;DIB,2021-01-02 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
4,,COVC22775,41.0,F,HCW,Kenya,Kiambu,GATUNDU NORTH,No,,,CO;DIB,2021-01-03 00:00:00,2021-01-01 00:00:00,Positive,2021-01-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23855,GSH/598,COVC16100,1.0,F,,Kenya,Nakuru,GILGIL,No,,,,2020-10-23 00:00:00,2020-10-24 00:00:00,Negative,2020-10-28 00:00:00
24453,LKP/NCRH/2121,COVC16315,0.9,M,,Kenya,Laikipia,LAIKIPIA WEST,No,,,FC;CO;D,2020-10-25 00:00:00,2020-10-27 00:00:00,Negative,2020-10-29 00:00:00
25040,NMS/EM/11065,COVC16833,0.8,M,,Kenya,Nairobi,DONHOLM,No,,,,2020-10-29 00:00:00,2020-10-29 00:00:00,Negative,2020-10-30 00:00:00
25296,MSA/LKN/10/159,COVC17089,0.6,M,,Kenya,Mombasa,LIKONI,No,,,CO,2020-10-22 00:00:00,2020-10-27 00:00:00,Negative,2020-10-30 00:00:00


### **Clean Gender**

In [55]:
df_years_repnan2float['GEND'].unique()

array(['M', 'F', 'f', nan, 'Female', 'Male', 'F`', 'MALE ', 'MALE', 'Nil',
       'T', 'm'], dtype=object)

In [56]:
df_years_repnan2float[df_years_repnan2float['GEND'].isna()].sort_values('S_NUM')

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
10317,,COVC00749,42.0,,Nil,Kenyan,Laikipia,Laikipia East,N,,Nanyuki Teaching and Referral Hospital,Asymtomatic,9th June 2020,"June 11, 2020",Neg,2020-06-12 00:00:00
19509,NYR221,COVC04494,,,,Kenya,Nyeri,,No,,,,2020-07-14 00:00:00,,Negative,2020-07-23 00:00:00
19510,NYR221,COVC04495,,,,Kenya,Nyeri,,No,,,,2020-07-14 00:00:00,,Negative,2020-07-23 00:00:00
19511,NYR221,COVC04496,,,,Kenya,Nyeri,,No,,,,2020-07-14 00:00:00,,Negative,2020-07-23 00:00:00
25590,NYR-1348-17,COVC05236,,,,,Nyeri,,,,,,2020-07-28 00:00:00,,Negative,2020-07-31 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26745,NPHL-12780,NPHL-12780,,,,Kenya,,,No,,,,,07-04-2021,Positive,19-04-2021
26749,NPHL1,NPHL1,,,,Kenya,,,No,,,,,07-04-2021,Positive,19-04-2021
26750,NPHL2,NPHL2,,,,Kenya,,,No,,,,,07-04-2021,Positive,19-04-2021
26751,NPHL3,NPHL3,,,,Kenya,,,No,,,,,07-04-2021,Positive,19-04-2021


In [57]:
df_years_repnan2float[df_years_repnan2float['GEND'] == 'Nil']

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
11940,,COVC00749,42.0,Nil,Nil,Kenyan,Laikipia,Laikipia East,N,,Nanyuki Teaching and Referral Hospital,Asymtomatic,9th June 2020,"June 11, 2020",Neg,2020-06-12 00:00:00
17278,Nil,COVC01628,,Nil,Nil,Nil,Nyeri,,No,,,,,,ms,2020-06-19 00:00:00


COVEs:
- COVC13844
- COVC13845
- COVC13846
- COVC13847
- COVC13848
- COVC13849
- COVC13850
- COVC13851
- COVC13852
- COVC13853
- COVC16256
- COVC16257

In [58]:
df_years_repgender = df_years_repnan2float.assign(GEND=df_years_repnan2float['GEND'].replace(['f', 'Female', 'F`', 'T'], 'F')
                                                 .replace(['Male', 'MALE ', 'MALE'], 'M'))

In [59]:
df_years_repgender['GEND'].unique()

array(['M', 'F', nan, 'Nil', 'm'], dtype=object)

In [60]:
df_years_repgender['GEND'].replace('Nil', np.nan, inplace=True)

In [61]:
df_years_repgender['GEND'].unique()

array(['M', 'F', nan, 'm'], dtype=object)

### **Clean Occupation**

In [62]:
df_years_repgender['OCCU'].unique()

array(['HCW', 'CEC', 'Psychologist', ..., 'AU Representative',
       'Entertainer', 'Child accompanying inmate mothe'], dtype=object)

In [63]:
df_years_repgender['OCCU'] = df_years_repgender['OCCU'].replace(['Nil','nil'], np.nan)

In [64]:
# series = pd.Series(df_years_repgender['OCCU'].str.strip().str.lower().unique()).to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-occupations.xlsx", index=False)
# series

### **Clean Nationality**

In [65]:
df_years_repgender['NAT'].unique()

array(['Kenya', 'Ethiopia', 'Somalia', 'Uganda', 'South Africa', 'India',
       'Canada', 'Tanzania', 'Kenyan', 'Zambia', 'Burundi',
       'United States of America (USA)', 'Germany', 'Sudan',
       'South Sudan', nan, 'Lebanon', 'Sweden', 'Nigeria', 'Benin',
       'United Kingdom (UK)', 'Netherlands', 'Cuba', 'Indian', 'Ghana',
       'Zimbabwe', 'Rwanda', 'Mauritania', 'Denmark', 'Pakistan',
       'Congo, Democratic Republic of the', 'Brazil', 'Tanzanian',
       'Italy', 'Australia', 'Colombia', 'Hungary', 'Djibouti', 'Finland',
       'Senegal', 'kenya', 'Yemen', 'Turkey', 'Philippines', 'Japan',
       'Ireland', 'Tunisia', 'Switzerland', 'Togo', 'Cameroon',
       'Burkina Faso', 'Portugal', 'Bangladesh', 'Madagascar',
       "Cote d'Ivoire", 'Egypt', 'China', 'KENYAN', 'Greece', 'Bulgaria',
       'Nil', 'Mozambique', 'Mali', 'Eritrea', 'Austria', 'Malawi',
       'France', 'Namibia', 'Congo, Republic of the', 'Spain',
       'Sri Lanka', 'United States', 'Jordan',
       '

In [66]:
df_years_repnat = df_years_repgender.assign(NAT=df_years_repgender['NAT'].replace('Nil', 'Unknown')
                                            .replace(['United States', 'Indian'
                                                      , 'Congo, Democratic Republic of the'
                                                     , 'Congo, Republic of the'] 
                                                     , ['United States of America (USA)','India'
                                                        , 'Democratic Republic of the Congo',
                                                       'Republic of the Congo'])
                                            .map(lambda x: str(x).replace(x, 'Kenya') if ('enya' in str(x).lower()) else x))

In [67]:
df_years_repnat['NAT'].unique()

array(['Kenya', 'Ethiopia', 'Somalia', 'Uganda', 'South Africa', 'India',
       'Canada', 'Tanzania', 'Zambia', 'Burundi',
       'United States of America (USA)', 'Germany', 'Sudan',
       'South Sudan', nan, 'Lebanon', 'Sweden', 'Nigeria', 'Benin',
       'United Kingdom (UK)', 'Netherlands', 'Cuba', 'Ghana', 'Zimbabwe',
       'Rwanda', 'Mauritania', 'Denmark', 'Pakistan',
       'Democratic Republic of the Congo', 'Brazil', 'Tanzanian', 'Italy',
       'Australia', 'Colombia', 'Hungary', 'Djibouti', 'Finland',
       'Senegal', 'Yemen', 'Turkey', 'Philippines', 'Japan', 'Ireland',
       'Tunisia', 'Switzerland', 'Togo', 'Cameroon', 'Burkina Faso',
       'Portugal', 'Bangladesh', 'Madagascar', "Cote d'Ivoire", 'Egypt',
       'China', 'Greece', 'Bulgaria', 'Unknown', 'Mozambique', 'Mali',
       'Eritrea', 'Austria', 'Malawi', 'France', 'Namibia',
       'Republic of the Congo', 'Spain', 'Sri Lanka', 'Jordan'],
      dtype=object)

### **Clean County of Res**

In [68]:
df_years_repnat.head()

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
0,NMS/EM/01F,COVC22771,58.0,M,HCW,Kenya,Nairobi,LAVINGTON,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
1,,COVC22772,63.0,M,CEC,Kenya,Kiambu,THIKA,No,,,FLU,2020-12-31 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
2,,COVC22773,53.0,F,Psychologist,Kenya,Kiambu,THIKA,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
3,TL5H/TKA/001,COVC22774,52.0,F,Business,Kenya,Kiambu,THIKA,No,,,CO;DIB,2021-01-02 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
4,,COVC22775,41.0,F,HCW,Kenya,Kiambu,GATUNDU NORTH,No,,,CO;DIB,2021-01-03 00:00:00,2021-01-01 00:00:00,Positive,2021-01-01 00:00:00


In [69]:
counties = (df_years_repnat['COUNT_RES']
                   .replace(['Nil', 'nan', '', 'Nan', 'Muranga ', 'Murang`a ', 'Niarobi']
                            , ['Unknown', 'Unknown', 'Unknown', 'Unknown', "Murang'a", "Murang'a", 'Nairobi'])
                   .apply(lambda x: str(x).strip().capitalize() if (isinstance(x, str)) else x)).fillna('Unknown')

In [70]:
'Nil' in list(counties)#.unique()

False

In [71]:
df_years_county = (df_years_repnat.assign(COUNT_RES=counties))#.assign(S_COUNT_RES=s_counties)

In [72]:
df_years_county['COUNT_RES'].unique()

array(['Nairobi', 'Kiambu', 'Kajiado', 'Machakos', 'Laikipia', 'Marsabit',
       'Makueni', "Murang'a", 'Isiolo', 'Migori', 'Mwanza', 'Zambia',
       'Burundi', 'Bukoba', 'Mombasa', 'Tanzania', 'Kisii', 'Nyamira',
       'Unknown', 'Garissa', 'Homabay', 'Nakuru', 'Samburu', 'Nyeri',
       'Tana river', 'Kitui', 'Wajir', 'Nyandarua', 'Kirinyaga', 'Meru',
       'Uasin gishu', 'Siaya', 'Bungoma', 'Kakamega', 'Nandi', 'Kilifi',
       'Kwale', 'Uganda', 'Mandera', 'Kampala', 'Kisumu', 'Badiere',
       'Embu', 'Busia', 'Kigali', 'Mukono/uganda', 'Congo', 'Vihiga',
       'Narok', 'Kericho', 'Trans nzoia', 'Elgeyo marakwet',
       'Taita taveta', 'Bomet', 'Bukavu', 'Thika', 'Tharakanithi'],
      dtype=object)

### **Clean Subcounty of Res**

In [73]:
len(df_years_repnat['S_COUNT_RES'].str.lower().unique())

954

In [74]:
len((df_years_repnat['S_COUNT_RES']
     .replace(['Nil', 'nan', '', 'Nan', 'Muranga ', 'Murang`a ']
                            , [np.nan, np.nan, np.nan, np.nan, "Murang'a", "Murang'a"])
                   .apply(lambda x: str(x).strip().capitalize())
     .unique()))

878

In [75]:
s_counties = (df_years_repnat['S_COUNT_RES']
     .replace(['Nil', 'nan', '', 'Nan', 'Muranga ', 'Murang`a ']
                            , ['Unknown', 'Unknown', 'Unknown', 'Unknown', "Murang'a", "Murang'a"])
                   .apply(lambda x: str(x).replace(str(x), 'Unknown') if ('indicate' in str(x)) else str(x).strip().capitalize())
              .fillna('Unknown'))

In [76]:
# sorted([s for s in s_counties.unique()])

In [77]:
df_years_scounty = df_years_county.assign(S_COUNT_RES=s_counties)

### **Clean Travel History**

In [78]:
df_years_scounty.head()

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
0,NMS/EM/01F,COVC22771,58.0,M,HCW,Kenya,Nairobi,Lavington,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
1,,COVC22772,63.0,M,CEC,Kenya,Kiambu,Thika,No,,,FLU,2020-12-31 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
2,,COVC22773,53.0,F,Psychologist,Kenya,Kiambu,Thika,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
3,TL5H/TKA/001,COVC22774,52.0,F,Business,Kenya,Kiambu,Thika,No,,,CO;DIB,2021-01-02 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
4,,COVC22775,41.0,F,HCW,Kenya,Kiambu,Gatundu north,No,,,CO;DIB,2021-01-03 00:00:00,2021-01-01 00:00:00,Positive,2021-01-01 00:00:00


In [79]:
df_years_scounty['TRAV_HIST'].unique()

array(['No', nan, 'Yes', 'Y', 'N', 'JH', 'no', 'NO ', 'Nil', 'YES',
       'NONE ', 'NONE', 'y', 'yes', 15.93], dtype=object)

In [80]:
df_travl = df_years_scounty.assign(TRAV_HIST=df_years_scounty['TRAV_HIST'].replace([np.nan, 'N', 'no', 'NO', 'Nil', 'NONE ', 'NONE'], 'No')
           .replace(['YES', 'Y', 'y', 'yes'], 'Yes'))


In [81]:
df_travl['TRAV_FROM'].unique()

array([nan, 'ELDORET', 'TANZANIA', 'Kisii', 'Nyamira', 'Nairobi',
       'Kendu-Bay;Kisii;Kisumu;Homabay;Oyugis', 'Mumias, Oyugis, Homabay',
       'NAIVASHA', 'NAIROBI;NYERI;NANYUKI', 'NIL', 'Ruaka', 'Ngoingwa',
       'Limuru', 'Thika', 'Embakasi', 'Uganda', 'Dubai', 'Nil',
       'Not indicated on form ', 'TORORO UGANDA', 'UGANDA', 'AWASI',
       'KIGALI', 'ENTEBE', 'NAIROBI', 'JINJA', 'KISUMU', 'KAMPALA',
       'KASESE', 'KILIFI COUNTY', 'KWALE COUNTY', 'Mwanza', 'Mombasa',
       'Geita', 'Isebania', 'Kilimanjaro', 'Migori', 'KENDU BAY',
       'KISUMU, KITALE', 'HOMABAY', 'KISII, KISUMU', 'HOMABAY, KISII',
       'NYAHURURU; NANYUKI', 'TORORO', 'KAMPALA UGANDA', 'RWANDA',
       'JINJA UGANDA', 'KAJIADO', 'NOT APPLICABLE ', 'SOUTH SUDAN ',
       'NYERI', 'NAIROBI,KISUMU,AWASI', 'OYUGIS', 'KWALE;KILIFI',
       'KAKAMEGA', 'KWALE', 'Eldoret', 'Malindi', 'Uasin Gishu', 'N',
       'Machakos', 'No', 'KILIFI/NAIROBI', 'NANYUKI SAGANA',
       'THARAKA NITHI', 'SHIMBA HILLS', 'KILI

In [82]:
df_travl2 = (df_travl.assign(TRAV_FROM=df_travl['TRAV_FROM'].replace(['nan','NIL', 'Nil', 'Not indicated on form '
                  , 'NOT APPLICABLE ', 'N', 'N', 'No', 'not_indicated', 'Not applicable', 'Not indicated'],np.nan)
                    .replace(['Mombasa (stayed in mtwapa for 4 days)', 'KINANGO MSAMBWENI', 'Nan'
                              , 'Mombasa (Stayed in Mtwapa for 4 days)'], 
                    ['Mtwapa', 'Kinangop;Msambweni', np.nan, 'Mtwapa'])
                    .apply(lambda x: str(x).replace('/', ';')
                    .replace(' and ', ';')
                    .replace('-', ';')
                    .replace(',', ';')
                    .replace(' ; ', ';')
                    .replace('; ', ';')
                    .strip().capitalize() if (isinstance(x, str)) else x)))

In [83]:
df_travl2['TRAV_FROM'].unique()



array([nan, 'Eldoret', 'Tanzania', 'Kisii', 'Nyamira', 'Nairobi',
       'Kendu;bay;kisii;kisumu;homabay;oyugis', 'Mumias;oyugis;homabay',
       'Naivasha', 'Nairobi;nyeri;nanyuki', 'Ruaka', 'Ngoingwa', 'Limuru',
       'Thika', 'Embakasi', 'Uganda', 'Dubai', 'Tororo uganda', 'Awasi',
       'Kigali', 'Entebe', 'Jinja', 'Kisumu', 'Kampala', 'Kasese',
       'Kilifi county', 'Kwale county', 'Mwanza', 'Mombasa', 'Geita',
       'Isebania', 'Kilimanjaro', 'Migori', 'Kendu bay', 'Kisumu;kitale',
       'Homabay', 'Kisii;kisumu', 'Homabay;kisii', 'Nyahururu;nanyuki',
       'Tororo', 'Kampala uganda', 'Rwanda', 'Jinja uganda', 'Kajiado',
       'South sudan', 'Nyeri', 'Nairobi;kisumu;awasi', 'Oyugis',
       'Kwale;kilifi', 'Kakamega', 'Kwale', 'Malindi', 'Uasin gishu',
       'Machakos', 'Kilifi;nairobi', 'Nanyuki sagana', 'Tharaka nithi',
       'Shimba hills', 'Kilifi', 'Netherlands', 'Nairobi;usa', 'Usa',
       'Homa;bay;kisumu;nairobi;kerugoya;kutus', 'Kimilili;maragua',
       'Tave

In [84]:
df_travl2.head()

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
0,NMS/EM/01F,COVC22771,58.0,M,HCW,Kenya,Nairobi,Lavington,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
1,,COVC22772,63.0,M,CEC,Kenya,Kiambu,Thika,No,,,FLU,2020-12-31 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
2,,COVC22773,53.0,F,Psychologist,Kenya,Kiambu,Thika,No,,,,2021-01-01 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
3,TL5H/TKA/001,COVC22774,52.0,F,Business,Kenya,Kiambu,Thika,No,,,CO;DIB,2021-01-02 00:00:00,2021-01-01 00:00:00,Negative,2021-01-01 00:00:00
4,,COVC22775,41.0,F,HCW,Kenya,Kiambu,Gatundu north,No,,,CO;DIB,2021-01-03 00:00:00,2021-01-01 00:00:00,Positive,2021-01-01 00:00:00


### **Clean Symptoms**

- GW: General weakness
- FC: Fever/Chills
- CO: Cough
- ST: Sore throat
- RN: Runny nose
- SB: Shortness of breath/Difficulty in breathing
- D: Diarrhoea
- NV: Nausea/Vomiting
- H: Headache
- IC: Irritability/Confusion
- P: Pain
- LS: Loss of smell
- LT: Loss of taste
- P-M: Muscular pain
- P-A: Abdominal pain
- P-B: Back pain
- P-C: Chest pain
- P-J: Joint pain
- BA: Body aches
- SZ: Sneezing
- HP: Hypertension

In [85]:
df_travl2['SYMPS'] = (df_travl2['SYMPS'].apply(lambda x: str(x).replace(str(x), 'NA') if ('symtom' in str(x)) else str(x).replace('F/C', 'FC'))
                      .replace(['[=]','nan'], np.nan))
pd.Series(df_travl2['SYMPS']).to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-symptoms.xlsx", index=False)

### **Clean Dates**

In [86]:
def my_date_parser(x):
    try:
        if type(x) == pd.Timestamp:
            x = x#.strftime('%d-%m-%Y')
        if type(x) != pd.Timestamp and type(x) != (int, float, pd.NaT):
            x = pd.to_datetime(x, errors='ignore', dayfirst=True)#.strftime('%d-%m-%Y')
        if x in ['None indicated', 'NIL', 'Not indicated on form ',
                    'Nil', 'None indicated', 'Leaked Sample - Empty', 'nan']:
            x = pd.NaT
    except (ValueError, AttributeError):
        x = pd.NaT
    return x

In [87]:
df_dt1 = df_travl2.assign(DT_SAM_COLL=df_travl2['DT_SAM_COLL'].apply(lambda x: my_date_parser(x)))

In [88]:
df_dt2 = df_dt1.assign(DT_SAM_RECEP=df_dt1['DT_SAM_RECEP'].apply(lambda x: my_date_parser(x)))

In [89]:
df_dts_cln = df_dt2.assign(DT_CONF=df_dt2['DT_CONF'].apply(lambda x: my_date_parser(x)))

In [90]:
df_dts_cln.head()

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
0,NMS/EM/01F,COVC22771,58.0,M,HCW,Kenya,Nairobi,Lavington,No,,,,2021-01-01,2021-01-01,Negative,2021-01-01
1,,COVC22772,63.0,M,CEC,Kenya,Kiambu,Thika,No,,,FLU,2020-12-31,2021-01-01,Negative,2021-01-01
2,,COVC22773,53.0,F,Psychologist,Kenya,Kiambu,Thika,No,,,,2021-01-01,2021-01-01,Negative,2021-01-01
3,TL5H/TKA/001,COVC22774,52.0,F,Business,Kenya,Kiambu,Thika,No,,,CO;DIB,2021-01-02,2021-01-01,Negative,2021-01-01
4,,COVC22775,41.0,F,HCW,Kenya,Kiambu,Gatundu north,No,,,CO;DIB,2021-01-03,2021-01-01,Positive,2021-01-01


### **Clean Results**

In [91]:
df_dts_cln['RESULT'].unique()

array(['Negative', 'Positive', 'positive', 'neg', 'pos', 'ms', 'negative',
       'Neg', 'Pos', 'MS', 'Positve', 'Positive ', nan, 'POSITIVE',
       'M2000'], dtype=object)

In [92]:
df_dts_cln['RESULT'].map(lambda x: x.strip().lower() if (isinstance(x, str)) else x).unique()

array(['negative', 'positive', 'neg', 'pos', 'ms', 'positve', nan,
       'm2000'], dtype=object)

In [93]:
def cln_results(x):
    if isinstance(x, str):
        x = x.strip().lower()
        if 'neg' in x:
            x = x.replace(x, 'Negative')
        elif 'pos' in x:
            x = x.replace(x, 'Positive')
        elif x == 'inconclusive':
            x = x.replace(x, 'Negative')
    return x
        

In [94]:
df_dts_cln['RESULT'] = df_dts_cln['RESULT'].map(lambda x: cln_results(x))

In [95]:
df_dts_cln['RESULT'].unique()

array(['Negative', 'Positive', 'ms', nan, 'm2000'], dtype=object)

In [96]:
df_dts_cln.head(2)

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
0,NMS/EM/01F,COVC22771,58.0,M,HCW,Kenya,Nairobi,Lavington,No,,,,2021-01-01,2021-01-01,Negative,2021-01-01
1,,COVC22772,63.0,M,CEC,Kenya,Kiambu,Thika,No,,,FLU,2020-12-31,2021-01-01,Negative,2021-01-01


### **Remove Duplicates**

In [97]:
df_cln_srt = df_dts_cln.sort_values(['S_NUM', 'DT_CONF'])

In [98]:
df_cln_srt[df_cln_srt.duplicated(['S_NUM', 'DT_CONF'], keep=False) == True]

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
10276,,COVC00708,35.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
11899,,COVC00708,35.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
10277,,COVC00709,32.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
11900,,COVC00709,32.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
10278,,COVC00710,45.0,M,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,COVE1027,COVC24610,69.0,M,,Canada,Nairobi,Uthiru,No,,,,2021-10-01,2021-10-01,Negative,2021-10-01
414,COVE1028,COVC24611,43.0,F,,Kenya,Nairobi,Nairobi,No,,,,2021-10-01,2021-10-01,Negative,2021-10-01
416,COVE1028,COVC24611,43.0,F,,Kenya,Nairobi,Nairobi,No,,,,2021-10-01,2021-10-01,Negative,2021-10-01
26494,,KEM-21-03-94425,17.0,M,,Kenya,Kiambu,Thika,No,,,"Cough, Runny nose",2021-03-04,NaT,Positive,2021-03-11


In [99]:
df_cln_srt[df_cln_srt.duplicated(keep=False) == True]

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
10276,,COVC00708,35.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
11899,,COVC00708,35.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
10277,,COVC00709,32.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
11900,,COVC00709,32.0,F,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
10278,,COVC00710,45.0,M,,Kenya,Laikipia,Laikipia east,No,,Nanyuki Teaching and Referral Hospital,,2020-06-09,2020-06-11,Negative,2020-06-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,COVE1027,COVC24610,69.0,M,,Canada,Nairobi,Uthiru,No,,,,2021-10-01,2021-10-01,Negative,2021-10-01
414,COVE1028,COVC24611,43.0,F,,Kenya,Nairobi,Nairobi,No,,,,2021-10-01,2021-10-01,Negative,2021-10-01
416,COVE1028,COVC24611,43.0,F,,Kenya,Nairobi,Nairobi,No,,,,2021-10-01,2021-10-01,Negative,2021-10-01
26494,,KEM-21-03-94425,17.0,M,,Kenya,Kiambu,Thika,No,,,"Cough, Runny nose",2021-03-04,NaT,Positive,2021-03-11


In [100]:
df_dedud = df_cln_srt.drop_duplicates(['S_NUM', 'DT_CONF'], keep='first')#['COUNT_RES'].fillna('Unknown', inplace=True)

### **QC Dates**

In [101]:
sr_date_diff = df_dedud['DT_CONF'] - df_dedud['DT_SAM_COLL']

In [102]:
mask1 = sr_date_diff >= '60D'
mask2 = sr_date_diff != pd.NaT
mask3 = df_dedud['DT_SAM_COLL'] > df_dedud['DT_CONF']
# mask3 = df_dedud['DT_CONF']

In [103]:
df_dedud[mask1 == True]

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
10483,047/NTRH/005,COVC06893,50.0,F,Health Care Worker,Kenya,Nairobi,Mathare,No,,None Indicated,,2020-05-10,NaT,Negative,2020-08-12
10771,NMS/EM/100-240,COVC09826,30.0,M,PLUMBER,Kenya,Kwale,Samburu,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09
10772,NMS/EM/100-241,COVC09827,39.0,F,BUSSINESS,Kenya,Kiambu,Kabete,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09
10773,NMS/EM/100-517,COVC09828,24.0,M,BUSSINESS,Kenya,Nairobi,Iindustrial area,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09
10774,NMS/EM/100-516,COVC09829,25.0,M,BUSSINESS,Kenya,Nairobi,Kariobangi south,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09
10775,NMS/EM/100-515,COVC09830,36.0,F,STEWARD,Kenya,Kajiado,Kitengela,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09
10776,NMS/EM/100-518,COVC09831,19.0,M,STUDENT,Kenya,Nairobi,Kibra,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09
10777,NMS/EM/100-515,COVC09832,28.0,F,BUSSINESS,Kenya,Nairobi,Langata,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09
10778,NMS/EM/100-512,COVC09833,25.0,F,STEWARD,Kenya,Nairobi,Kawangware,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09
10779,NMS/EM/100-517,COVC09834,47.0,M,BUSSINESS,Kenya,Nairobi,Kawangware,No,,,,2020-10-09,2020-11-09,Negative,2020-12-09


### **Export Cleaned Data**

In [104]:
df_dedud.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-cln.xlsx", index=False)

In [105]:
df_dedud[df_dedud['RESULT'] == 'Positive'].to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-cln-pos.xlsx", index=False)

In [106]:
df_dedud[df_dedud['S_NUM'].str.contains('COVC') == False]

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
9936,COVE20,COVE20,39.0,M,SECURITY GUARD,Kenya,Machakos,Athi river,No,,,,2020-09-10,2020-09-10,Negative,2020-09-11
9937,COVE21,COVE21,50.0,M,SECURITY GUARD,Kenya,Machakos,Athi river,No,,,,2020-09-10,2020-09-10,Negative,2020-09-11
26790,NPHL03/BM003,COVM00449,29.0,M,,,Nairobi,0,No,,,,2021-08-08,2021-09-15,Positive,2021-08-08
26791,NPHL03/JSC12,COVM00450,42.0,F,,,Nairobi,0,No,,,,2021-08-08,2021-09-15,Positive,2021-08-08
26792,NPHL03/RRT6/77,COVM00451,,,,,Nairobi,0,No,,,,2021-08-08,2021-09-15,Positive,2021-08-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26745,NPHL-12780,NPHL-12780,,,,Kenya,Unknown,Nan,No,,,,NaT,2021-04-07,Positive,2021-04-19
26749,NPHL1,NPHL1,,,,Kenya,Unknown,Nan,No,,,,NaT,2021-04-07,Positive,2021-04-19
26750,NPHL2,NPHL2,,,,Kenya,Unknown,Nan,No,,,,NaT,2021-04-07,Positive,2021-04-19
26751,NPHL3,NPHL3,,,,Kenya,Unknown,Nan,No,,,,NaT,2021-04-07,Positive,2021-04-19


In [107]:
df_dedud[df_dedud['S_NUM'].str.contains('COVC') == False]

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF
9936,COVE20,COVE20,39.0,M,SECURITY GUARD,Kenya,Machakos,Athi river,No,,,,2020-09-10,2020-09-10,Negative,2020-09-11
9937,COVE21,COVE21,50.0,M,SECURITY GUARD,Kenya,Machakos,Athi river,No,,,,2020-09-10,2020-09-10,Negative,2020-09-11
26790,NPHL03/BM003,COVM00449,29.0,M,,,Nairobi,0,No,,,,2021-08-08,2021-09-15,Positive,2021-08-08
26791,NPHL03/JSC12,COVM00450,42.0,F,,,Nairobi,0,No,,,,2021-08-08,2021-09-15,Positive,2021-08-08
26792,NPHL03/RRT6/77,COVM00451,,,,,Nairobi,0,No,,,,2021-08-08,2021-09-15,Positive,2021-08-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26745,NPHL-12780,NPHL-12780,,,,Kenya,Unknown,Nan,No,,,,NaT,2021-04-07,Positive,2021-04-19
26749,NPHL1,NPHL1,,,,Kenya,Unknown,Nan,No,,,,NaT,2021-04-07,Positive,2021-04-19
26750,NPHL2,NPHL2,,,,Kenya,Unknown,Nan,No,,,,NaT,2021-04-07,Positive,2021-04-19
26751,NPHL3,NPHL3,,,,Kenya,Unknown,Nan,No,,,,NaT,2021-04-07,Positive,2021-04-19


11-09-2020

COVC9774
COVC9775
COVE20
COVE21
COVC9776
COVC9777


### **ID Misssing Data**

In [108]:
start = 0
df_missing = pd.DataFrame(columns=cols_rename)
for num in df_dedud['S_NUM'].sort_values():
#     curr_row = df_dts_cln[df_dts_cln['S_NUM']]
    try:
        curr = int(num.lstrip('COVC').lstrip('0'))
        
        if curr - start > 1:
            start_id = f'COVC{start}'
            cov_id = f'COVC{curr}'
            df_missing = df_missing.append(df_dts_cln[df_dts_cln['S_NUM'] == cov_names(start_id)])
            df_missing = df_missing.append(df_dts_cln[df_dts_cln['S_NUM'] == cov_names(cov_id)])
        start = curr
    except ValueError:
#         print(f'ValueError: {num}')
        pass

In [109]:
df_missing.head()

Unnamed: 0,CASE_ID,S_NUM,AGE,AGE_UNIT,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,AGE_YRS
12969,,COVC01023,,,M,Food handler,Kenya,Nairobi,Embakasi,No,,Nil,,2020-06-08,2020-06-14,Negative,2020-06-14,35.0
25855,,COVC01117,,,M,,Kenya,Nairobi,Embakasi,No,,Naivas Capital Centre,,2020-06-08,NaT,Negative,2020-06-15,30.0
564,KCSS/TTEN102/2021,COVC25043,,,,,Kenya,Unknown,Nan,No,,,,2021-10-28,2021-10-29,Negative,2021-11-01,
4113,KCSS/GGHK021/2021,COVC25058,,,,,Kenya,Unknown,Nan,No,,,,NaT,NaT,Negative,2021-11-05,


In [110]:
(df_missing[['S_NUM', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'DT_CONF']]
 .sort_values(['S_NUM', 'DT_CONF'])
 .drop_duplicates('S_NUM', keep='last')
 .to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-missing.xlsx", index=False))

### **ID Missing Results**

In [111]:
(df_dts_cln[df_dts_cln['RESULT'].isin(['Positive', 'Negative']) == 
            False][['S_NUM', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'DT_CONF', 'RESULT']]
.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-results-missing.xlsx", index=False))

In [112]:
# plt.style.available

### **Metadata + Cts**

In [113]:
df_Cts = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/runsheet-cts.xlsx")[['Sample_Name', 'Well_Position', 'Ct_Mean', 'Dt_Run']]

In [114]:
df_metCts = df_dedud.set_index('S_NUM').merge(df_Cts.set_index('Sample_Name'), how='outer', left_index=True, right_index=True).reset_index().rename(columns={'index': 'S_NUM', 'Ct_Mean': 'AVG_Ct'})

In [115]:
df_metCts.head()

Unnamed: 0,S_NUM,CASE_ID,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,Well_Position,AVG_Ct,Dt_Run
0,A1,,,,,,,,,,,,NaT,NaT,,NaT,D01,34.4,2021-06-30 18:48:52
1,C3,,,,,,,,,,,,NaT,NaT,,NaT,D03,35.0,2021-06-30 18:48:52
2,COVC00001,,35.0,F,Refugee,,Garissa,Dadaab,No,,,,NaT,NaT,Negative,2020-06-02,,,NaT
3,COVC00002,,29.0,M,Refugee,,Garissa,Dadaab,No,,,,NaT,NaT,Negative,2020-06-02,,,NaT
4,COVC00003,,50.0,M,Refugee,,Garissa,Dadaab,No,,,,NaT,NaT,Negative,2020-06-02,,,NaT


In [116]:
df_metCts.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-resultsCts-merged-cln.xlsx", index=False)

In [117]:
df_metCts.shape

(27597, 19)

In [118]:
df_metCts[df_metCts['RESULT'] == 'Positive'].to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-resultsCts-merged-cln-pos.xlsx", index=False)

#### **Confirm updated file in the following cell**

In [119]:
df_sequenced_samples = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-sequenced-samples-IDs_01-11-2021.xlsx")
seqd_list = list(df_sequenced_samples['SAMPLE'])

In [120]:
df_sequenced_samples.head()

Unnamed: 0,SAMPLE
0,COVC00854
1,COVC00867
2,COVC00893
3,COVC00915
4,COVC00962


In [121]:
df_dedud_pos = df_metCts[df_metCts['RESULT'] == 'Positive']
df_dedud_pos_unseq = df_dedud_pos[df_dedud_pos['S_NUM'].isin(seqd_list) == False]
df_dedud_pos_unseq.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-unsequenced-pos-samples_{dt}.xlsx", index=False)

In [122]:
df_dedud_pos_unseq

Unnamed: 0,S_NUM,CASE_ID,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,Well_Position,AVG_Ct,Dt_Run
138,COVC00137,,45.0,M,Refugee,,Garissa,Dadaab,No,,Surveillance,,2020-05-31,NaT,Positive,2020-06-03,D09,39.5,2020-06-03 17:04:05
155,COVC00154,,25.0,M,Refugee,,Garissa,Dadaab,No,,Surveillance,,2020-05-31,NaT,Positive,2020-06-03,F02,36.9,2020-06-03 17:04:05
486,COVC00485,KAM/MAL4342,29.0,M,Truck driver,,Nairobi,Nairobi,Yes,Nairobi,Nil,,2020-06-07,2020-06-09,Positive,2020-06-09,,,NaT
518,COVC00517,KAM/MAL4313,39.0,M,,,Mombasa,Unknown,Yes,Mombasa,Nil,,2020-06-06,2020-06-09,Positive,2020-06-09,,,NaT
520,COVC00519,KAM/MAL4308,33.0,M,,,Mombasa,Likoni,Yes,Mombasa,Nil,,2020-06-06,2020-06-09,Positive,2020-06-09,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26962,COVM01614,DRC02/BKV03931/2021,38.0,F,,Democratic Republic of the Congo,Bukavu,South kivu,No,,,,2021-04-11,2021-11-11,Positive,2021-04-11,,,NaT
26963,COVM01615,DRC02/BKV05064/2021,43.0,F,,Democratic Republic of the Congo,Bukavu,South kivu,No,,,,2021-05-14,2021-11-11,Positive,2021-05-14,,,NaT
26964,COVM01616,DRC02/BKV06029/2021,61.0,M,,Democratic Republic of the Congo,Bukavu,South kivu,No,,,"CO;H;GW;LS,FC;LT",2021-05-26,2021-11-11,Positive,2021-05-26,,,NaT
26965,COVM01629,DRC02/BKV00454/2021,35.0,M,,Democratic Republic of the Congo,Bukavu,South kivu,No,,,CO;RN;LT;GW,2021-07-07,2021-11-11,Positive,2021-07-07,,,NaT


In [123]:
df_dedud_pos = df_metCts[df_metCts['RESULT'] == 'Positive']
df_dedud_pos_seq = df_dedud_pos[df_dedud_pos['S_NUM'].isin(seqd_list) == True]
df_dedud_pos_seq.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-sequenced-pos-samples_{dt}.xlsx", index=False)

### Positives 2021 - Unsequenced

In [124]:
df_dedud_pos_unseq_21 = df_dedud_pos_unseq[df_dedud_pos_unseq['DT_CONF'] >= pd.to_datetime('2021-01-01 00:00:00')]
df_dedud_pos_unseq_21.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-unsequenced-pos-samples-2021_{dt}.xlsx", index=False)