$15^{th}$ Jul 2021

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import glob, os, re
import matplotlib.pyplot as plt
from datetime import  datetime
from ipywidgets import widgets, interactive

In [2]:
dt = datetime.today().strftime(format='%d-%m-%Y')

### *Functions*

In [3]:
def replace(string, substitutions):

    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)

In [4]:
def rename_cols_subset_df(df, cols):
    df_new = df.copy()
    df_new.columns = list(map(lambda x: x.split('(')[0].strip(' _'), [col.replace(' ', '_').replace('\n', '_').upper() for col in df.columns]))
    df_cln = df_new[df_new['SAMPLE_NUMBER'].notna()]
    return df_cln[cols]

In [5]:
def my_date_parser(x):
    try:
        if type(x) == pd.Timestamp:
            x = x#.strftime('%d-%m-%Y')
        if type(x) != pd.Timestamp and type(x) != (int, float, pd.NaT):
            x = pd.to_datetime(x, errors='ignore', dayfirst=True)#.strftime('%d-%m-%Y')
        if x in ['None indicated', 'NIL', 'Not indicated on form ',
                    'Nil', 'None indicated', 'Leaked Sample - Empty', 'nan']:
            x = pd.NaT
    except (ValueError, AttributeError):
        x = pd.NaT
    return x

### **Establish parent directory**

In [6]:
#uniqueness in directory and file names is assumed for all analyses
sars_dir = "SARS-CoV-2"
home_dir = os.getenv('HOME')
parent_dir = glob.glob(f'{home_dir}/**/{sars_dir}', recursive=True)[0]

In [7]:
cols_of_interest = ['CASE_ID','SAMPLE_NUMBER', 'NAME', 'AGE', 'AGE_UNIT', 'GENDER', 'OCCUPATION', 'NATIONALITY'
           , 'COUNTY_OF_RESIDENCE', 'SUB_COUNTY_OF_RESIDENCE', 'HAS_TRAVEL_HOSTORY', 'TRAVEL_FROM'
           , 'QUARANTINE_FACILITY/HOSPITAL/HOMESTEAD', 'SYMTOMS_SHOWN', 'SAMPLE_TYPE', 'DATE_OF_SAMPLE_COLLECTION'
           , 'DATE_SAMPLE_RECEIVED_IN_THE_LAB', 'RESULT', 'LAB_CONFIRMATION_DATE', 'VACCINATION_STATUS_Y/N'
           , 'DOSAGE_C/NC']

In [8]:
files = os.listdir(glob.glob(f'{parent_dir}/**/ResultsDBCurated', recursive=True)[0])
files = sorted(files)

In [9]:
# keyerrors = []
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, 'DATE_SAMPLE_RECEIVED_IN_THE_LAB')
# #         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [10]:
# files = ['COVID-19-Test_result_Reporting_01_07_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_01_07_2020-B.xlsx',
#  'COVID-19-Test_result_Reporting_05_09_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_05_09_2020-B.xlsx',
#  'COVID-19-Test_result_Reporting_08_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_15_07_2020.xlsx',
#  'COVID-19-Test_result_Reporting_18_06_2020.xlsx',
#  'COVID-19-Test_result_Reporting_31_07_2020-A.xlsx',
#  'COVID-19-Test_result_Reporting_31_07_2020-B.xlsx']

In [11]:
# for file in files:
#     df = pd.read_excel(f'{data_dir}/{file}', header=1)
#     try:
#         df.insert(27, 'DATE SAMPLE RECEIVED IN THE LAB', pd.NaT)
#         df.to_excel(f'/home/douso/Documents/TrendData/Results/add_col/{file}', index=False, startrow=1)
#     except:
#         raise
#         print (f'Error: {file}')
#     finally: pass

In [12]:
# keyerrors = []
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, cols_of_interest)
#         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [13]:
headers = ['TESTING LAB', 'CASE ID', 'TYPE OF CASE'
       , 'SAMPLE NUMBER', 'NAME', 'ID/PASSPORT NUMBER', 'AGE'
       , 'AGE UNIT', 'GENDER', 'PHONE NUMBER'
       , 'OCCUPATION', 'NATIONALITY', 'COUNTY OF RESIDENCE'
       , 'SUB COUNTY OF RESIDENCE', 'VILLAGE/ESTATE OF RESIDENCE', 'WARD'
       , 'COUNTY OF DIAGNOSIS', 'HAS TRAVEL HOSTORY'
       , 'TRAVEL FROM', 'CONTACT WITH CASE Y/N', 'CONFIRMED CASE NAME'
       , 'QUARANTINE FACILITY/HOSPITAL/HOMESTEAD', 'HAVE SYMPTOMS Y/N'
       , 'DATE OF ONSET OF SYMPTOMS', 'SYMTOMS SHOWN'
       , 'SAMPLE TYPE', 'DATE OF SAMPLE COLLECTION', 'DATE SAMPLE RECEIVED IN THE LAB'
       , 'RESULT', 'LAB CONFIRMATION DATE', 'VACCINATION STATUS Y/N', 'DOSAGE C/NC']

In [14]:
id_headers = ['SAMPLE_NUMBER', 'NAME', 'ID/PASSPORT_NUMBER', 'AGE',
       'AGE_UNIT', 'GENDER', 'PHONE_NUMBER', 'RESULT',
       'LAB_CONFIRMATION_DATE']

In [15]:
# for file in keyerrors[:-1]:
#     df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#     df_file.columns = headers
#     df_file.to_excel(f'{data_dir}/add_col2/{file}', startrow=1)

The following 60 sheets, besides some June sheets (manually fixed earlier), lacked the `DATE_SAMPLE_RECEIVED_IN_THE_LAB` column, it was added for uniformity:

- 'COVID-19-Test_result_Reporting_01_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_01_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_02_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_03_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_04_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_05_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_06_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_06_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_07_09_2020.xlsx'
- 'COVID-19-Test_result_Reporting_08_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_08_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_09_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_09_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_10_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_11_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_12_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_13_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_13_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_14_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_14_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_15_08_2020-A.xlsx'
- 'COVID-19-Test_result_Reporting_15_08_2020-B.xlsx'
- 'COVID-19-Test_result_Reporting_16_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_16_10_2020 (1).xlsx'
- 'COVID-19-Test_result_Reporting_17_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_17_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_18_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_18_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_19_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_19_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_20_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_20_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_21_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_22_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_23_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_24_08_2020-A.xlsx'
- 'COVID-19-Test_result_Reporting_24_08_2020-B.xlsx'
- 'COVID-19-Test_result_Reporting_25_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_25_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_27_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_28_08_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_07_2020.xlsx'
- 'COVID-19-Test_result_Reporting_29_08_2020.xlsx'

In [16]:
# for file in files:
#     df = pd.read_excel(f'{data_dir}/{file}', header=1)
#     try:
#         df.insert(27, 'DATE SAMPLE RECEIVED IN THE LAB', np.nan)
#         df.to_excel(f'{data_dir}/add_col/{file}', index=False, startrow=1)
#     except:
#         raise
#         print (f'Error: {file}')
#     finally: pass

The follwing sheets have missing values for the `RESULT` column:
- 'COVID-19-Test_result_Reporting_20_06_2020.xlsx'
- 'COVID-19-Test_result_Reporting_16_06_2020.xlsx'

Updated column names of some of the Jun files ['GENDER_(M/F)', 'LAB_CONFIRMATION_DATE', 'RESULT']

In [17]:
# df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     if file.endswith('.xlsx'):
#         try:
#             df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#             df_data = rename_cols_subset_df(df_file, cols_of_interest)
#             df_raw = df_raw.append(df_data)
#             count += 1
#         except: 
#             if KeyError:
# #                 raise
#                 print(f'KeyError: {file}')
#             elif ValueError:
#                 print(f'ValueError: {file}')
#         finally: pass
#     pass
# print(f'Files processed: {count}')
# print(f'Files in the folder: {len(files)}')

### Travel History Missing - the HOSTORY misspelling

- COVID-19-Test_result_Reporting_09_09_2020.xlsx
- COVID-19-Test_result_Reporting_11_09_2020.xlsx
- COVID-19-Test_result_Reporting_12_09_2020.xlsx
- COVID-19-Test_result_Reporting_14_09_2020.xlsx
- COVID-19-Test_result_Reporting_15_09_2020.xlsx
- COVID-19-Test_result_Reporting_18_10_2020.xlsx
- Files processed: 249
- Files in the folder: 256

In [18]:
# keyerrors = []
# # df_raw = pd.DataFrame()
# count = 0
# for file in files:
#     try:
#         df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#         df_data = rename_cols_subset_df(df_file, cols_of_interest)
# #         df_raw = df_raw.append(df_data)
#         count += 1
#     except: 
#         if KeyError:
#             keyerrors.append(file)
#         elif ValueError:
#             print(f'ValueError: {file}')
#     finally: pass
# print(len(keyerrors))
# sorted(keyerrors)

In [19]:
# files = ['COVID-19-Test_result_Reporting_09_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_11_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_12_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_14_09_2020.xlsx',
#  'COVID-19-Test_result_Reporting_15_09-2020.xlsx',
#  'COVID-19-Test_result_Reporting_18_10_2020.xlsx']

In [20]:
# for file in files:
#     df_file = pd.read_excel(f'{data_dir}/{file}', header=1)
#     (df_file.rename(columns={'HAS TRAVEL HISTORY(LAST 14 DAYS) Y/N': 'HAS TRAVEL HOSTORY(LAST 14 DAYS) Y/N'}, errors='raise').
#     to_excel(f'{data_dir}/rename_col/{file}', index=False, startrow=1))

### Start of find repeat tests

### End of find repeat tests

In [21]:
id_headers = ['SAMPLE_NUMBER', 'NAME', 'AGE',
       'AGE_UNIT', 'GENDER', 'RESULT',
       'LAB_CONFIRMATION_DATE']

In [22]:
df_raw = pd.DataFrame()
count = 0
for file in files:
    if file.endswith('.xlsx'):
        try:
            df_file = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/ResultsDBCurated', recursive=True)[0]}/{file}", header=1)
            if 'VACCINATION STATUS Y/N' not in df_file.columns:
                df_file[['VACCINATION_STATUS_Y/N', 'DOSAGE_C/NC']] = [pd.NA, pd.NA]
            df_data = rename_cols_subset_df(df_file, cols_of_interest)
            df_raw = df_raw.append(df_data)
            count += 1
        except:
            if KeyError:
                print(f'KeyError: {file}')
                raise
                print(f'KeyError: {file}')
            if ValueError:
                print(f'ValueError: {file}')
#                 raise
        finally: pass
    pass
print(f'Files processed: {count}')
print(f'Files in the folder: {len(files)}')

  warn(msg)


Files processed: 329
Files in the folder: 329


In [23]:
# df_raw.head()

### **Clean Headers**

In [24]:
cols_rename = ['CASE_ID', 'S_NUM', 'AGE', 'AGE_UNIT', 'GEND', 'OCCU', 'NAT', 'COUNT_RES', 'S_COUNT_RES', 'TRAV_HIST', 
                    'TRAV_FROM', 'QUAR_PLACE', 'SYMPS', 'SAMP_TYPE', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'RESULT', 'DT_CONF', 'VACC_STATUS', 'VACC_DOS']

In [25]:
# df_raw.columns = cols_rename

### **Clean Sample Numbers**

In [26]:
def cov_names(x):
    x = x.upper().replace(' ', '')
    if len(x) == 8:
        x = x.replace('COVC', 'COVC0')
    elif len(x) == 7:
        x = x.replace('COVC', 'COVC00')
    elif len(x) == 6:
        x = x.replace('COVC', 'COVC000')
    elif len(x) == 5:
        x = x.replace('COVC', 'COVC0000')
    return x

In [27]:
cols_of_interest2 = ['CASE_ID', 'S_NUM', 'NAME', 'AGE', 'AGE_UNIT', 'GEND', 'OCCU', 'NAT', 'COUNT_RES', 'S_COUNT_RES', 'TRAV_HIST', 
                    'TRAV_FROM', 'QUAR_PLACE', 'SYMPS', 'SAMP_TYPE', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'RESULT', 'DT_CONF', 'VACC_STATUS', 'VACC_DOS']

In [28]:
df_raw.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged.xlsx", 
              index=False, header=cols_of_interest2)

In [29]:
df_mrg_mtdata = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged.xlsx", usecols=cols_rename)

### **Clean Sample Name**

In [30]:
df_name = df_mrg_mtdata.assign(S_NUM=df_mrg_mtdata['S_NUM'].apply(lambda x: cov_names(x)))

In [31]:
#df_name[df_name['S_NUM'].str.len() < 8]

In [32]:
#df_name[df_name['S_NUM'].str.contains('COVC') == False]

### **Clean Age**

In [33]:
#df_name[df_name['AGE_UNIT'] == 'Days']

In [34]:
df_months = df_name[df_name['AGE_UNIT'] == 'Months']

In [35]:
#df_months.head()

In [36]:
df_mn2yrs = df_months.assign(AGE=df_months['AGE'].map(lambda x: round(x / 12, 1))).replace('Months', 'Years')

In [37]:
#df_mn2yrs.head()

In [38]:
#df_name[df_name['AGE_UNIT'] == 'Days']#.AGE_UNIT.unique()

In [39]:
df_not_months = df_name[df_name['AGE_UNIT'] != 'Months']

In [40]:
df_not_months.shape

(28461, 20)

In [41]:
df_mn2yrs.shape

(77, 20)

In [42]:
df_years = df_not_months.append(df_mn2yrs)

In [43]:
df_years.shape

(28538, 20)

In [44]:
df_years['AGE_UNIT'].unique()

array(['Years', nan, 'YEARS', 'years', 'Nil', 'Days'], dtype=object)

In [45]:
#df_years[df_years['AGE_UNIT'] == 'NO PATIENT DATASHEET']

In [46]:
#df_years[df_years['AGE_UNIT'].str.contains('M', 'F') == True]

In [47]:
#df_years[df_years['AGE_UNIT'] == 'Nil']

In [48]:
df_years_drop_age_unit = df_years.drop('AGE_UNIT', axis=1)

In [49]:
df_years_rencol = df_years_drop_age_unit.rename(columns={'AGE': 'AGE_YRS'})

In [50]:
#df_years_rencol.head()

In [51]:
df_years_rencol['AGE_YRS'].unique()

array([38, 48, 34, 29, 46, 33, 36, 52, 50, 32, 45, 55, 22, 42, 37, 26, 18,
       53, nan, 28, 41, 67, 89, 72, 63, 76, 59, 58, 82, 57, 65, 49, 14,
       27, 35, 54, 61, 25, 23, 40, 7, 43, 64, 31, 24, 39, 20, 68, 30, 13,
       51, 21, 47, 17, 73, 44, 8, 19, 66, 16, 60, 10, 9, 6, 56, 15, 5, 12,
       84, 62, 312, 69, 79, 70, 2, 86, 80, 11, 88, 3, 4, 74, 87, 3.6, 1,
       1.5, 2.42, 'None indicated', 6.5, 2.5, 75, 97, 'Nil', 71, 96, 77,
       'Not indicated', 743, 78, 'None', 9.5, 83, 'nil', 81, 98,
       'none indicated', 90, 85, 91, 100, 101, 92, 'not_indicated', 5.5,
       'Not Indicated', '37', '36', '40', '38', '29', 94, 105, 0.5,
       'not Indicated', '30', '44', '33', '27', '26', '51', '28',
       'None Indicated', 542, 0.8, 0.9, 0.1, 1.6, 1.1, 2.9, 3.7, 0.2, 1.7,
       0.3, 2.8, 0.7, 1.2, 2.1, 0.4, 4.5, 1.8, 0.6], dtype=object)

In [52]:
[x for x in df_years_rencol['AGE_YRS'].unique() if isinstance(x, str)]

['None indicated',
 'Nil',
 'Not indicated',
 'None',
 'nil',
 'none indicated',
 'not_indicated',
 'Not Indicated',
 '37',
 '36',
 '40',
 '38',
 '29',
 'not Indicated',
 '30',
 '44',
 '33',
 '27',
 '26',
 '51',
 '28',
 'None Indicated']

In [53]:
to_rep = ['None indicated',
 'Nil',
 'Not indicated',
 'None',
 'nil',
 'none indicated',
 'not_indicated',
 'Not Indicated',
 'not Indicated',
 'None Indicated',
 'NO PATIENT DATASHEET'
]

In [54]:
df_years_repnan2float = df_years_rencol.assign(AGE_YRS=df_years_rencol['AGE_YRS'].replace(to_rep, np.nan).astype(float))

In [55]:
#df_years_repnan2float

### **Clean Gender**

In [56]:
df_years_repnan2float['GEND'].unique()

array(['M', 'F', nan, 'f', 'Female', 'Male', 'F`', 'MALE ', 'MALE', 'Nil',
       'T', 'm'], dtype=object)

In [57]:
#df_years_repnan2float[df_years_repnan2float['GEND'].isna()].sort_values('S_NUM')

In [58]:
#df_years_repnan2float[df_years_repnan2float['GEND'] == 'Nil']

COVEs:
- COVC13844
- COVC13845
- COVC13846
- COVC13847
- COVC13848
- COVC13849
- COVC13850
- COVC13851
- COVC13852
- COVC13853
- COVC16256
- COVC16257

In [59]:
df_years_repgender = df_years_repnan2float.assign(GEND=df_years_repnan2float['GEND'].replace(['f', 'Female', 'F`', 'T'], 'F')
                                                 .replace(['Male', 'MALE ', 'MALE'], 'M'))

In [60]:
df_years_repgender['GEND'].unique()

array(['M', 'F', nan, 'Nil', 'm'], dtype=object)

In [61]:
df_years_repgender['GEND'].replace('Nil', pd.NA, inplace=True)

In [62]:
df_years_repgender['GEND'].unique()

array(['M', 'F', nan, <NA>, 'm'], dtype=object)

### **Clean Occupation**

In [63]:
df_years_repgender['OCCU'].unique()

array(['Business', nan, 'Banker', ..., 'Logistician', 'Hotel',
       'Child accompanying inmate mothe'], dtype=object)

In [64]:
df_years_repgender['OCCU'] = df_years_repgender['OCCU'].replace(['Nil','nil'], pd.NA)

In [65]:
# series = pd.Series(df_years_repgender['OCCU'].str.strip().str.lower().unique()).to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-occupations.xlsx", index=False)
# series

### **Clean Nationality**

In [66]:
df_years_repgender['NAT'].unique()

array(['Zimbabwe', 'Kenya', 'Tanzania', 'South Africa',
       'United Kingdom (UK)', 'Botswana',
       'United States of America (USA)', 'Uganda', 'Netherlands',
       'Congo, Democratic Republic of the', 'Burundi', 'Kenyan',
       'South Sudan', nan, 'India', 'Italy', 'China', 'Cameroon',
       'Ethiopia', 'Somalia', 'Canada', 'Zambia', 'Germany', 'Sudan',
       'Lebanon', 'Sweden', 'Nigeria', 'Benin', 'Cuba', 'Indian', 'Ghana',
       'Rwanda', 'Mauritania', 'Denmark', 'Pakistan', 'Brazil',
       'Tanzanian', 'Australia', 'Colombia', 'Hungary', 'Djibouti',
       'Finland', 'Senegal', 'kenya', 'Yemen', 'Turkey', 'Philippines',
       'Japan', 'Ireland', 'Tunisia', 'Switzerland', 'Togo',
       'Burkina Faso', 'Portugal', 'Bangladesh', 'Madagascar',
       "Cote d'Ivoire", 'Egypt', 'KENYAN', 'Greece', 'Bulgaria', 'Nil',
       'Mozambique', 'Mali', 'Eritrea', 'Austria', 'Malawi', 'France',
       'Namibia', 'Congo, Republic of the', 'Spain', 'Sri Lanka',
       'United States',

In [67]:
df_years_repnat = df_years_repgender.assign(NAT=df_years_repgender['NAT'].replace('Nil', 'Unknown')
                                            .replace(['United States', 'Indian'
                                                      , 'Congo, Democratic Republic of the'
                                                     , 'Congo, Republic of the'] 
                                                     , ['United States of America (USA)','India'
                                                        , 'Democratic Republic of the Congo',
                                                       'Republic of the Congo'])
                                            .map(lambda x: str(x).replace(x, 'Kenya') if ('enya' in str(x).lower()) else x))

In [68]:
df_years_repnat['NAT'].unique()

array(['Zimbabwe', 'Kenya', 'Tanzania', 'South Africa',
       'United Kingdom (UK)', 'Botswana',
       'United States of America (USA)', 'Uganda', 'Netherlands',
       'Democratic Republic of the Congo', 'Burundi', 'South Sudan', nan,
       'India', 'Italy', 'China', 'Cameroon', 'Ethiopia', 'Somalia',
       'Canada', 'Zambia', 'Germany', 'Sudan', 'Lebanon', 'Sweden',
       'Nigeria', 'Benin', 'Cuba', 'Ghana', 'Rwanda', 'Mauritania',
       'Denmark', 'Pakistan', 'Brazil', 'Tanzanian', 'Australia',
       'Colombia', 'Hungary', 'Djibouti', 'Finland', 'Senegal', 'Yemen',
       'Turkey', 'Philippines', 'Japan', 'Ireland', 'Tunisia',
       'Switzerland', 'Togo', 'Burkina Faso', 'Portugal', 'Bangladesh',
       'Madagascar', "Cote d'Ivoire", 'Egypt', 'Greece', 'Bulgaria',
       'Unknown', 'Mozambique', 'Mali', 'Eritrea', 'Austria', 'Malawi',
       'France', 'Namibia', 'Republic of the Congo', 'Spain', 'Sri Lanka',
       'Jordan'], dtype=object)

### **Clean County of Res**

In [69]:
#df_years_repnat.head()

In [70]:
counties = (df_years_repnat['COUNT_RES']
                   .replace(['Nil', 'nan', '', 'Nan', 'Muranga ', 'Murang`a ', 'Niarobi']
                            , [pd.NA, pd.NA, pd.NA, pd.NA, "Murang'a", "Murang'a", 'Nairobi'])
                   .apply(lambda x: str(x).strip().capitalize() if (isinstance(x, str)) else x)).fillna(pd.NA)

In [71]:
df_years_county = (df_years_repnat.assign(COUNT_RES=counties))#.assign(S_COUNT_RES=s_counties)

In [72]:
df_years_county['COUNT_RES'].unique()

array(['Nairobi', 'Kirinyaga', 'Kakamega', 'Vihiga', 'Siaya', 'Kajiado',
       'Mombasa', 'Migori', <NA>, 'Nakuru', 'Kiambu', 'Bungoma',
       'West pokot', 'Trans nzoia', "Murang'a", 'Busia', 'Kisii',
       'Kilifi', 'Machakos', 'Laikipia', 'Marsabit', 'Makueni', 'Isiolo',
       'Mwanza', 'Zambia', 'Burundi', 'Bukoba', 'Tanzania', 'Nyamira',
       'Garissa', 'Homabay', 'Samburu', 'Nyeri', 'Tana river', 'Kitui',
       'Wajir', 'Nyandarua', 'Meru', 'Uasin gishu', 'Nandi', 'Kwale',
       'Uganda', 'Mandera', 'Kampala', 'Kisumu', 'Badiere', 'Embu',
       'Kigali', 'Mukono/uganda', 'Congo', 'Narok', 'Kericho',
       'Elgeyo marakwet', 'Taita taveta', 'Bomet', 'Bukavu', 'Thika',
       'Tharakanithi'], dtype=object)

### **Clean Subcounty of Res**

In [73]:
df_years_repnat['S_COUNT_RES'].str.lower().nunique()

976

In [74]:
len((df_years_repnat['S_COUNT_RES'].fillna('NA')
     .replace(['Nil', 'Nil ', 'Muranga ', 'Murang`a ']
                            , [pd.NA, pd.NA, "Murang'a", "Murang'a"])
                   .apply(lambda x: str(x).strip().capitalize())
     .unique()))

898

In [75]:
s_counties = (df_years_repnat['S_COUNT_RES']
     .replace(['Nil', 'Nil ', 'Muranga ', 'Murang`a ']
                            , [pd.NA, pd.NA, "Murang'a", "Murang'a"])
                   .apply(lambda x: str(x).replace(str(x), 'NA') if ('indicate' in str(x)) else str(x).strip().capitalize())
              .fillna(pd.NA))

In [76]:
# sorted([s for s in s_counties.unique()])

In [77]:
df_years_scounty = df_years_county.assign(S_COUNT_RES=s_counties)

### **Clean Travel History**

In [78]:
#df_years_scounty.head()

In [79]:
df_years_scounty['TRAV_HIST'].unique()

array(['No', 'Yes', nan, 'Y', 'N', 'JH', 'no', 'NO ', 'Nil', 'YES',
       'NONE ', 'NONE', 'y', 'yes', 15.93], dtype=object)

In [80]:
df_travl = df_years_scounty.assign(TRAV_HIST=df_years_scounty['TRAV_HIST'].replace([pd.NA, 'N', 'no', 'NO', 'Nil', 'NONE ', 'NONE'], 'No')
           .replace(['YES', 'Y', 'y', 'yes'], 'Yes'))


In [81]:
df_travl['TRAV_FROM'].unique()

array([nan, 'South Africa', 'KITUI;NAIROBI', 'KISUMU', 'UGANDA',
       'KISUMU;KITUI', 'NAIROBI', 'NAKURU;TURKANA', 'SAUDI ARABIA',
       'ITEN-KENDUR', 'BUNGOMA', 'EMBU', 'cuba', 'ELDORET', 'TANZANIA',
       'Kisii', 'Nyamira', 'Nairobi',
       'Kendu-Bay;Kisii;Kisumu;Homabay;Oyugis', 'Mumias, Oyugis, Homabay',
       'NAIVASHA', 'NAIROBI;NYERI;NANYUKI', 'NIL', 'Ruaka', 'Ngoingwa',
       'Limuru', 'Thika', 'Embakasi', 'Uganda', 'Dubai', 'Nil',
       'Not indicated on form ', 'TORORO UGANDA', 'AWASI', 'KIGALI',
       'ENTEBE', 'JINJA', 'KAMPALA', 'KASESE', 'KILIFI COUNTY',
       'KWALE COUNTY', 'Mwanza', 'Mombasa', 'Geita', 'Isebania',
       'Kilimanjaro', 'Migori', 'KENDU BAY', 'KISUMU, KITALE', 'HOMABAY',
       'KISII, KISUMU', 'HOMABAY, KISII', 'NYAHURURU; NANYUKI', 'TORORO',
       'KAMPALA UGANDA', 'RWANDA', 'JINJA UGANDA', 'KAJIADO',
       'NOT APPLICABLE ', 'SOUTH SUDAN ', 'NYERI', 'NAIROBI,KISUMU,AWASI',
       'OYUGIS', 'KWALE;KILIFI', 'KAKAMEGA', 'KWALE', 'Eldoret'

In [82]:
df_travl2 = (df_travl.assign(TRAV_FROM=df_travl['TRAV_FROM'].replace(['nan','NIL', 'Nil', 'Not indicated on form '
                  , 'NOT APPLICABLE ', 'N', 'N', 'No', 'not_indicated', 'Not applicable', 'Not indicated'],pd.NA)
                    .replace(['Mombasa (stayed in mtwapa for 4 days)', 'KINANGO MSAMBWENI', 'Nan'
                              , 'Mombasa (Stayed in Mtwapa for 4 days)'], 
                    ['Mtwapa', 'Kinangop;Msambweni', pd.NA, 'Mtwapa'])
                    .apply(lambda x: str(x).replace('/', ';')
                    .replace(' and ', ';')
                    .replace('-', ';')
                    .replace(',', ';')
                    .replace(' ; ', ';')
                    .replace('; ', ';')
                    .strip().capitalize() if (isinstance(x, str)) else x)))

In [83]:
df_travl2['TRAV_FROM'].unique()



array([nan, 'South africa', 'Kitui;nairobi', 'Kisumu', 'Uganda',
       'Kisumu;kitui', 'Nairobi', 'Nakuru;turkana', 'Saudi arabia',
       'Iten;kendur', 'Bungoma', 'Embu', 'Cuba', 'Eldoret', 'Tanzania',
       'Kisii', 'Nyamira', 'Kendu;bay;kisii;kisumu;homabay;oyugis',
       'Mumias;oyugis;homabay', 'Naivasha', 'Nairobi;nyeri;nanyuki', <NA>,
       'Ruaka', 'Ngoingwa', 'Limuru', 'Thika', 'Embakasi', 'Dubai',
       'Tororo uganda', 'Awasi', 'Kigali', 'Entebe', 'Jinja', 'Kampala',
       'Kasese', 'Kilifi county', 'Kwale county', 'Mwanza', 'Mombasa',
       'Geita', 'Isebania', 'Kilimanjaro', 'Migori', 'Kendu bay',
       'Kisumu;kitale', 'Homabay', 'Kisii;kisumu', 'Homabay;kisii',
       'Nyahururu;nanyuki', 'Tororo', 'Kampala uganda', 'Rwanda',
       'Jinja uganda', 'Kajiado', 'South sudan', 'Nyeri',
       'Nairobi;kisumu;awasi', 'Oyugis', 'Kwale;kilifi', 'Kakamega',
       'Kwale', 'Malindi', 'Uasin gishu', 'Machakos', 'Kilifi;nairobi',
       'Nanyuki sagana', 'Tharaka nithi',

In [84]:
#df_travl2.head()

### **Clean Symptoms**

Symptoms legend
- GW: General (body) weakness/fatigue/malaise
- FC: Fever/Chills
- CO: Cough
- ST: Sore throat
- RN: Runny nose
- SB: Shortness of breath/Difficulty in breathing
- D: Diarrhoea
- NV: Nausea/Vomiting
- H: Headache
- IC: Irritability/Confusion
- P: Pain
- LS: Loss of smell
- LT: Loss of taste/appetite
- P-M: Muscular pain
- P-A: Abdominal pain
- P-B: Back pain
- P-C: Chest pain
- P-J: Joint pain
- BA: Body aches
- SZ: Sneezing
- HP: Hypertension
- TN: Tonsillitis
- DZ: Dizziness

In [85]:
df_travl2['SYMPS'] = (df_travl2['SYMPS'].apply(lambda x: str(x).replace(str(x), 'NA') if ('symtom' in str(x)) else str(x).replace('F/C', 'FC'))
                      .replace(['[=]','nan'], 'NA'))
pd.Series(df_travl2['SYMPS']).to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-symptoms.xlsx", index=False)
df_symp = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-symptoms_curation.xlsx")
symp_dict = {str(k): str(v) for k,v in zip(df_symp['STRING'], df_symp['REPLACE'])}

df_symps = df_travl2.assign(SYMPS=df_travl2['SYMPS'].apply(lambda x: replace(str(x), symp_dict)))

### **Clean Sample Type**

In [86]:
df_symps.SAMP_TYPE.unique()
df_samp_ty = df_symps.assign(SAMP_TYPE=df_symps['SAMP_TYPE'].replace(['NP and OP', 'NP-OP +AB2:AL2Swab',
       'NP-OP Swab', 'nP Swab', 'NP OP Swab', 'NP&OP'], ['NP & OP Swab', 'NP & OP Swab', 'NP & OP Swab', 'NP Swab', 'NP & OP Swab', 'NP & OP Swab']))

### **Clean Dates**

In [87]:
df_dt1 = df_samp_ty.assign(DT_SAM_COLL=df_samp_ty['DT_SAM_COLL'].apply(lambda x: my_date_parser(x)))

In [88]:
df_dt2 = df_dt1.assign(DT_SAM_RECEP=df_dt1['DT_SAM_RECEP'].apply(lambda x: my_date_parser(x)))

In [89]:
df_dts_cln = df_dt2.assign(DT_CONF=df_dt2['DT_CONF'].apply(lambda x: my_date_parser(x)))

In [90]:
#df_dts_cln.head()

### **Clean Results**

In [91]:
df_dts_cln['RESULT'].unique()

array([nan, 'Positive', 'POSITIVE', 'Ag Positive', 'Negative', 'positive',
       'neg', 'pos', 'ms', 'negative', 'Neg', 'Pos', 'MS', 'Positve',
       'Positive ', 'M2000'], dtype=object)

In [92]:
df_dts_cln['RESULT'].map(lambda x: x.strip().lower() if (isinstance(x, str)) else x).unique()

array([nan, 'positive', 'ag positive', 'negative', 'neg', 'pos', 'ms',
       'positve', 'm2000'], dtype=object)

In [93]:
def cln_results(x):
    if isinstance(x, str):
        x = x.strip().lower()
        if 'neg' in x:
            x = x.replace(x, 'Negative')
        elif 'pos' in x:
            x = x.replace(x, 'Positive')
        elif x == 'inconclusive':
            x = x.replace(x, 'Negative')
    return x
        

In [94]:
df_dts_cln['RESULT'] = df_dts_cln['RESULT'].map(lambda x: cln_results(x))

In [95]:
df_dts_cln['RESULT'].unique()

array([nan, 'Positive', 'Negative', 'ms', 'm2000'], dtype=object)

In [96]:
#df_dts_cln.head(2)

### **Remove Duplicates**

In [97]:
df_cln_srt = df_dts_cln.sort_values(['S_NUM', 'DT_CONF'])

In [98]:
#df_cln_srt[df_cln_srt.duplicated(['S_NUM', 'DT_CONF'], keep=False) == True]

In [99]:
#df_cln_srt[df_cln_srt.duplicated(keep=False) == True]

In [100]:
df_dedud = df_cln_srt.drop_duplicates(['S_NUM', 'DT_CONF'], keep='first')#['COUNT_RES'].fillna('Unknown', inplace=True)

### **QC Dates**

In [101]:
sr_date_diff = df_dedud['DT_CONF'] - df_dedud['DT_SAM_COLL']

In [102]:
mask1 = sr_date_diff >= '60D'
mask2 = sr_date_diff != pd.NaT
mask3 = df_dedud['DT_SAM_COLL'] > df_dedud['DT_CONF']
# mask3 = df_dedud['DT_CONF']

In [103]:
df_dedud[mask1 == True]

Unnamed: 0,CASE_ID,S_NUM,AGE_YRS,GEND,OCCU,NAT,COUNT_RES,S_COUNT_RES,TRAV_HIST,TRAV_FROM,QUAR_PLACE,SYMPS,SAMP_TYPE,DT_SAM_COLL,DT_SAM_RECEP,RESULT,DT_CONF,VACC_STATUS,VACC_DOS
11152,047/NTRH/005,COVC06893,50.0,F,Health Care Worker,Kenya,Nairobi,Mathare,No,,None Indicated,,NP Swab,2020-05-10,NaT,Negative,2020-08-12,,
11440,NMS/EM/100-240,COVC09826,30.0,M,PLUMBER,Kenya,Kwale,Samburu,No,,,,NP Swab,2020-10-09,2020-11-09,Negative,2020-12-09,,
11441,NMS/EM/100-241,COVC09827,39.0,F,BUSSINESS,Kenya,Kiambu,Kabete,No,,,,NP Swab,2020-10-09,2020-11-09,Negative,2020-12-09,,
11442,NMS/EM/100-517,COVC09828,24.0,M,BUSSINESS,Kenya,Nairobi,Iindustrial area,No,,,,NP Swab,2020-10-09,2020-11-09,Negative,2020-12-09,,
11443,NMS/EM/100-516,COVC09829,25.0,M,BUSSINESS,Kenya,Nairobi,Kariobangi south,No,,,,NP Swab,2020-10-09,2020-11-09,Negative,2020-12-09,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,KCGH/4268/2021,COVM01663,82.0,M,Farmer,Kenya,Kakamega,Lurambi,No,,,CO;FC;P-C;LS;LT,NP Swab,2021-09-29,2021-12-08,Positive,2021-12-09,N,NC
44,KCGH/4293/2021,COVM01664,82.0,F,Farmer,Kenya,Vihiga,Hamisi,No,,,FC;CO;LS;LT,NP Swab,2021-09-30,2021-12-08,Positive,2021-12-09,N,NC
45,KCGH/4337/2021,COVM01665,57.0,F,HCW,Kenya,Kakamega,Lurambi,No,,,FC;CO;H;P-J;LS;LT,NP Swab,2021-10-04,2021-12-08,Positive,2021-12-09,N,NC
46,KCGH/4440/2021,COVM01666,65.0,M,Farmer,Kenya,Kakamega,Lurambi,No,,,CO;P-C;LS;LT,NP Swab,2021-10-07,2021-12-08,Positive,2021-12-09,N,NC


### **Export Cleaned Data**

In [104]:
df_dedud.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-cln.xlsx", index=False, na_rep='NA')

In [105]:
df_dedud[df_dedud['RESULT'] == 'Positive'].to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-cln-pos.xlsx", index=False, na_rep='NA')

In [106]:
#df_dedud[df_dedud['S_NUM'].str.contains('COVC') == False]

In [107]:
#df_dedud[df_dedud['S_NUM'].str.contains('COVC') == False]

df_symps11-09-2020

COVC9774
COVC9775
COVE20
COVE21
COVC9776
COVC9777


### **ID Misssing Data**

In [108]:
start = 0
df_missing = pd.DataFrame(columns=cols_rename)
for num in df_dedud['S_NUM'].sort_values():
#     curr_row = df_dts_cln[df_dts_cln['S_NUM']]
    try:
        curr = int(num.lstrip('COVC').lstrip('0'))
        
        if curr - start > 1:
            start_id = f'COVC{start}'
            cov_id = f'COVC{curr}'
            df_missing = df_missing.append(df_dts_cln[df_dts_cln['S_NUM'] == cov_names(start_id)])
            df_missing = df_missing.append(df_dts_cln[df_dts_cln['S_NUM'] == cov_names(cov_id)])
        start = curr
    except ValueError:
#         print(f'ValueError: {num}')
        pass

In [109]:
#df_missing.head()

In [110]:
(df_missing[['S_NUM', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'DT_CONF']]
 .sort_values(['S_NUM', 'DT_CONF'])
 .drop_duplicates('S_NUM', keep='last')
 .to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-missing.xlsx", index=False, na_rep='NA'))

### **ID Missing Results**

In [111]:
(df_dts_cln[df_dts_cln['RESULT'].isin(['Positive', 'Negative']) == 
            False][['S_NUM', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'DT_CONF', 'RESULT']]
.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-results-merged-results-missing.xlsx", index=False, na_rep='NA'))

In [112]:
# plt.style.available

### **Metadata + Cts**

In [113]:
df_Cts = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/runsheet-cts.xlsx")[['Sample_Name', 'Well_Position', 'Ct_Mean', 'Dt_Run']]

In [114]:
df_metCts = df_dedud.set_index('S_NUM').merge(df_Cts.set_index('Sample_Name'), how='left', left_index=True, right_index=True).reset_index().rename(columns={'index': 'S_NUM', 'Ct_Mean': 'AVG_Ct'})

In [115]:
#df_metCts.head()

In [116]:
df_metCts.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-resultsCts-merged-cln.xlsx", index=False, na_rep='NA')

In [117]:
df_metCts.shape

(28132, 22)

In [118]:
df_metCts[df_metCts['RESULT'] == 'Positive'].to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/COVID19-resultsCts-merged-cln-pos.xlsx", index=False)

#### **Confirm updated file in the following cell**

In [119]:
df_sequenced_samples = pd.read_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-sequenced-samples-IDs_01-11-2021.xlsx")
seqd_list = list(df_sequenced_samples['SAMPLE'])

In [120]:
df_dedud_pos = df_metCts[df_metCts['RESULT'] == 'Positive']
df_dedud_pos_unseq = df_dedud_pos[df_dedud_pos['S_NUM'].isin(seqd_list) == False]
df_dedud_pos_unseq.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-unsequenced-pos-samples_{dt}.xlsx", index=False, na_rep='NA')

In [121]:
#df_dedud_pos_unseq

In [122]:
df_dedud_pos = df_metCts[df_metCts['RESULT'] == 'Positive']
df_dedud_pos_seq = df_dedud_pos[df_dedud_pos['S_NUM'].isin(seqd_list) == True]
df_dedud_pos_seq.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-sequenced-pos-samples_{dt}.xlsx", index=False, na_rep='NA')

### Positives 2021 - Unsequenced

In [123]:
df_dedud_pos_unseq_21 = df_dedud_pos_unseq[df_dedud_pos_unseq['DT_CONF'] >= pd.to_datetime('2021-01-01 00:00:00')]
df_dedud_pos_unseq_21.to_excel(f"{glob.glob(f'{parent_dir}/**/Outputs', recursive=True)[0]}/all-unsequenced-pos-samples-2021_{dt}.xlsx", index=False, na_rep='NA')