In [1]:
with open('config/pg_conn.txt') as conn_details:
    conn_str_psyco = conn_details.readline()
    conn_str_sqlalchemy = conn_details.readline()

In [40]:
import sys
sys.executable

'/mnt/c/Users/harry/Documents/github/DHS-DataExtraction/dhs_env/bin/python3'

In [41]:
import pandas as pd
import pandas.io.sql as psql
import psycopg2 as pg
from sqlalchemy import create_engine
import sqlalchemy as sa
import os
import glob

In [2]:
SPEC_SCHEMA = 'dhs_survey_specs'
DATA_SCHEMA = 'dhs_data_tables'

TABLESPEC_TABLENAME = 'table_specs_blank' #'dhs_table_specs_flat'
VALUESPEC_TABLENAME = 'value_specs_blank' #'dhs_value_descs'
SURVEYLIST_TABLENAME = 'dhs_survey_listing_backup'

TABLE_SPEC_TABLE = ".".join((SPEC_SCHEMA, TABLESPEC_TABLENAME))
VALUE_SPEC_TABLE = ".".join((SPEC_SCHEMA, VALUESPEC_TABLENAME))
SURVEYLIST_TABLE = ".".join((SPEC_SCHEMA, SURVEYLIST_TABLENAME))

STAGING_FOLDER = "/mnt/d/InformalCities/DHS_Updates/staging"

In [43]:
engine = create_engine(conn_str_sqlalchemy)

# Part 1 - refresh DB cache of available surveys

Load the list of all surveys from the DHS API. Note that in the past, this has been paginated, necessitating multiple calls to load it all. At present this isn't the case. As at 2021-01-08 there are 444 rows returned, so if you note less than this then it's likely pagination has been re-implemented.

In [33]:
api_survey_listing = pd.read_csv('http://api.dhsprogram.com/rest/dhs/surveys?f=csv&surveyStatus=all&perpage=2000')
api_survey_listing.columns = api_survey_listing.columns.str.lower()
api_survey_listing.head()

Unnamed: 0,surveyid,dhs_countrycode,surveyyear,surveytype,surveystatus,countryname,surveyyearlabel,surveynum,indicatordata,regionname,...,numberofhouseholds,universeofwomen,numberofwomen,minagewomen,maxagewomen,universeofmen,numberofmen,minagemen,maxagemen,numberoffacilities
0,AF2010OTH,AF,2010,OTH,Completed,Afghanistan,2010,348,0,South & Southeast Asia,...,22351.0,Ever Married Women,47848.0,12.0,49.0,,,,,
1,AF2015DHS,AF,2015,DHS,Completed,Afghanistan,2015,471,1,South & Southeast Asia,...,24395.0,Ever Married Women,29461.0,15.0,49.0,Ever Married Men,10760.0,15.0,49.0,
2,AF2018SPA,AF,2018,SPA,Completed,Afghanistan,2018-19,543,0,South & Southeast Asia,...,,,,,,,,,,142.0
3,AF2020DHS,AF,2020,DHS,Ongoing,Afghanistan,2020,568,0,South & Southeast Asia,...,33120.0,Ever Married Women,37146.0,15.0,49.0,Ever Married Men,13672.0,15.0,49.0,
4,AL2008DHS,AL,2008,DHS,Completed,Albania,2008-09,327,1,North Africa/West Asia/Europe,...,7999.0,All Women,7584.0,15.0,49.0,All Men,3013.0,15.0,49.0,


In [31]:
our_survey_listing = pd.read_sql(f'SELECT * from {SURVEYLIST_TABLE}', con=engine)

See if there's any in the API results that aren't in our DB copy of the table

In [32]:
api_survey_listing[~api_survey_listing['surveynum'].isin(our_survey_listing['surveynum'])]


Unnamed: 0,surveyid,dhs_countrycode,surveyyear,surveytype,surveystatus,countryname,surveyyearlabel,surveynum,indicatordata,regionname,...,numberofhouseholds,universeofwomen,numberofwomen,minagewomen,maxagewomen,universeofmen,numberofmen,minagemen,maxagemen,numberoffacilities


If there's more in the API than our copy of the table, just drop and reload the whole table.

In [30]:
if len(api_survey_listing) > len(our_survey_listing):
    api_survey_listing.to_sql(name=SURVEYLIST_TABLENAME, con=engine, schema=SPEC_SCHEMA, 
                     index=False, if_exists='replace', method='multi')

# Part 2 - Update the database survey metadata tables

The database has one metadata table which records all the survey data tables and what columns they have (per survey). This contains the values from the parsed "FlatRecordSpec" csv files.

Another metadata table records all the values that all the survey data table columns can take, and (in the case of coded values) what the numeric values mean. This contains the values from the parsed "FlatValuesSpec" files.

For every CSV FlatRecordSpec and FlatValuesSpec file, we need to check if it's in the DB table at all (and load if not). If it is already there, then check if the info in the latest CSV matches what's in the DB. (As e.g. a re-released survey may change the specified width of some columns, or add some new data tables equating to extra metadata rows).

In [3]:
from lib03_Update_Metadata import SurveyMetadataHelper

In [4]:
db_helper = SurveyMetadataHelper(conn_str=conn_str_sqlalchemy, table_spec_table=TABLESPEC_TABLENAME,
                             value_spec_table=VALUESPEC_TABLENAME, spec_schema=SPEC_SCHEMA)

Get a simple list of all the surveys that appear in the database metadata tables at all

In [5]:
existing_table_svys = db_helper.get_existing_table_surveys()
existing_value_svys = db_helper.get_existing_value_surveys()

In [6]:
existing_table_svys['surveyid'].values[:10]

array(['462', '497', '521', '383', '13', '93', '391', '94', '34', '115'],
      dtype=object)

Get a listing of all the up-to-date CSV table and value specification files we've parsed

In [36]:
spec_csv_dir = os.path.join(STAGING_FOLDER, 'parsed_specs')

tbl_files = glob.glob(os.path.join(spec_csv_dir, '*.FlatRecordSpec.csv'))
val_files = glob.glob(os.path.join(spec_csv_dir, '*.FlatValuesSpec.csv'))


In [11]:
tbl_files[:5]

['/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/1.ESIR01.FlatRecordSpec.csv',
 '/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/100.VNIR31.FlatRecordSpec.csv',
 '/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/101.CMIR31.FlatRecordSpec.csv',
 '/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/101.CMMR31.FlatRecordSpec.csv',
 '/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/105.MZIR31.FlatRecordSpec.csv']

Each survey may have more than one file. This is normal if one is "individual recode" and another is "men's recode", but if there's some other reason then we need to investigate.

Create a list of all the files for each (numerically-identified) survey id

In [None]:
tbl_files_dict = build_spec_files_dict(tbl_files)
val_files_dict = build_spec_files_dict(val_files)

Check we don't have anything we weren't expecting

In [15]:
multiple_locs = set()
multiple_versions_per_filetype = set() # TODO
unknown_filetypes = set()
multiple_files_per_filetype = set()

for surveyid, dict_type_filenames in tbl_files_dict.items():
    survey_loc = None
    survey_version = {'ir':None, 'mr':None}
    types_this_survey = set()
    for filetype, fn_ver_locs in dict_type_filenames.items():
        
        # check we don't have anything except for IR and MR datasets
        if filetype not in ['ir', 'mr']:
            unknown_filetypes.add(surveyid)
        
        # check we don't have more than one IR or MR per survey
        # if we do, then it's a new version
        if len(fn_ver_locs)>1:
            multiple_files_per_filetype.add(surveyid)
        for fn, new_version, new_loc in fn_ver_locs:
            # check for location being unique across IR and MR files
            if survey_loc is None:
                survey_loc = new_loc
            elif survey_loc != new_loc:
                multiple_locs.add(surveyid)
        
            # version string is often different between IR and MR files, that's ok, 
            # we want to check for multiple versions per filetype
            if survey_version[filetype] is None:
                survey_version[filetype] = new_version
            elif survey_version[filetype] != new_version:
                multiple_versions_per_filetype.add(surveyid)

        

Check if there are any surveys with filetypes other than IR/MR

In [16]:
unknown_filetypes

set()

We expect that we may have both IR and MR files for a survey. If we have more than two files per (numeric) survey then it's either due to there being different location identifiers as with 156 above, or to some different filetype other than IR/MR, or to there being multiple versions present of the same survey (e.g. if we've re-downloaded and are working in the same folder after an update was released). Check which surveys are affected by this. 

This shows all surveys with more than one file per filetype, could be due to multiple locations OR versions:

In [17]:
multiple_files_per_filetype

{'580'}

This shows those truly with multiple versions per filetype at same location. If so then we will drop and reload the latest one only. (It's also quite common for the IR and MR files to have a different version. Don't worry about this.)

In [18]:
multiple_versions_per_filetype

{'580'}

In the case of one survey, number 156 (India) the download contains a main data file for India nationally (IAIR42), plus one for each state. (These break the convention by which the first two characters of the filename give the country code of the survey, because they are named for the state instead e.g. the file KEIR42 does not refer to Kenya but Kerala. This is one reason why we prepend and use the numeric survey id instead.) We do not need these state-wise files.

Here we check that there are no other surveys affected by this other than 156:

In [19]:
multiple_locs

set()

we only want to parse and load one version per filetype (IR/MR) and per survey.

### Check and load/reload column spec files as necessary

In [42]:
good_tbl_svys = []
drop_tbl_svys = []
new_tbl_svys = []

good_val_svys = []
reload_val_svys = []
new_val_svys = []

for surveyid, dict_type_filenames in tbl_files_dict.items():
    some_only_in_csv = False
    some_only_in_db = False
    
    locset = set()
    has_dupes = False
    for file_type, fn_ver_locs in dict_type_filenames.items():
        for _, _, loc in fn_ver_locs:
            locset.add(loc)
        if len(locset)>1:
            warnings.warn(f"""
            ****** WARNING MULTIPLE LOCATIONS FOUND FOR SURVEYID {surveyid} FILETYPE {file_type}. CANNOT DETERMINE WHICH TO LOAD *****
            ****** PLEASE REMOVE SURPLUS FILES AND RERUN FOR THIS WHOLE SURVEY*****
            """)
            has_dupes = True
    if has_dupes:
        continue
        
    # shortcut other checks, if nothing for this survey is in DB at all then we definitely need to load
    need_to_load_all_data = surveyid not in existing_table_svys.values
    if need_to_load_all_data: # no files or part files from this svy loaded
        for file_type, fn_ver_locs in dict_type_filenames.items():
            # the versions are strings such as '70' or '7A' where the first character 
            # represents the phase (fixed for a survey) and the second the actual version 
            # of this file. Anyway, natural sort order by version is ok.
            latest = sorted(fn_ver_locs, key=lambda tup: tup[1])[-1]
            file_name, file_ver, file_loc = latest
            print(f"No data for survey {surveyid} are present: "+
                  "loading latest data for {file_type.upper()} from {file_name}")
            if len(fn_ver_locs) > 1: print(f"   ({len(fn_ver_locs)} versions were available, loading {file_ver}")
            db_helper.load_new_metadata_file(file_name)
        continue
    
    # metadata for this survey are already present. That doesn't necessarily mean we don't need to 
    # load or reload this file. Check.
    for file_type, fn_ver_locs in dict_type_filenames.items():
        latest = sorted(fn_ver_locs, key=lambda tup: tup[1])[-1]
        file_name, file_ver, file_loc = latest
        
        # Check for the scenario where IR data are in the DB but MR data aren't, etc
        if not db_helper.get_any_in_db_cols(surveyid, file_type):
            print(f"{file_type.upper()} data for survey {surveyid} are not present: loading from {file_name}")
            if len(fn_ver_locs) > 1: print(f"    (of {len(fn_ver_locs)} available versions)")
            db_helper.load_new_metadata_file(file_name)
            continue
            
        db_ver, db_has_single_ver = db_helper.get_db_survey_version_cols(surveyid, file_type)
        if not db_has_single_ver:
            db_helper.drop_and_reload(file_name, 
                                      msg="Multiple versions found in DB, dropping all and loading latest")
        elif file_ver > db_ver:
            # this needs to be done for the data tables too
            db_helper.drop_and_reload(file_name, 
                                      msg=f"Newer version found - DB is version {db_ver} "+
                                      "and file is version {file_ver}")
        else:
            # check that the same rows are present in the CSV and DB
            file_cols_data = pd.read_csv(file_name)
            db_cols_data = db_helper.get_tablespec_rows_for_svy_filetype(surveyid, file_type)
            # do an outer join to find items present in files but not DB and vice versa
            merged_cols = file_cols_data.merge(db_cols_data, how='outer',
                                      left_on=['Name','RecordName','Label','Len'],
                                      right_on=['name','recordname', 'label', 'len'],
                                      suffixes=('_new', '_exist'),
                                      indicator=True)
            cols_all_good = len(merged_cols[merged_cols['_merge']!='both']) == 0
            if not cols_all_good:
                # Most likely reason for having new columns info is that we've added men's recode data,
                # but that won't be the case here as the check included filetype.
                # Reasons to be here would be it's a re-issued survey (same id number) and has added 
                # different columns (in which case there will be some left-only and no right-only ones), 
                # and/or that some column labels have changed (e.g. prefix of NA been added).
                some_only_in_db = len(merged_cols[merged_cols['_merge'] == 'right_only']) > 0
                some_only_in_csv = len(merged_cols[merged_cols['_merge'] == 'left_only']) > 0
                if some_only_in_csv and some_only_in_db:
                    # Not worth the effort to investigate: just drop everything from this survey and reload.
                    db_helper.drop_and_reload(file_name, 
                                              msg="Rows mismatch between file and DB")
                elif some_only_in_csv:
                    # Not worth the effort to investigate: just drop everything from this survey and reload.
                    db_helper.drop_and_reload(file_name, 
                                              msg="Rows present in file but not in DB")
                elif some_only_in_db:
                    # Not worth the effort to investigate: just drop everything from this survey and reload.
                    db_helper.drop_and_reload(file_name, 
                                              msg="Rows present in DB but not in file")
            else:
                pass
                #report_cols_all_good(surveyid, file_type, file_name)
        

### Check and load/reload value spec files as necessary

In [43]:
good_tbl_svys = []
drop_tbl_svys = []
new_tbl_svys = []

good_val_svys = []
reload_val_svys = []
new_val_svys = []

for surveyid, dict_type_filenames in val_files_dict.items():
    #if not surveyid.startswith('1'):
    #    continue
    some_only_in_csv = False
    some_only_in_db = False
    #if surveyid != '16':
     #   continue
    
    locset = set()
    has_dupes = False
    for file_type, fn_ver_locs in dict_type_filenames.items():
        for _, _, loc in fn_ver_locs:
            locset.add(loc)
        if len(locset)>1:
            print(f"""
            ****** WARNING MULTIPLE LOCATIONS FOUND FOR SURVEYID {surveyid} FILETYPE {file_type}. CANNOT DETERMINE WHICH TO LOAD *****
            ****** PLEASE REMOVE SURPLUS FILES AND RERUN FOR THIS WHOLE SURVEY*****
            """)
            has_dupes = True
    if has_dupes:
        continue
        
    # shortcut other checks, if nothing for this survey is in DB at all then we definitely need to load
    need_to_load_all_data = surveyid not in existing_value_svys.values
    if need_to_load_all_data: # no files or part files from this svy loaded
        for file_type, fn_ver_locs in dict_type_filenames.items():
            # the versions are strings such as '70' or '7A' where the first character 
            # represents the phase (fixed for a survey) and the second the actual version 
            # of this file. Anyway, natural sort order by version is ok.
            latest = sorted(fn_ver_locs, key=lambda tup: tup[1])[-1]
            file_name, file_ver, file_loc = latest
            print(f"No data for survey {surveyid} are present: "+
                  "loading latest data for {file_type.upper()} from {file_name}")
            if len(fn_ver_locs) > 1: print(f"   ({len(fn_ver_locs)} versions were available, loading {file_ver}")
            db_helper.load_new_metadata_file(file_name)
        continue
            
    for file_type, fn_ver_locs in dict_type_filenames.items():
        latest = sorted(fn_ver_locs, key=lambda tup: tup[1])[-1]
        file_name, file_ver, file_loc = latest
        
        # Check for the scenario where IR data are in the DB but MR data aren't, etc
        if not get_any_in_db_vals(surveyid, file_type):
            print(f"{file_type.upper()} value-spec data for survey {surveyid} are not present: loading from {file_name}")
            if len(fn_ver_locs) > 1: print(f"    (of {len(fn_ver_locs)} available versions)")
            db_helper.load_new_metadata_file(file_name)
            continue
            
        db_ver, db_has_single_ver = get_db_survey_version_vals(surveyid, file_type)
        if not db_has_single_ver:
            db_helper.drop_and_reload(file_name,
                            msg="Multiple versions found in DB, dropping all and loading latest")
        elif file_ver > db_ver:
            # this needs to be done for the data tables too
            db_helper.drop_and_reload(file_name,
                            msg=f"Newer version found - DB is version {db_ver} "+
                                      "and file is version {file_ver}")
        else:
            # check that the same rows are present in the CSV and DB
            file_vals_data = pd.read_csv(file_name)
            db_vals_data = db_helper.get_valuespec_rows_for_svy_filetype(surveyid, file_type)
            # do an outer join to find items present in files but not DB and vice versa
            merged_vals = file_vals_data.merge(db_vals_data, how='outer',
                                      left_on=['Name','Value','ValueDesc','ValueType'],
                                      right_on=['col_name','value', 'value_desc', 'value_type'],
                                      suffixes=('_new', '_exist'),
                                      indicator=True)
            vals_all_good = len(merged_vals[merged_vals['_merge']!='both']) == 0
            if not vals_all_good:
                # Most likely reason is that we now have men's recode data as well, but did not previously, 
                # but we have checked for that already.
                # Other reasons would be it's a re-issued survey (same id number) and has added different columns 
                # (in which case there will be some left-only and no right-only ones), and/or that some column 
                # labels have changed (e.g. prefix of NA been added).
                some_only_in_db = len(merged_vals[merged_vals['_merge'] == 'right_only']) > 0
                some_only_in_csv = len(merged_vals[merged_vals['_merge'] == 'left_only']) > 0
                if some_only_in_csv and some_only_in_db:
                    # Not worth the effort to investigate: just drop everything from this survey and reload.
                    db_helper.drop_and_reload(file_name, 
                                              msg="Rows mismatch between file and DB")
                elif some_only_in_csv:
                    # Not worth the effort to investigate: just drop everything from this survey and reload.
                    db_helper.drop_and_reload(file_name, 
                                              msg="Rows present in file but not in DB")
                elif some_only_in_db:
                    # Not worth the effort to investigate: just drop everything from this survey and reload.
                    db_helper.drop_and_reload(file_name, 
                                              msg="Rows present in DB but not in file")
                    
            else:
                pass
                #report_vals_all_good(surveyid, file_type, file_name)
        

Survey 1 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/1.ESIR01.FlatValuesSpec.csv) is already completely loaded
Survey 100 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/100.VNIR31.FlatValuesSpec.csv) is already completely loaded
Survey 101 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/101.CMIR31.FlatValuesSpec.csv) is already completely loaded
Survey 101 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/101.CMMR31.FlatValuesSpec.csv) is already completely loaded
Survey 105 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/105.MZIR31.FlatValuesSpec.csv) is already completely loaded
Survey 105 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/105.MZMR31.FlatValuesSpec.csv) is already completely loaded
Survey 106 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/106.CIIR3A.FlatValuesSpec.csv) is already completely loaded
Survey 106

Survey 172 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/172.IDMR42.FlatValuesSpec.csv) is already completely loaded
Survey 174 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/174.mzir41.FlatValuesSpec.csv) is already completely loaded
Survey 174 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/174.mzmr41.FlatValuesSpec.csv) is already completely loaded
Survey 176 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/176.PEIR41.FlatValuesSpec.csv) is already completely loaded
Survey 18 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/18.TTIR01.FlatValuesSpec.csv) is already completely loaded
Survey 184 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/184.ZWIR42.FlatValuesSpec.csv) is already completely loaded
Survey 184 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/184.ZWMR41.FlatValuesSpec.csv) is already completely loaded
Survey 1

Survey 248 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/248.ETIR51.FlatValuesSpec.csv) is already completely loaded
Survey 248 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/248.ETMR51.FlatValuesSpec.csv) is already completely loaded
Survey 249 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/249.GNIR52.FlatValuesSpec.csv) is already completely loaded
Survey 249 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/249.gnmr52.FlatValuesSpec.csv) is already completely loaded
Survey 25 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/25.GHIR02.FlatValuesSpec.csv) is already completely loaded
Survey 251 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/251.MWIR4E.FlatValuesSpec.csv) is already completely loaded
Survey 251 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/251.MWMR4E.FlatValuesSpec.csv) is already completely loaded
Survey 2

Survey 297 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/297.IDIR51.FlatValuesSpec.csv) is already completely loaded
Survey 297 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/297.IDMR51.FlatValuesSpec.csv) is already completely loaded
Survey 298 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/298.PHIR52.FlatValuesSpec.csv) is already completely loaded
Survey 3 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/3.SNIR02.FlatValuesSpec.csv) is already completely loaded
Survey 30 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/30.coir22.FlatValuesSpec.csv) is already completely loaded
Survey 300 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/300.KEIR52.FlatValuesSpec.csv) is already completely loaded
Survey 300 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/300.KEMR52.FlatValuesSpec.csv) is already completely loaded
Survey 301 I

Survey 355 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/355.IAIR74.FlatValuesSpec.csv) is already completely loaded
Survey 355 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/355.IAMR74.FlatValuesSpec.csv) is already completely loaded
Survey 356 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/356.NPIR61.FlatValuesSpec.csv) is already completely loaded
Survey 356 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/356.NPMR61.FlatValuesSpec.csv) is already completely loaded
Survey 357 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/357.IDIR63.FlatValuesSpec.csv) is already completely loaded
Survey 357 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/357.IDMR63.FlatValuesSpec.csv) is already completely loaded
Survey 358 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/358.YEIR61.FlatValuesSpec.csv) is already completely loaded
Survey

Survey 41 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/41.TZIR21.FlatValuesSpec.csv) is already completely loaded
Survey 41 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/41.TZMR21.FlatValuesSpec.csv) is already completely loaded
Survey 419 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/419.PKIR61.FlatValuesSpec.csv) is already completely loaded
Survey 419 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/419.PKMR61.FlatValuesSpec.csv) is already completely loaded
Survey 42 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/42.BFIR21.FlatValuesSpec.csv) is already completely loaded
Survey 42 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/42.BFMR21.FlatValuesSpec.csv) is already completely loaded
Survey 420 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/420.BJIR61.FlatValuesSpec.csv) is already completely loaded
Survey 420 MR 

Survey 468 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/468.RWIR70.FlatValuesSpec.csv) is already completely loaded
Survey 468 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/468.RWMR70.FlatValuesSpec.csv) is already completely loaded
Survey 47 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/47.MAIR21.FlatValuesSpec.csv) is already completely loaded
Survey 471 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/471.AFIR70.FlatValuesSpec.csv) is already completely loaded
Survey 471 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/471.AFMR70.FlatValuesSpec.csv) is already completely loaded
Survey 472 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/472.NPIR7H.FlatValuesSpec.csv) is already completely loaded
Survey 472 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/472.NPMR7H.FlatValuesSpec.csv) is already completely loaded
Survey 4

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Dropping and reloading rows from value_specs_blank for survey 475, filetype ir, 
              from file /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/475.ZWIR72.FlatValuesSpec.csv, reason: Rows mismatch between file and DB
Dropping ir rows for survey 475 from value_specs_blank
Inserting 475.ZWIR72.FlatValuesSpec.csv to value_specs_blank


  exec(code_obj, self.user_global_ns, self.user_ns)


Survey 475 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/475.ZWMR72.FlatValuesSpec.csv) is already completely loaded
Survey 476 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/476.COIR72.FlatValuesSpec.csv) is already completely loaded
Survey 476 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/476.COMR71.FlatValuesSpec.csv) is already completely loaded
Survey 477 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/477.AOIR71.FlatValuesSpec.csv) is already completely loaded
Survey 477 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/477.AOMR71.FlatValuesSpec.csv) is already completely loaded
Survey 478 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/478.ETIR71.FlatValuesSpec.csv) is already completely loaded
Survey 478 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/478.ETMR71.FlatValuesSpec.csv) is already completely loaded
Survey

Survey 53 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/53.NIMR21.FlatValuesSpec.csv) is already completely loaded
Survey 534 IR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/534.SNIR7Z.FlatValuesSpec.csv) is already completely loaded
Survey 534 MR values file (/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/534.SNMR7Z.FlatValuesSpec.csv) is already completely loaded
No data for survey 539 are present: loading latest data for IR from /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/539.GNIR71.FlatValuesSpec.csv
Inserting 539.GNIR71.FlatValuesSpec.csv to value_specs_blank
No data for survey 539 are present: loading latest data for MR from /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/539.GNMR71.FlatValuesSpec.csv
Inserting 539.GNMR71.FlatValuesSpec.csv to value_specs_blank
No data for survey 540 are present: loading latest data for IR from /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/540.RWIR7A.FlatValuesSpec

No data for survey 77 are present: loading latest data for MR from /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/77.MLMR31.FlatValuesSpec.csv
Inserting 77.MLMR31.FlatValuesSpec.csv to value_specs_blank
No data for survey 8 are present: loading latest data for IR from /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/8.OSIR01.FlatValuesSpec.csv
Inserting 8.OSIR01.FlatValuesSpec.csv to value_specs_blank
No data for survey 81 are present: loading latest data for IR from /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/81.SNIR32.FlatValuesSpec.csv
Inserting 81.SNIR32.FlatValuesSpec.csv to value_specs_blank
No data for survey 81 are present: loading latest data for MR from /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/81.SNMR31.FlatValuesSpec.csv
Inserting 81.SNMR31.FlatValuesSpec.csv to value_specs_blank
No data for survey 82 are present: loading latest data for IR from /mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/82.TZIR3A.FlatValuesSpec.csv
Insertin

Define some placeholder functions to say what we would do in the DB

In [40]:
DB_TABLESPEC_COLS = ['itemtype', 'recordname', 'recordtypevalue', 'recordlabel', 'name',
       'label', 'start', 'len', 'fmetype', 'surveyid', 'id', 'filecode']
DB_VALUESPEC_COLS = ['col_name', 'value', 'value_desc', 'value_type', 'surveyid', 'id',
       'filecode']

    
def add_varchar_col_to_table(schema_name, table_name, column_name, req_width):
    sql = f"""
        ALTER TABLE {schema_name}."{table_name}" 
        ADD COLUMN {column_name.lower()} CHARACTER VARYING ({req_width})"""
    return engine.execute(sql)
    
    
                
            
def delete_table_cols_for_survey(surveyid, dry_run=True):
    if dry_run:
        print(f"Would drop all rows for survey {surveyid} from {TABLESPEC_TABLENAME}")
        drop_tbl_svys.append(surveyid)
    else:
        print(f"Dropping all rows for survey {surveyid} from {TABLESPEC_TABLENAME}")
        meta = sa.MetaData()
        tablespec_db = sa.Table(TABLESPEC_TABLENAME, meta, schema=SPEC_SCHEMA, 
                                autoload=True, autoload_with=engine)
        cond = tablespec_db.c.surveyid == surveyid
        delete = tablespec_db.delete().where(cond)  
        res = engine.execute(delete)
    #pass

    
def delete_value_entries_for_survey(surveyid, dry_run=True):
    # only drop ones for matching columns because we get different ones from the IR and MR files
    if dry_run:
        print(f"Would drop all valuespec rows for survey {surveyid} from {VALUESPEC_TABLENAME}")
    else:
        print(f"Dropping all valuespec rows for survey {surveyid} from {VALUE_SPEC_TABLENAME}")
        meta = sa.MetaData()
        tablespec_db = sa.Table(TABLESPEC_TABLENAME, meta, schema=SPEC_SCHEMA, 
                                autoload=True, autoload_with=engine)
        cond = tablespec_db.c.surveyid == surveyid
        delete = tablespec_db.delete().where(cond)  
        res = engine.execute(delete)

                
def report_cols_all_good(surveyid, file_type, file_name):
    print(f"Survey {surveyid} {file_type.upper()} spec file ({file_name}) is already completely loaded")
    good_tbl_svys.append(os.path.basename(file_name).split('.')[0])
    

def report_vals_all_good(surveyid, file_type, file_name):
    print(f"Survey {surveyid} {file_type.upper()} values file ({file_name}) is already completely loaded")
    good_val_svys.append(os.path.basename(file_name).split('.')[0])

In [179]:
tbl_fn = "/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/505.MDIR71.FlatRecordSpec.csv"
val_fn = "/mnt/d/InformalCities/DHS_Updates/staging/parsed_specs/505.MDIR71.FlatValuesSpec.csv"


# Part 3 - Work out which data tables need to be (re)loaded

### Todo: transfer to next notebook

Find DHS data tables that exist in the surveys according to the (newly-updated) metadata tables, but which do not exist in the database yet

In [23]:
data_tables_missing = pd.read_sql(f"""
    SELECT DISTINCT recordname from {TABLE_SPEC_TABLE}
    WHERE recordname NOT IN (
        SELECT DISTINCT table_name AS recordname 
        FROM information_schema.columns
        WHERE table_schema = '{DATA_SCHEMA}')"""
        , con=engine)
data_tables_missing

Unnamed: 0,recordname
0,RECHEM
1,RECHAN2
2,MREC85
3,*
4,RECHCH1
5,RECHVC
6,MRECFG
7,RECHD
8,RECHAC
9,RECHAN3


Find data tables which have a JSON column - we won't check for "missing" columns on these in the same way

In [24]:
json_tables = pd.read_sql(f"""
    SELECT DISTINCT table_name 
    FROM information_schema.columns 
    WHERE table_schema = '{DATA_SCHEMA}' AND data_type = 'jsonb'""",
                          con=engine)   
json_tables

Unnamed: 0,table_name
0,REC91
1,REC94
2,REC95
3,REC97
4,RECH3
5,RECH4_json


 Find all data  columns which are needed, according to the metadata

In [25]:
data_columns_needed = pd.read_sql(f"""
    SELECT recordname, LOWER(name) AS name, MAX(len) AS maxlen 
    FROM {SPEC_SCHEMA}.{TABLESPEC_TABLENAME}
    WHERE recordname != '*'
    GROUP BY recordname, name
    ORDER BY recordname, name""", con=engine)
print(data_columns_needed.head())
print(len(data_columns_needed))

  recordname          name  maxlen
0     MREC01       mcaseid      15
1     MREC01  mrec01_group       5
2     MREC01         mv000       3
3     MREC01         mv001       8
4     MREC01         mv002       6
51324


Find all data columns which are actually present

In [26]:
cols_present = pd.read_sql(f"""SELECT table_name, column_name, data_type 
    FROM information_schema.columns 
    WHERE table_schema = 'dhs_data_tables' ORDER BY table_name, column_name""", con=engine)
cols_present[cols_present['table_name']=='MREC01']

Unnamed: 0,table_name,column_name,data_type
34,MREC01,mcaseid,character varying
35,MREC01,mrec01_group,character varying
36,MREC01,mv000,character varying
37,MREC01,mv001,character varying
38,MREC01,mv002,character varying
39,MREC01,mv003,character varying
40,MREC01,mv004,character varying
41,MREC01,mv005,character varying
42,MREC01,mv006,character varying
43,MREC01,mv007,character varying


Eliminate step-by-step (for clarity) to figure out which if any need adding to the data tables. 

First eliminate everything that does exist in the tables. NB we are only checking column name here and it'd be more robust to compare record/tablename at the same time, however the data are structured such that this works for a demo.

In [27]:
cols_not_present = data_columns_needed[~data_columns_needed['name'].isin(cols_present['column_name'])]

Next eliminate everything that should be present but would be found in a table that's been stored as a JSONB type so isn't in information_schema

In [28]:
nonjson_cols_not_present= cols_not_present[~cols_not_present['recordname'].isin(json_tables['table_name'])]
nonjson_cols_not_present

Unnamed: 0,recordname,name,maxlen
32,MREC01,mv033,8
46,MREC11,md463e,1
55,MREC11,mv108,1
56,MREC11,mv109,1
57,MREC11,mv110,1
...,...,...,...
50991,RECHVC,hvc60,3
50992,RECHVC,hvc61,1
50993,RECHVC,hvc62,2
50994,RECHVC,hvc68,1


Next eliminate everything that belongs to a table which is entirely missing

These columns all need adding

In [29]:
cols_missing = nonjson_cols_not_present[~nonjson_cols_not_present['recordname'].isin(data_tables_missing['recordname'])]
#data_cols_missing[data_cols_missing['recordname']=='MREC01']
cols_missing

Unnamed: 0,recordname,name,maxlen
32,MREC01,mv033,8
46,MREC11,md463e,1
55,MREC11,mv108,1
56,MREC11,mv109,1
57,MREC11,mv110,1
...,...,...,...
50738,RECHML,sh129ak,1
50739,RECHML,sh129al,1
50740,RECHML,sh129am,1
50741,RECHML,sh129an,1


Do the same check again direct in an SQL query, joining on all the correct columns for safety

In [30]:
cols_missing_db = pd.read_sql("""
-- Identify columns described in the metadata that are missing from the data,
-- and need to be added to data tables
WITH 
-- all the columns that exist in the data tables schema
existing_cols AS 
    (
        SELECT table_name, column_name, data_type 
        FROM information_schema.columns 
        WHERE table_schema = 'dhs_data_tables'
    )
-- all the columns that are described in the metadta
, req_cols AS 
(
    SELECT recordname, LOWER(name) AS name, MAX(len) AS maxlen
    FROM dhs_survey_specs.table_specs_blank
    GROUP BY recordname, name
)
-- all the tables described in the metadata which are missing in their entirety
, whole_tables_missing AS 
    (
        SELECT DISTINCT recordname from dhs_survey_specs.table_specs_blank
        WHERE recordname NOT IN (
            SELECT DISTINCT table_name AS recordname 
            FROM information_schema.columns
            WHERE table_schema = 'dhs_data_tables')
    )
-- all the tables which have been stored as JSON type, so don't have all their 
-- described columns present in the tables according to information_schema
, json_tables AS 
    (
        SELECT DISTINCT table_name 
        FROM information_schema.columns WHERE data_type = 'jsonb'
    )

SELECT 
    req_cols.* 
    --, existing_cols.column_name AS test_joined
    --, json_tables.table_name AS json_tbl
    --, whole_tables_missing.recordname AS missing_tbl
FROM req_cols
-- NOT IN type checks seem to never complete whereas left join then check null 
-- is almost instant
LEFT JOIN existing_cols ON
 req_cols.name = existing_cols.column_name --AND req_cols.recordname = existing_cols.table_name 
LEFT JOIN json_tables ON
 req_cols.recordname = json_tables.table_name
LEFT JOIN whole_tables_missing ON
 req_cols.recordname = whole_tables_missing.recordname
WHERE existing_cols.column_name is NULL
AND json_tables.table_name IS null 
AND whole_tables_missing.recordname is null
--AND req_cols.recordname != '*'
ORDER BY recordname, name
""", con=engine)

In [31]:
cols_missing_db['recordname'].unique()

array(['MREC01', 'MREC11', 'MREC31', 'MREC32', 'MREC51', 'MREC61',
       'MREC71', 'MREC75', 'MREC83', 'MREC84', 'MREC91', 'MREC92',
       'REC41', 'REC92', 'RECH4', 'RECH5', 'RECH5S', 'RECH6', 'RECH6CS',
       'RECH6S', 'RECHMH', 'RECHML'], dtype=object)

In [32]:
for r,n,l in cols_missing_db.values:
    print(r,n,l)

MREC01 mv033 8
MREC11 md463e 1
MREC11 mv108 1
MREC11 mv109 1
MREC11 mv110 1
MREC11 mv111 1
MREC11 mv112 1
MREC11 mv148 1
MREC31 mv304a 1
MREC32 mrec32_group 32
MREC32 mv312a 2
MREC32 mv312b 2
MREC32 mv312c 2
MREC32 mv313a 1
MREC32 mv313b 1
MREC32 mv313c 1
MREC32 mv361 1
MREC32 mv362 1
MREC32 mv363 2
MREC32 mv364 1
MREC32 mv366a 1
MREC32 mv366b 1
MREC32 mv367 1
MREC32 mv375a 2
MREC32 mv375b 2
MREC32 mv375c 2
MREC32 mv376 2
MREC32 mv376a 1
MREC32 mv384e 1
MREC32 mv3b0 1
MREC32 mv3b1 1
MREC32 mv3b11 2
MREC32 mv3b12a 1
MREC32 mv3b12b 1
MREC32 mv3b12c 1
MREC32 mv3b12d 1
MREC32 mv3b12e 1
MREC32 mv3b12f 1
MREC32 mv3b12g 1
MREC32 mv3b12h 1
MREC32 mv3b12x 1
MREC32 mv3b13 1
MREC32 mv3b14a 1
MREC32 mv3b14b 1
MREC32 mv3b14c 1
MREC32 mv3b14d 1
MREC32 mv3b14e 1
MREC32 mv3b14f 1
MREC32 mv3b14g 1
MREC32 mv3b14h 1
MREC32 mv3b14i 1
MREC32 mv3b14j 1
MREC32 mv3b14x 1
MREC32 mv3b15a 1
MREC32 mv3b15b 1
MREC32 mv3b15c 1
MREC32 mv3b15d 1
MREC32 mv3b15e 1
MREC32 mv3b15f 1
MREC32 mv3b15g 1
MREC32 mv3b15h 1
MREC

MREC91 mv833f 1
MREC91 mv833g 1
MREC91 mv833h 1
MREC91 mv833x 1
MREC91 mv834d 1
MREC91 mv834e 1
MREC91 mv834f 1
MREC91 mv834g 1
MREC91 mv834h 1
MREC91 mv834i 1
MREC91 mv834j 1
MREC91 mv834k 1
MREC91 mv834x 1
MREC91 mv834y 1
MREC91 mv835d 1
MREC91 mv835e 1
MREC91 mv835f 1
MREC91 sb62x 6
MREC91 sb69 8
MREC91 sb69s 8
MREC91 shb53 3
MREC91 shb55 1
MREC91 shb56 3
MREC91 shb57 1
MREC91 shb61 1
MREC91 shb63 1
MREC91 sm002 2
MREC91 sm003a 2
MREC91 sm003b 1
MREC91 sm004 3
MREC91 sm005 3
MREC91 sm006 2
MREC91 sm016 1
MREC91 sm017 1
MREC91 sm021 5
MREC91 sm022 5
MREC91 sm025 1
MREC91 sm040 4
MREC91 sm051 11
MREC91 sm052 1
MREC91 sm1003ca 1
MREC91 sm1003cb 1
MREC91 sm1003cc 1
MREC91 sm1003cd 1
MREC91 sm1003ce 1
MREC91 sm1003cf 1
MREC91 sm1003ci 1
MREC91 sm1003cj 1
MREC91 sm1003ck 1
MREC91 sm1003cl 1
MREC91 sm1004ca 1
MREC91 sm1004cb 1
MREC91 sm1004cc 1
MREC91 sm1004cd 1
MREC91 sm1004ce 1
MREC91 sm1004cf 1
MREC91 sm1005b 1
MREC91 sm1005c 1
MREC91 sm1005ca 1
MREC91 sm1005cb 1
MREC91 sm1005cc 1
MREC9

MREC91 sm308h 1
MREC91 sm308i 1
MREC91 sm308j 1
MREC91 sm308k 1
MREC91 sm308u 1
MREC91 sm308x 1
MREC91 sm309w 1
MREC91 sm310a 1
MREC91 sm310ba 1
MREC91 sm310bb 1
MREC91 sm310bc 1
MREC91 sm310bd 1
MREC91 sm310be 1
MREC91 sm310bf 1
MREC91 sm310bg 1
MREC91 sm310bh 1
MREC91 sm310bi 1
MREC91 sm310bj 1
MREC91 sm310bx 1
MREC91 sm310by 1
MREC91 sm310c 5
MREC91 sm310d 2
MREC91 sm310e 1
MREC91 sm310fa 1
MREC91 sm310fb 1
MREC91 sm310fc 1
MREC91 sm310fd 1
MREC91 sm310fe 1
MREC91 sm310h 1
MREC91 sm310ia 1
MREC91 sm310ib 1
MREC91 sm310ic 1
MREC91 sm310id 1
MREC91 sm310ie 1
MREC91 sm310if 1
MREC91 sm310ig 1
MREC91 sm310ih 1
MREC91 sm310ii 1
MREC91 sm310ix 1
MREC91 sm310j 1
MREC91 sm310ja 1
MREC91 sm310jb 1
MREC91 sm310jc 1
MREC91 sm310jd 1
MREC91 sm310je 1
MREC91 sm310jf 1
MREC91 sm310jg 1
MREC91 sm310jh 1
MREC91 sm310jx 1
MREC91 sm310k 1
MREC91 sm310q 1
MREC91 sm310r 1
MREC91 sm311a 1
MREC91 sm311b 1
MREC91 sm311e 1
MREC91 sm311g 1
MREC91 sm311h 1
MREC91 sm311i 1
MREC91 sm311j 1
MREC91 sm311k 1
MREC

MREC91 sm413k 1
MREC91 sm413l 1
MREC91 sm413m 1
MREC91 sm413n 1
MREC91 sm413o 1
MREC91 sm413p 1
MREC91 sm413q 1
MREC91 sm413r 1
MREC91 sm413s 1
MREC91 sm413x 1
MREC91 sm414a 2
MREC91 sm414b 3
MREC91 sm414c 3
MREC91 sm414d 2
MREC91 sm414e 4
MREC91 sm414f 1
MREC91 sm414fa 1
MREC91 sm414fb 1
MREC91 sm414fc 1
MREC91 sm414fd 1
MREC91 sm414fe 1
MREC91 sm414fx 1
MREC91 sm414g 2
MREC91 sm414h 1
MREC91 sm414i 2
MREC91 sm414j 2
MREC91 sm414k 1
MREC91 sm414l 1
MREC91 sm414m 1
MREC91 sm414n 1
MREC91 sm414x 1
MREC91 sm414z 1
MREC91 sm415a 1
MREC91 sm415b 1
MREC91 sm415c 1
MREC91 sm415d 1
MREC91 sm415e 1
MREC91 sm415f 1
MREC91 sm416a 3
MREC91 sm416b 1
MREC91 sm416c 2
MREC91 sm416d 2
MREC91 sm416e 4
MREC91 sm416f 1
MREC91 sm416fa 1
MREC91 sm416fb 1
MREC91 sm416fc 1
MREC91 sm416fd 1
MREC91 sm416fe 1
MREC91 sm416ff 1
MREC91 sm416fg 1
MREC91 sm416fh 1
MREC91 sm416fi 1
MREC91 sm416fj 1
MREC91 sm416fk 1
MREC91 sm416fl 1
MREC91 sm416fm 1
MREC91 sm416fn 1
MREC91 sm416fo 1
MREC91 sm416fq 1
MREC91 sm416fs 1
M

MREC91 sm509c1d 1
MREC91 sm509c1e 1
MREC91 sm509c1f 1
MREC91 sm509c1g 1
MREC91 sm509c1h 1
MREC91 sm509c1i 1
MREC91 sm509ca 1
MREC91 sm509cc 1
MREC91 sm509d 1
MREC91 sm509d1a 1
MREC91 sm509d1b 1
MREC91 sm509d1c 1
MREC91 sm509da 1
MREC91 sm509db 1
MREC91 sm509dc 1
MREC91 sm509dd 1
MREC91 sm509de 1
MREC91 sm509df 1
MREC91 sm509dg 1
MREC91 sm509dh 1
MREC91 sm509di 1
MREC91 sm509dj 1
MREC91 sm509dk 1
MREC91 sm509dl 1
MREC91 sm509dx 1
MREC91 sm509e 1
MREC91 sm509f 1
MREC91 sm509g 1
MREC91 sm509h 1
MREC91 sm509ha 1
MREC91 sm509hb 1
MREC91 sm509hc 1
MREC91 sm509hd 1
MREC91 sm509he 1
MREC91 sm509hf 1
MREC91 sm509hx 1
MREC91 sm509i 1
MREC91 sm509j 1
MREC91 sm509k 1
MREC91 sm509l 1
MREC91 sm509m 1
MREC91 sm509n 1
MREC91 sm509o 1
MREC91 sm509p 1
MREC91 sm509q 1
MREC91 sm509r 1
MREC91 sm509s 1
MREC91 sm509x 1
MREC91 sm509z 1
MREC91 sm51 2
MREC91 sm510a 2
MREC91 sm510b 1
MREC91 sm510ba 1
MREC91 sm510bb 1
MREC91 sm510bc 1
MREC91 sm510bd 1
MREC91 sm510be 1
MREC91 sm510bf 1
MREC91 sm510bg 1
MREC91 sm51

MREC91 sm618lc 1
MREC91 sm618ld 1
MREC91 sm618le 1
MREC91 sm618lf 1
MREC91 sm618lx 1
MREC91 sm619 2
MREC91 sm619a 1
MREC91 sm619aa 1
MREC91 sm619ab 1
MREC91 sm619ac 1
MREC91 sm619ad 1
MREC91 sm619ax 1
MREC91 sm619ay 1
MREC91 sm619b 1
MREC91 sm619c 1
MREC91 sm619ca 1
MREC91 sm619cb 1
MREC91 sm619cc 1
MREC91 sm619cd 1
MREC91 sm619ce 1
MREC91 sm619cf 1
MREC91 sm619cg 1
MREC91 sm619cx 1
MREC91 sm619d 1
MREC91 sm619e 1
MREC91 sm619ea 1
MREC91 sm619eb 1
MREC91 sm619ec 1
MREC91 sm619ex 1
MREC91 sm619f 1
MREC91 sm619fa 1
MREC91 sm619fb 1
MREC91 sm619fc 1
MREC91 sm619fd 1
MREC91 sm619fe 1
MREC91 sm619ff 1
MREC91 sm619fg 1
MREC91 sm619fh 1
MREC91 sm619fi 1
MREC91 sm619fj 1
MREC91 sm619fk 1
MREC91 sm619fl 1
MREC91 sm619fm 1
MREC91 sm619fn 1
MREC91 sm619fo 1
MREC91 sm619fp 1
MREC91 sm619fq 1
MREC91 sm619fr 1
MREC91 sm619fx 1
MREC91 sm619fy 1
MREC91 sm619g 1
MREC91 sm619h 1
MREC91 sm619i 1
MREC91 sm619j 1
MREC91 sm619k 1
MREC91 sm619x 1
MREC91 sm619y 1
MREC91 sm619z 1
MREC91 sm620 2
MREC91 sm620a 2

MREC91 sm732ja 1
MREC91 sm732jb 1
MREC91 sm732jc 1
MREC91 sm732jd 1
MREC91 sm732je 1
MREC91 sm732jf 1
MREC91 sm732ka 1
MREC91 sm732kb 1
MREC91 sm732kc 1
MREC91 sm732kd 1
MREC91 sm732ke 1
MREC91 sm732kf 1
MREC91 sm732kg 1
MREC91 sm732x 1
MREC91 sm733 2
MREC91 sm733a 1
MREC91 sm733aa 1
MREC91 sm733ab 1
MREC91 sm733ac 1
MREC91 sm733ad 1
MREC91 sm733ae 1
MREC91 sm733af 1
MREC91 sm733ag 1
MREC91 sm733ah 1
MREC91 sm733ai 1
MREC91 sm733aj 1
MREC91 sm733ak 1
MREC91 sm733al 1
MREC91 sm733aw 1
MREC91 sm733ax 1
MREC91 sm733ay 1
MREC91 sm733az 1
MREC91 sm733b 1
MREC91 sm733ba 1
MREC91 sm733bb 1
MREC91 sm733bc 1
MREC91 sm733bd 1
MREC91 sm733be 1
MREC91 sm733bf 1
MREC91 sm733bg 1
MREC91 sm733bh 1
MREC91 sm733bi 1
MREC91 sm733bj 1
MREC91 sm733bk 1
MREC91 sm733bl 1
MREC91 sm733bw 1
MREC91 sm733bx 1
MREC91 sm733by 1
MREC91 sm733bz 1
MREC91 sm733c 1
MREC91 sm733d 1
MREC91 sm733e 1
MREC91 sm733f 1
MREC91 sm733g 1
MREC91 sm733h 1
MREC91 sm733i 1
MREC91 sm733j 1
MREC91 sm733k 1
MREC91 sm733l 1
MREC91 sm733

MREC91 sm827am 1
MREC91 sm827an 1
MREC91 sm827ao 1
MREC91 sm827ax 1
MREC91 sm827ba 1
MREC91 sm827bb 1
MREC91 sm827bc 1
MREC91 sm827bd 1
MREC91 sm827be 1
MREC91 sm827bf 1
MREC91 sm827bg 1
MREC91 sm827bh 1
MREC91 sm827bi 1
MREC91 sm827bj 1
MREC91 sm827bk 1
MREC91 sm827bl 1
MREC91 sm827bm 1
MREC91 sm827bn 1
MREC91 sm827bo 1
MREC91 sm827bx 1
MREC91 sm827ca 1
MREC91 sm827cb 1
MREC91 sm827cc 1
MREC91 sm827cd 1
MREC91 sm827ce 1
MREC91 sm827cf 1
MREC91 sm827cg 1
MREC91 sm827ch 1
MREC91 sm827ci 1
MREC91 sm827cj 1
MREC91 sm827ck 1
MREC91 sm827cl 1
MREC91 sm827cm 1
MREC91 sm827cn 1
MREC91 sm827co 1
MREC91 sm827cx 1
MREC91 sm828y 1
MREC91 sm829 2
MREC91 sm830a 1
MREC91 sm830b 1
MREC91 sm830c 1
MREC91 sm830d 1
MREC91 sm830e 1
MREC91 sm830f 1
MREC91 sm830g 1
MREC91 sm830h 1
MREC91 sm830i 1
MREC91 sm830x 1
MREC91 sm830y 1
MREC91 sm831b 1
MREC91 sm831c 1
MREC91 sm831d 3
MREC91 sm831e 1
MREC91 sm831f 1
MREC91 sm831g 1
MREC91 sm831h 1
MREC91 sm831i 1
MREC91 sm831s 3
MREC91 sm831y 1
MREC91 sm831z 1
MREC9

RECH5S sb112fw 4
RECH6 shc110 1
RECH6CS sb107a 1
RECH6S sb211fw 4
RECH6S sb218fw 4
RECH6S sha52ai 1
RECHMH shb127y 1
RECHML sh129aa 1
RECHML sh129ab 1
RECHML sh129ac 1
RECHML sh129ad 1
RECHML sh129ae 1
RECHML sh129af 1
RECHML sh129ag 1
RECHML sh129ah 1
RECHML sh129ai 1
RECHML sh129aj 1
RECHML sh129ak 1
RECHML sh129al 1
RECHML sh129am 1
RECHML sh129an 1
RECHML sh129ax 1


In [33]:
cols_missing_db

Unnamed: 0,recordname,name,maxlen
0,MREC01,mv033,8
1,MREC11,md463e,1
2,MREC11,mv108,1
3,MREC11,mv109,1
4,MREC11,mv110,1
...,...,...,...
6546,RECHML,sh129ak,1
6547,RECHML,sh129al,1
6548,RECHML,sh129am,1
6549,RECHML,sh129an,1


In [34]:
for datatable in bobs_excellent_list:
    data_table_cols_present = pd.read_sql(f"""
        SELECT table_name, column_name, data_type 
        FROM information_schema.columns 
        WHERE table_name = '{datatable}'""")

NameError: name 'bobs_excellent_list' is not defined