# DHS (CSPro) .DAT file parser

This workbook contains sample code for parsing .dat files in the CSPro format used by DHS in their "hierarchical" downloads. It splits each .dat file into one CSV file for each table or "recordtype" contained in that file.

It relies on first having parsed the dictionary specification file associated with a given .dat file(s), generating a "recordspec" file. This provides the necessary information for the code in this workbook to read a .dat file and identify how to split the characters found on a given line into individual columns of data associated with a particular table schema.

This parsing should therefore be done first (using the code in the DCF_Parser notebook, or similar). At present this notebook uses record spec files in CSV format but of course this could easily be modified to use the specification stuff that has been stored in the database in the dhs_table_specs_flat table.


In [1]:
import csv
import glob
from operator import itemgetter
import os

In [2]:
inSpecDir = r'\\map-fs1.ndph.ox.ac.uk\map_data\DHS_Automation\Acquisition\Update_Staging\parsed'
inDataDir = r'\\map-fs1.ndph.ox.ac.uk\map_data\DHS_Automation\Acquisition\Update_Staging\download'
outputCSVTableDir = r'\\map-fs1.ndph.ox.ac.uk\map_data\DHS_Automation\Acquisition\Update_Staging\tables'

In [3]:
allDatFiles = glob.glob(os.path.join(inDataDir, '*', '*.dat'))
nColsPerRecType = {}
# rtFieldInfoAllFiles = {}
# resGlobal = {}

In [4]:
allDatFiles

['\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Update_Staging\\download\\500\\500.JOMR71.DAT',
 '\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Update_Staging\\download\\500\\500.JOIR71.DAT',
 '\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Update_Staging\\download\\523\\523.PKMR71.DAT',
 '\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Update_Staging\\download\\523\\523.PKIR71.DAT',
 '\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Update_Staging\\download\\494\\494.MVIR71.DAT',
 '\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Update_Staging\\download\\494\\494.MVMR71.DAT',
 '\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Update_Staging\\download\\390\\390.ZADV71FL.DAT',
 '\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Update_Staging\\download\\390\\390.ZAIR71.DAT',
 '\\\\map-fs1.ndph.ox.ac.uk\\map_data\\DHS_Automation\\Acquisition\\Up

In [5]:
# Should we produce "global" outputs e.g. one RECH1 file containing RECH1 from all surveys
# as well as the table-by-survey outputs?
# This requires a lot of memory!. Probably not required now we are using a db for holding the output; 
# also it would only make sense if we were parsing all DHS surveys at once. 
# We'll just store one CSV per survey per record type.
doGlobalOutput = False

Parses all DAT files based on their record specifications, generated separately, into individual CSVs: one per file and per record type

In [7]:
if not os.path.exists(outputCSVTableDir):
    os.makedirs(outputCSVTableDir)
else:
    assert os.path.isdir(outputCSVTableDir)

for datFN in allDatFiles:
    # get the corresponding spec file
    filecode = os.path.extsep.join(os.path.basename(datFN).split(os.path.extsep)[:-1])
    specFileName = os.path.join(inSpecDir, '{0!s}.FlatRecordSpec.csv'.format(filecode))
    
    # See if we've already done this one
    outTestFN = os.path.join(outputCSVTableDir, 
                             '{0!s}.{1!s}.csv'.format(filecode,'REC01'))
    if os.path.exists(outTestFN):
        print ("Already did " + filecode)
        continue
    print ("Parsing " + filecode)
    
    # read the variables of this survey, specifying how to split the .dat
    with open(specFileName,'r') as dictFile:
        dictFileReader = csv.DictReader(dictFile)
        # the record type position info must be in the first line
        recordTypeInfo = next(dictFileReader)
        rtStart = int(recordTypeInfo['Start'])-1
        rtEnd = int(recordTypeInfo['Len']) + rtStart
        allVarsThisFile = [row for row in dictFileReader]
    for fieldInfo in allVarsThisFile:
        fieldInfo['Start'] = int(fieldInfo['Start'])
        fieldInfo['Len'] = int(fieldInfo['Len'])
    # sort them by record type then position in the row
    allVarsThisFileSorted = sorted(allVarsThisFile, 
                                   key=(itemgetter('RecordTypeValue','Start')))
    
    # build dictionary of record type (tablename) : list of its fields
    rtFieldInfoThisFile = {}
    for fieldInfo in allVarsThisFileSorted:
        recordTag = fieldInfo['RecordTypeValue']
        # do it for this file specifically
        if recordTag not in rtFieldInfoThisFile:
            rtFieldInfoThisFile[recordTag] = []
        rtFieldInfoThisFile[recordTag].append(fieldInfo)
        
        if doGlobalOutput:
           # build one across all files that contain this record type
           # requires lotsa memory!
            if recordTag not in rtFieldInfoAllFiles:
                rtFieldInfoAllFiles[recordTag] = {}
            fieldName = fieldInfo['Name']
            if fieldName not in rtFieldInfoAllFiles[recordTag]:
                rtFieldInfoAllFiles[recordTag][fieldName] = fieldInfo
            else:
                # the position of a given field in the line has to be the same each time
                assert rtFieldInfoAllFiles[recordTag][fieldName]['Start'] == fieldInfo['Start']
                assert rtFieldInfoAllFiles[recordTag][fieldName]['Len'] == fieldInfo['Len']

    # Now parse the survey data file itself
    # read the data and put everything into a list, in a dictionary keyed by record type
    res = {}
    colsPerRecTypeThisFile = {}
    with open(datFN, 'r') as data:
        linenum = 0
        for line in data:
            linenum += 1
            # the position of the recordtype in the line is the same across the entire file
            recordtype = line[rtStart:rtEnd]
            # get the spec for this type of row
            if recordtype not in rtFieldInfoThisFile:
                print ("Specification for recordtype '{0!s}' not found in file for {1!s} at line {2!s}".format(
                    recordtype, filecode, linenum))
                if linenum == 1:
                    print ("As this is the first line of the file, the problem may be that the file is" +
                    " saved with BOM which skews the byte count. Please re-save the .dat without BOM.")
                continue
                
            recordSpec = rtFieldInfoThisFile[recordtype]
            if recordtype not in res:
                res[recordtype] = []
            
            # split the column-aligned text according to the row specification
            # this is the part that is inefficient in FME (lots of list items)
            
            # The .DAT format allows a fixed width for each column of each recordtype.
            # Should we strip the whitespace? This is difficult. In general, yes we should 
            # but NOT in the case of the CASEID / HHID variables. The HHID is usually the CASEID
            # with the last 3 chars trimmed off, but if we trim "some" whitespace from HHID here 
            # then we can break that association and damage referential integrity.
            # On the other hand some joins are based on e.g. BIDX (recorded as len 2) 
            # to MIDX (recorded as len 1, despite containing same data), and we need 
            # to join on a single digit found in both so BIDX would need to be stripped.
            stripornot = lambda data, name: data if name in ('CASEID', 'HHID') else data.strip()
            rowParts = [stripornot(
                    (line[i['Start']-1 : i['Start']+i['Len']-1]),
                    i['Name']) 
                for i in recordSpec]
            
            if recordtype not in colsPerRecTypeThisFile:
                colsPerRecTypeThisFile[recordtype] = len(rowParts)
            else:
                assert len(rowParts) == colsPerRecTypeThisFile[recordtype]
            # add as a list to the list of rows for this record type
            res[recordtype].append(rowParts)#(",".join(rowParts))
    if doGlobalOutput:
        resGlobal[filecode] = res

    # write a csv for each record type
    for recordtype,fields in rtFieldInfoThisFile.items():
        if not recordtype in res:
            print ("No rows found for record type {0!s} in file {1!s} despite DCF specification".format(
                recordtype, filecode))
            continue
        fieldHeader = [i['Name'] for i in fields]
        fieldRecords = set([i['RecordName'] for i in fields])
        assert len(fieldRecords) == 1
        recName = fieldRecords.pop()
        outFN = os.path.join(outputCSVTableDir, '{0!s}.{1!s}.csv'.format(filecode,recName))
        with open(outFN, 'w', newline='') as outcsv:
            csvwriter = csv.writer(outcsv)
            csvwriter.writerow(fieldHeader)
            csvwriter.writerows(res[recordtype])

if doGlobalOutput:
    # write one file for each table. Each output table must have the unioned set of columns 
    # from each survey's version of this table.
    for filecode, fileres in resGlobal.iteritems():
        for recordtype,fields in rtFieldInfoAllFiles.items():
            allFieldsThisRecordSorted = sorted(fields, key=(itemgetter('Start')))
            groupFieldHeader = [i['Name'] for i in allFieldsThisRecordSorted]
            fieldRecords = set([i['RecordName'] for i in fields])
            assert len(fieldRecords) == 1
            recName = fieldRecords.pop()
            outGroupFN = os.path.join(outputCSVTableDir, '{0!s}_All.csv'.format(recName))
            if not os.path.exists(outGroupFN):
                # create the output file for this table
                with open(outGroupFN, 'w', newline='') as outcsv:
                    csvwriter = csv.writer(outcsv)
                    groupFieldHeader.insert(0,'FileCode')
                    groupFieldHeader = [i.lower() for i in groupFieldHeader]
                    csvwriter.writerow(groupFieldHeader)
                    colsPerRecType[recordtype]=len(groupFieldHeader)
               
            [i.insert(0,filecode) for i in fileres[recordtype]]
            with open(outGroupFN, 'a', newline='') as outcsv:
                csvwriter = csv.writer(outcsv)
                resThisFileThisRecord = fileres[recordtype]
                allSameLength = True
                for i in resThisFileThisRecord:
                    if len(i) != groupFieldHeader:
                        allSameLength = False
                        break
                if allSameLength:
                    csvwriter.writerows(resThisFileThisRecord)
                else:
                    for i in res:
                        pass
                        # Todo - write this 

                if len(res[recordtype][0]) != colsPerRecType[recordtype]:
                    print ("Warning! File {0!s} record type {1!s} has more cols than were defined in an earlier file"
                           .format(filecode, recordtype))
            


Parsing 500.JOMR71
Parsing 500.JOIR71
Parsing 523.PKMR71
Parsing 523.PKIR71
Parsing 494.MVIR71
Parsing 494.MVMR71
Parsing 390.ZADV71FL
Parsing 390.ZAIR71
Parsing 390.ZAMR71
