### Simple script to go through all the REC* tables in the DHS database and add indexes to all the key columns 

Rationale: we loaded all the parsed .DHS tables into a PostgreSQL database with a straightforward dump, and now want to automate the process of indexing the fields we are likely to want to join on.

In [1]:
import psycopg2


In [2]:
# libpq-style connection string, assembled from two adjacent literals.
# The password is a placeholder and must be substituted before connecting.
conn_string = (
    "host='map-fs1.ndph.ox.ac.uk' port='5433' dbname='dhs_data_pit' "
    "    user='admin' password='##PASSWORD##'"
)

In [3]:
# Open the database connection described by conn_string.
conn = psycopg2.connect(conn_string)

In [4]:
# Single cursor on conn, reused for the catalog queries and index statements.
crs = conn.cursor()

In [5]:
# Figure out the table names we want to look at: ordinary tables (relkind 'r')
# in the dhs_data_tables schema whose name starts with REC or MREC.
# The pattern is anchored so that it matches the stated intent, and the
# schema is filtered because every table is later read from dhs_data_tables;
# a matching table in some other schema would make that read fail.
crs.execute(
    "select c.relname from pg_class c "
    "join pg_namespace n on n.oid = c.relnamespace "
    "where c.relkind = 'r' and n.nspname = 'dhs_data_tables' "
    "and c.relname ~ '^M?REC';"
)
tblNames = [row[0] for row in crs.fetchall()]

In [6]:
# Collect the names of every index already present (relkind 'i'), so that
# index names generated later can be checked against them.
crs.execute("select relname from pg_class where relkind='i'")
existingIndexes = []
for row in crs.fetchall():
    existingIndexes.append(row[0])

Identifying the columns to index is based on knowledge of the content of the database: they all have "id" in the names, in lowercase. 

We're not doing anything more sophisticated than that for now (for example, trying to be intelligent with the "relationships" part of the .dcf), partly because that doesn't necessarily cover all possible joins and partly because we haven't really needed to!

However, there are a few columns we ought to index that get missed by this simple strategy, or that are needed for joins which are outside the normal hierarchy but would nonetheless be possible. For example, REC01.V034 represents "line number of husband", so it might be desirable to join on this field to the husband's record to create some kind of couples dataset. For now, we haven't addressed this.

In [7]:
# Switches for the index-building cell; all are off (0) by default.
#   runIt           - execute each table's SQL as it is generated
#   allUpper        - upper-case the table name used in the index statements
#   replaceExisting - drop and recreate indexes that already exist
#                     (otherwise existing ones are skipped)
runIt = allUpper = replaceExisting = 0

We're just going to make an index on each *ID* column, plus covering index(es) across several of them for tables which have more than one.

In [None]:
cleanSQLTemplate = "DROP INDEX IF EXISTS dhs_data_tables.{0};"
idxSQLTemplate = "CREATE INDEX {0} ON {1}({2});"
idxNameTemplate = "{0}_{1}"
# Every SQL batch generated below, in order (for each table: the drops, if
# any, followed by the creates). Kept so it can be printed or run afterwards.
allSQL = []


def _planIndex(idxName, qualifiedTable, columns, dropStmts, createStmts):
    """Queue the SQL for one index and print what was decided.

    idxName        -- name of the index to create
    qualifiedTable -- schema-qualified, quoted table name
    columns        -- list of column names the index covers, in order
    dropStmts      -- list that receives a DROP statement when replacing
    createStmts    -- list that receives the CREATE statement

    An index whose name is already in existingIndexes is skipped, unless
    replaceExisting is set, in which case it is dropped and recreated.
    """
    createStmt = idxSQLTemplate.format(idxName, qualifiedTable, ",".join(columns))
    if idxName not in existingIndexes:
        createStmts.append(createStmt)
        print("Adding idx " + idxName)
    elif replaceExisting:
        dropStmts.append(cleanSQLTemplate.format(idxName))
        createStmts.append(createStmt)
        print("Replacing idx " + idxName)
    else:
        print("Skipped existing idx " + idxName)


# Create an index for each column with "id" in the title, and if there
# is more than one (e.g. caseid, bidx) then also create composite indexes.
for tblName in tblNames:
    print(tblName)
    # Fetch a single row purely to get the column names via crs.description.
    crs.execute('SELECT * FROM dhs_data_tables."{0}" LIMIT 1'.format(tblName))

    if allUpper:
        tblName = tblName.upper()
    # note the lowercase will exclude the OGC_FID column which is already indexed
    idxfields = [c[0] for c in crs.description if "id" in c[0]]
    print(idxfields)

    qualifiedTable = 'dhs_data_tables."' + tblName + '"'
    lowerName = tblName.lower()
    dropidxStmts = []
    idxStmts = []

    # a separate index on each joining column
    for c in idxfields:
        _planIndex(idxNameTemplate.format(c, lowerName), qualifiedTable,
                   [c], dropidxStmts, idxStmts)

    # also a single covering index on all joining columns
    if len(idxfields) > 1:
        _planIndex(idxNameTemplate.format("allidx", lowerName), qualifiedTable,
                   idxfields, dropidxStmts, idxStmts)

    # also a covering index on the first two joining columns if there are three
    # (or all except the last one, if there's more)
    # e.g. surveyid and caseid but not bidx (the cols are in the appropriate
    # order in the CSVs)
    if len(idxfields) > 2:
        _planIndex(idxNameTemplate.format("twoidx", lowerName), qualifiedTable,
                   idxfields[:-1], dropidxStmts, idxStmts)

    dropIndexSQL = "\n".join(dropidxStmts)
    indexSQL = "\n".join(idxStmts)
    # Drops go before creates so that a replaced index is gone before it is
    # recreated under the same name.
    for sqlBatch in (dropIndexSQL, indexSQL):
        if not sqlBatch:
            continue
        if runIt:
            # psycopg2 cursors have no executescript() (that is sqlite3);
            # execute() accepts a string of several ';'-separated statements.
            crs.execute(sqlBatch)
        allSQL.append(sqlBatch)

if runIt:
    # The connection object in this notebook is `conn`.
    conn.commit()
    # NOTE: closing the connection means crs cannot be used again afterwards,
    # so do not also run the generated SQL by hand when runIt is set.
    conn.close()

In [None]:
# Show the complete generated script for review. The parenthesised
# single-argument form runs unchanged on both Python 2 and Python 3.
print("\n".join(allSQL))

In [None]:
# Run every generated batch in order, echoing each one first, then commit
# them all in a single transaction.
for sqlBatch in allSQL:
    if sqlBatch:
        print(sqlBatch)
        crs.execute(sqlBatch)
conn.commit()