In [1]:
import fitz
import pandas as pd
import re 

# Print PyMuPDF's module docstring, which reports the installed library version
# and build, so the environment that produced the results is recorded.
print(fitz.__doc__)


PyMuPDF 1.19.4: Python bindings for the MuPDF 1.19.0 library.
Version date: 2022-01-01 00:00:01.
Built for Python 3.7 on darwin (64-bit).



In [2]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle detects the protocol version automatically, so none is specified.
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


states = _load_pickle('../DATA/states.pickle')
districts = _load_pickle('../DATA/districts.pickle')
indicators = _load_pickle('../DATA/indicators.pickle')


In [3]:
# Uncomment to inspect the objects loaded from the pickle files:
# print(states)
# print(districts)
# print(indicators)
# Concatenate the first three indicator groups into a single ordered sequence;
# getstats() searches the PDF text for these labels in exactly this order.
ind = indicators[0] + indicators[1] + indicators[2]
print(len(ind))

107


In [4]:
def _page_text(doc, page_number):
    """Return the text of one page with every newline and space removed."""
    text = doc.load_page(page_number).get_text('text')
    return text.replace('\n', '').replace(' ', '')


def getstats(state, d):
    """Extract the raw value strings of one district fact sheet.

    Opens ``'../DATA/NFHS5/' + states[state] + '_' + d + '.pdf'``, joins the
    whitespace-free text of three pages and cuts out the text found between
    each pair of consecutive labels in the module-level sequence ``ind``.

    Parameters
    ----------
    state : key of the module-level ``states`` mapping (also used to look up
        special-case corrections).
    d : district name as it appears in the PDF file name.

    Returns
    -------
    (stats, has_nfhs4)
        ``stats`` is a list of strings, one per gap between consecutive labels
        of ``ind``, with the gap following the last label of each of the first
        two indicator groups removed (presumably those labels are group
        terminators rather than real indicators -- TODO confirm).
        ``has_nfhs4`` is True when the text contains ``'NFHS-4(2015-16)'``.
        If the PDF cannot be opened, a message is printed and ``([], False)``
        is returned so that callers can always unpack two values.
    """
    filename = states[state] + '_' + d + '.pdf'
    try:
        doc = fitz.open('../DATA/NFHS5/' + filename)
    except Exception:
        # Best effort: report the unreadable file and let the caller skip it.
        print(filename + ' : No such file found\n')
        return [], False

    # STEP 1: READ PAGES 3, 4 AND 5 AND REMOVE ALL '\n' AND WHITESPACES
    try:
        para1 = _page_text(doc, 2)
        para2 = _page_text(doc, 3)
        # This one fact sheet has its third data page one page later.
        if state == 'West_Bengal' and d == 'Jalpaiguri':
            para3 = _page_text(doc, 5)
        else:
            para3 = _page_text(doc, 4)
    finally:
        # Release the document even when a page is missing or unreadable.
        doc.close()
    para = para1 + para2 + para3

    # STEP 2: CORRECTIONS
    if (d == 'Wardha' and state == 'Maharashtra') or (d == 'Mahisagar' and state == 'Gujarat'):
        for i in range(9, 31):
            leftside, rightside = ind[i].split('.')[0], ind[i].split('.')[1]
            if i == 11:
                rightside = 'Householdswithanyusualmembercoveredbyahealthschemeorhealthinsurance(%)'
            idot = para.find(rightside)
            if idot == -1:
                # Label text not present: slicing with -1 would corrupt para.
                continue
            # Overwrite the label together with the number and separator in
            # front of it with the canonical label text from ``ind``.
            para = para[:idot-len(leftside)-1] + ind[i] + para[idot+len(rightside):]
    third_group_start = len(indicators[0]) + len(indicators[1])
    if state == 'Puducherry':
        phrase = 'HypertensionamongAdults(age15yearsandabove)Womenna92.Mildlyelevatedbloodpressure(Systolic140-159mmofHgand/orDiastolic90-99mmofHg)(%)'
        para = para.replace(phrase, ind[third_group_start+25])
    if d == 'Raigarh' and state == 'Maharashtra':
        phrase = 'TobaccoUseandAlcoholConsumptionamongAdults(age15yearsandabove)na101.Womenage15yearsandabovewhouseanykindoftobacco(%)'
        para = para.replace(phrase, ind[third_group_start+34])
        phrase = 'na102.Menage15yearsandabovewhouseanykindoftobacco(%)'
        para = para.replace(phrase, ind[third_group_start+35])

    # STEP 3: Read numerical data in string form
    begin_indx = [para.find(indic) for indic in ind]
    if -1 in begin_indx:
        # At least one label was not found; dump diagnostics for inspection.
        print(state, d, ':', begin_indx, ' : ', len(begin_indx), '\n')
        print(para2)
    end_indx = [start + len(label) for start, label in zip(begin_indx, ind)]
    # The value of label k is whatever sits between it and label k+1.
    stats = [para[end_indx[k]:begin_indx[k+1]] for k in range(len(ind)-1)]
    # Drop the gaps that follow the last label of the second and first group
    # (higher index first so the lower index stays valid).
    del stats[third_group_start-1]
    del stats[len(indicators[0])-1]

    return stats, 'NFHS-4(2015-16)' in para


In [5]:
        # Error codes  ->   0: None, 
        #                   1: based on 25-49 unweighted data points, 
        #                   2: not available, 
        #                   3: not shown, based on < 25 data points
        #                   4: no comparable estimates are available from NFHS-4 in this 
        #                      district due to district boundary changes or a newly formed district. 

# Positions in ``stats`` whose values are split with the comma-based rule
# (integers such as "1,047") instead of on the decimal point.
_INTEGER_STAT_POSITIONS = (2, 3, 38)


def _clean_value(value):
    """Remove commas, parentheses, asterisks and 'na' markers from one value."""
    return re.sub('na', '', re.sub(r'[,()*]', '', value))


def _split_value_pair(s, position):
    """Split one fused "NFHS-5 value + NFHS-4 value" string.

    Returns ``(nfhs5_text, nfhs4_text, nfhs5_code, nfhs4_code)``; the texts
    still carry their markers ('na', '*', ','), the codes follow the error
    code table documented on writeStatsToDF.
    """
    if not s:
        # Nothing was extracted for this indicator: report "not available".
        return '', '', 2, 2

    code5 = code4 = 0
    cut = -1  # index of the last character of the NFHS-5 value
    starts_paren = s[0] == '('
    first_close = s.find(')')
    # Decide this BEFORE stripping the parentheses; a trailing ')' belongs to
    # the NFHS-4 value unless it is the one closing the leading group.
    ends_paren = s[-1] == ')' and not (starts_paren and first_close == len(s) - 1)
    if starts_paren:
        code5 = 1
        cut = first_close - 2
    if ends_paren:
        code4 = 1
        if not starts_paren:
            cut = s.find('(') - 1
    if starts_paren or ends_paren:
        s = s.replace('(', '').replace(')', '')

    if s:
        if s[0] == 'n':
            code5 = 2
            cut = s.find('a')
        if s[-1] == 'a':
            code4 = 2
            cut = len(s) - 3
        if s[0] == '*':
            code5 = 3
            cut = 0
        if s[-1] == '*':
            code4 = 3
            cut = len(s) - 2

    if code5 == 0 and code4 == 0:
        if position not in _INTEGER_STAT_POSITIONS:
            # One-decimal numbers: the first value ends one digit after its dot.
            cut = s.find('.') + 1
        else:
            cut = s.find(',')
            if cut == 1 or cut == 2:
                # comma in second position from left
                cut = cut + 3
            elif cut == len(s) - 1 - 3:
                # comma in 4th position from right
                cut = cut - 2
            else:
                # no comma found, so 999 or less on each column
                cut = 2

    if cut < 0:
        # Keep the row aligned instead of silently dropping the entry.
        print('unparseable entry:', s, len(s))
        return '', '', code5, code4
    return s[:cut+1], s[cut+1:], code5, code4


def writeStatsToDF(stats, isNFHS4):
    """Turn raw value strings into cleaned NFHS-4 / NFHS-5 values and error codes.

    Error codes  ->   0: None,
                      1: based on 25-49 unweighted data points (parenthesised),
                      2: not available ('na', or nothing extracted),
                      3: not shown, based on < 25 data points ('*'),
                      4: no comparable NFHS-4 estimate for this district.

    Parameters
    ----------
    stats : list of str, one entry per indicator. When ``isNFHS4`` is true each
        entry holds the NFHS-5 value immediately followed by the NFHS-4 value;
        otherwise it holds the NFHS-5 value only.
    isNFHS4 : whether NFHS-4 values are present in ``stats``.

    Returns
    -------
    (stats_NFHS4, stats_NFHS5, err_NFHS4, err_NFHS5)
        Four lists of the same length as ``stats``. Values are strings with
        commas, parentheses, asterisks and 'na' removed; codes are ints.
    """
    stats_NFHS4, stats_NFHS5 = [], []
    err_NFHS4, err_NFHS5 = [], []

    if isNFHS4:  # both NFHS-4 and NFHS-5 data available
        for position, raw in enumerate(stats):
            value5, value4, code5, code4 = _split_value_pair(raw, position)
            stats_NFHS5.append(_clean_value(value5))
            stats_NFHS4.append(_clean_value(value4))
            err_NFHS5.append(code5)
            err_NFHS4.append(code4)
    else:  # only NFHS-5 data available
        leading_marker_codes = {'(': 1, 'n': 2, '*': 3}
        for raw in stats:
            err_NFHS5.append(leading_marker_codes.get(raw[0], 0) if raw else 2)
            err_NFHS4.append(4)
            stats_NFHS5.append(_clean_value(raw))
            stats_NFHS4.append('')

    return stats_NFHS4, stats_NFHS5, err_NFHS4, err_NFHS5



In [6]:
# for state in states:
#     for d in districts[state]:
#         getstats(state, d)
        
# stats, isNFHS4 = getstats('Puducherry', 'Mahe')
# stats_NFHS4, stats_NFHS5, err_NFHS4, err_NFHS5 = writeStatsToDF(stats, isNFHS4)
# print(stats_NFHS5)
# print(stats_NFHS4)

In [9]:
import progressbar

widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ',
            progressbar.Bar('*'),' (',progressbar.ETA(), ') ',
          ]

# One tick per district: size the bar from the data instead of hard-coding it.
n_districts = sum(len(districts[state]) for state in states)
bar = progressbar.ProgressBar(max_value=n_districts,
                          widgets=widgets).start()

# getstats() yields one value per label gap and then drops two of them,
# leaving len(ind) - 3 indicators per survey round.
str_nfhs5 = ['Q'+str(i)+'_NFHS5' for i in range(1, len(ind)-2)]
str_nfhs4 = ['Q'+str(i)+'_NFHS4' for i in range(1, len(ind)-2)]
err_nfhs5 = ['err'+str(i)+'_NFHS5' for i in range(1, len(ind)-2)]
err_nfhs4 = ['err'+str(i)+'_NFHS4' for i in range(1, len(ind)-2)]
cols      = ['State', 'District']+str_nfhs5+str_nfhs4+err_nfhs5+err_nfhs4

# Collect rows in a list and build the frame once; appending to a DataFrame
# row by row copies the data on every insertion.
rows = []
i = 0
for state in states: #[s for s in states if s not in ['Chandigarh', 'Lakshadweep']]:
    for d in districts[state]:
        result = getstats(state, d)
        # getstats reports an unreadable PDF with an empty result; skip that
        # district instead of crashing on the tuple unpacking below.
        if result and result[0]:
            stats, isNFHS4 = result
            stats_NFHS4, stats_NFHS5, err_NFHS4, err_NFHS5 = writeStatsToDF(stats, isNFHS4)
            rows.append([state, d] + stats_NFHS5 + stats_NFHS4 + err_NFHS5 + err_NFHS4)
        i += 1
        bar.update(i)
bar.finish()

df = pd.DataFrame(rows, columns=cols)


 [elapsed time: 0:00:58] |**********************************| (ETA:  00:00:00) 

In [10]:
# Display the assembled table (pandas abbreviates the rendering of large frames).
df

Unnamed: 0,State,District,Q1_NFHS5,Q2_NFHS5,Q3_NFHS5,Q4_NFHS5,Q5_NFHS5,Q6_NFHS5,Q7_NFHS5,Q8_NFHS5,...,err95_NFHS4,err96_NFHS4,err97_NFHS4,err98_NFHS4,err99_NFHS4,err100_NFHS4,err101_NFHS4,err102_NFHS4,err103_NFHS4,err104_NFHS4
0,Andhra_Pradesh,Anantapur,59.5,24.3,1047,881,94.3,87.2,99.6,98.8,...,2,2,2,2,2,2,2,2,2,2
1,Andhra_Pradesh,Chittoor,65.6,22.6,1024,1019,94.7,74.3,99.7,98.5,...,2,2,2,2,2,2,2,2,2,2
2,Andhra_Pradesh,East Godavari,75.4,20.5,995,882,93.0,68.2,98.8,97.9,...,2,2,2,2,2,2,2,2,2,2
3,Andhra_Pradesh,Guntur,64.9,22.4,1055,941,92.3,82.5,99.2,99.3,...,2,2,2,2,2,2,2,2,2,2
4,Andhra_Pradesh,Krishna,74.0,20.4,1064,1139,96.4,86.3,99.6,94.4,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,Ladakh,Leh (Ladakh),66.3,21.0,967,949,98.7,77.7,99.8,87.1,...,2,2,2,2,2,2,2,2,2,2
701,Puducherry,Karaikal,86.3,21.9,1132,884,99.1,95.9,99.6,99.9,...,2,2,2,2,2,2,2,2,2,2
702,Puducherry,Mahe,99.2,16.8,1164,1202,99.4,96.8,99.7,98.0,...,2,2,2,2,2,2,2,2,2,2
703,Puducherry,Puducherry,83.8,20.1,1106,963,99.6,91.3,100.0,100.0,...,2,2,2,2,2,2,2,2,2,2


In [11]:
# Persist the parsed table; with pandas' defaults the row index is written as
# the first, unnamed CSV column.
df.to_csv('../DATA/NFHS5.csv')