In [88]:
import pandas as pd
import calendar
import openpyxl
from openpyxl import load_workbook
import subprocess
import re
import os
import numpy as np
import collections

In [89]:
# Prepare variables
months = list(calendar.month_name[1:])
years = ['2017', '2018', '2019', '2020']
years_test = ['2018']
states = ["A & N Islands", "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar", "Chandigarh", "Chhattisgarh", "Dadra & Nagar Haveli", "Daman & Diu", "Delhi", "Goa", "Gujarat", "Haryana", "Himachal Pradesh", "Jammu & Kashmir", "Jharkhand", "Karnataka", "Kerala", "Lakshadweep", "Madhya Pradesh", "Maharashtra", "Manipur", "Meghalaya", "Mizoram", "Nagaland", "Odisha", "Puducherry", "Punjab", "Rajasthan", "Sikkim", "Tamil Nadu", "Telangana", "Tripura", "Uttar Pradesh", "Uttarakhand", "West Bengal"]
# states_2 = ["Chandigarh", "Chhattisgarh", "Dadra & Nagar Haveli", "Daman & Diu", "Delhi", "Goa", "Gujarat", "Haryana", "Himachal Pradesh", "Jammu & Kashmir", "Jharkhand", "Karnataka", "Kerala", "Lakshadweep", "Madhya Pradesh", "Maharashtra", "Manipur", "Meghalaya", "Mizoram", "Nagaland", "Odisha", "Puducherry", "Punjab", "Rajasthan", "Sikkim", "Tamil Nadu", "Telangana", "Tripura", "Uttar Pradesh", "Uttarakhand", "West Bengal"]
states_test = ['Maharashtra']

In [90]:
# Hold data frame columns
df_columns = ['year', 'month', 'state', 'district']
df_column_labels = collections.OrderedDict()
df_column_labels['year'] = 'Year'
df_column_labels['month'] = 'Month'
df_column_labels['state'] = 'State name as string'
df_column_labels['district'] = 'District name as string'


In [94]:
def process_hmis_xls_file(state, year, month):
    row_dicts = []
    # Read xls file
    wb = load_workbook('data/converted/{}_{}_{}.xlsx'.format(year, state, month))
    ws = wb['Sheet1']    
    # Get the cell containing district names
    district_columns = {}
    district_range_left_bound = 'E10' 
    district_range_right_bound = ''   
    for cell_range in ws.merged_cells.ranges:
        excel_left_coord, excel_right_coord = cell_range.coord.split(':')
        if excel_left_coord == 'E9':
            district_range_right_bound = excel_right_coord[:-1] + '10'
            break
    district_range = openpyxl.worksheet.cell_range.CellRange('{}:{}'.format(district_range_left_bound, district_range_right_bound))
    for cell_range in ws.merged_cells.ranges:
        if cell_range.issubset(district_range):
            excel_left_coord, excel_right_coord = cell_range.coord.split(':')
            district_name = ws[excel_left_coord].value
            # Skip totals (for now)
            if district_name[0] == '_':
                continue
            else:
                district_columns[district_name] = excel_left_coord[:-2]
    for district_name, district_column in district_columns.items():
        district_data = {            
            'year': year,
            'month': month,
            'state': state,
            'district': district_name
        }
        # Hardcode row range for now        
        for row in range(12,548):
            # Construct column title
            prim_label_cell = 'C{}'.format(row)
            prim_label = ws[prim_label_cell].value
            if prim_label is None:
                label_range = openpyxl.worksheet.cell_range.CellRange('{}:{}'.format(prim_label_cell, prim_label_cell))                
                # Check in merged cells
                for cell_range in ws.merged_cells.ranges:
                    if label_range.issubset(cell_range):
                        excel_left_coord, excel_right_coord = cell_range.coord.split(':')
                        prim_label = ws[excel_left_coord].value         
            sec_label = ws['D{}'.format(row)].value        
            if sec_label != 'TOTAL':
                # Format a bit
                sec_label_formatted = re.sub(r'\d+\.\s+', '', sec_label)
                prim_label = prim_label + ' ' + sec_label_formatted
            if prim_label not in df_column_labels:
                df_column_labels[prim_label] = prim_label
            cell_value = ws['{}{}'.format(district_column, row)].value
            if cell_value == '':
                cell_value = np.nan            
            district_data[prim_label] = cell_value
        row_dicts.append(district_data)    
    return row_dicts    
        
    

In [64]:
process_hmis_xls_file('Maharashtra', '2018', 'January')

services': 477,
  'Out of registered, Girls received counselling': 581,
  'Out of registered, Boys received counselling': 513,
  'Number of on-going DOTS patients registered': 158,
  'Number of DOTS cases completed successfully': 78,
  'Outpatient - Diabetes': 219,
  'Outpatient - Hypertension': 396,
  'Outpatient - Stroke (Paralysis)': 8,
  'Outpatient - Acute Heart Diseases': 39,
  'Outpatient - Mental illness': 10,
  'Outpatient - Epilepsy': 2,
  'Outpatient - Ophthalmic Related': 2265,
  'Outpatient - Dental': 657,
  'Outpatient - Oncology': 3,
  'Allopathic- Outpatient attendance': 174641,
  'Ayush - Outpatient attendance': 11589,
  'Inpatient (Male)- Children<18yrs': 2043,
  'Inpatient (Male)- Adults': 3589,
  'Inpatient (Female)- Children<18yrs': 1575,
  'Inpatient (Female)- Adults': 5177,
  'Number of Left Against Medical Advice (LAMA) cases': 149,
  'Inpatient - Malaria': 1,
  'Inpatient - Dengue': 0,
  'Inpatient - Typhoid': 169,
  'Inpatient - Asthma, Chronic Obstructive Pul

In [92]:
# Write a function to convert xls to xlsx
def convert_to_xlsx(state, year, month):
    # Assume that file doesn't exist
    convert_cmd = subprocess.run(["/Applications/LibreOffice.app/Contents/MacOS/soffice", "--headless", "--convert-to", "xlsx", "--outdir", "data/converted/", "data/{}_{}_{}.xls".format(year, state, month)])
    if convert_cmd.returncode != 0:
        print('Error converting file for {} {} {}'.format(state, year, month))
        return False
    return True
    

In [66]:
convert_to_xlsx("A & N Islands", 2017, "September")

True

In [93]:
for state in states:
    state_rows = []
    for year in years:
        for month in months:
            # Check if file exists for the month
            if os.path.exists('data/{}_{}_{}.xls'.format(year, state, month)):
                # Proceed to check if xlsx file exists
                if os.path.exists('data/converted/{}_{}_{}.xlsx'.format(year, state, month)):
                    month_rows = process_hmis_xls_file(state, year, month)
                    state_rows.extend(month_rows)
                else:
                    conv_success = convert_to_xlsx(state, year, month)
                    if conv_success:
                        month_rows = process_hmis_xls_file(state, year, month)
                        state_rows.extend(month_rows)
                    else:
                        print("Unable to convert XLS file for {} {} {}".format(state, year, month))
            else:
                print("Skipping {} {} {} as no file found...".format(year, state, month))        
    # Create dataframe for values for a state across all years        
    state_df = pd.DataFrame.from_dict(state_rows)
    # Coerce all columns to numeric where possible
    cols = list(df_column_labels.keys())[4:]
    state_df[cols] = state_df[cols].apply(pd.to_numeric, axis=1)    
    state_df.to_csv('output/{}.csv'.format(state), index=False)            
    state_df.to_stata('output/{}.dta'.format(state), write_index=False, variable_labels=df_column_labels, version=118)
    print("{} written to file".format(state))    



Skipping 2017 A & N Islands January as no file found...
Skipping 2017 A & N Islands February as no file found...
Skipping 2017 A & N Islands March as no file found...


TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'