<a href="https://colab.research.google.com/github/kavyajeetbora/ETL_wages/blob/master/Annual_Survey_of_Industries%20-%20Nation%20Level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
from glob import glob
from io import BytesIO, StringIO
from zipfile import ZipFile

import pandas as pd
from lxml import etree
from tqdm import tqdm

## Functions

In [62]:
def read_xls_file(filename):
    """Parse an HTML-formatted ``.xls`` file twice and return both views.

    Parameters
    ----------
    filename : str
        Path of the file to read (opened in text mode).

    Returns
    -------
    tuple
        ``(html_root, html_string)`` where ``html_root`` is the lxml element
        parsed from the complete file content and ``html_string`` is the
        pretty-printed HTML serialisation (as produced by ``etree.tostring``)
        of the content with its first line removed.
    """
    with open(filename, 'r') as handle:
        all_lines = handle.readlines()

    html_parser = etree.HTMLParser()

    # Tree built from the whole file.
    html_root = etree.fromstring("".join(all_lines), html_parser)

    # Tree built from everything after the first line.
    # NOTE(review): presumably the first line is a preamble that must be
    # skipped before the table can be serialised - confirm against the data.
    table_root = etree.fromstring("".join(all_lines[1:]), html_parser)

    return html_root, etree.tostring(table_root, pretty_print=True, method="html")

def extract_text(elems):
    """Collect the ``.text`` of every element that has one.

    Entries that are ``None`` and elements whose ``.text`` is ``None`` are
    skipped; the remaining texts are returned as a list in input order.
    """
    return [
        node.text
        for node in elems
        if node is not None and node.text is not None
    ]


def reformat_table(df, html_root):
    """Reshape one wide table into long form: one row per (year, industry).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'Year'`` column; every column after the first is
        treated as a data value. The value columns are assumed to be laid
        out industry by industry, with one column per variable inside each
        industry - TODO confirm against the source files.
    html_root : element
        Parsed document whose ``.//tbody[6]/td`` cells hold the industry
        labels and whose ``.//tbody[5]/td`` cells hold the variable names.

    Returns
    -------
    pandas.DataFrame
        One column per variable plus ``'year'`` and ``'ind'``.

    Raises
    ------
    ValueError
        If the values cannot be reshaped to the number of variables, or if
        the number of resulting rows does not equal years x industries.
    """
    # Industry labels, de-duplicated in order of first appearance. The values
    # below are reshaped in column order, so the labels must follow the same
    # order; sorting them could attach a label to another industry's numbers.
    industry_elems = html_root.findall(r'.//tbody[6]/td')
    industries = pd.Series(extract_text(industry_elems)).unique().tolist()

    # Variable names, de-duplicated in order of first appearance.
    variable_elems = html_root.findall(r'.//tbody[5]/td')
    variables = pd.Series(extract_text(variable_elems)).unique()

    ## Extract the time duration: each year is repeated once per industry
    years = df['Year'].copy().reset_index(drop=True)
    year_vals = years.loc[years.index.repeat(len(industries))].reset_index(drop=True)

    ## Now prepare the dataframe: flatten row by row, then cut the stream
    ## into chunks of len(variables) so each chunk becomes one output row
    input_vals = df.iloc[:, 1:].copy()
    input_vals = input_vals.values.ravel(order='C').reshape(-1, len(variables))
    input_vals = pd.DataFrame(input_vals)
    input_vals.columns = variables

    # Fail with a clear message instead of a generic length-mismatch error
    # (or silently NaN-filled years) when the header cells and data disagree.
    expected_rows = len(years) * len(industries)
    if len(input_vals) != expected_rows:
        raise ValueError(
            "Reshaped table has {} rows but {} years x {} industries = {} "
            "were expected".format(
                len(input_vals), len(years), len(industries), expected_rows
            )
        )

    # assign year and ind; the industry list cycles once per year
    input_vals['year'] = year_vals
    input_vals["ind"] = industries * len(years)

    return input_vals

## Upload Zip File

Upload the zip file that contains all the `.xls` files:

In [3]:
%%time

# `files` is the upload helper from the google.colab package, so this cell
# only runs inside Google Colab.
from google.colab import files
# Prompt for an upload. The result is used below as a mapping whose keys are
# the uploaded file names.
uploaded = files.upload()

Saving 1970_nic_aggregate.zip to 1970_nic_aggregate.zip
CPU times: user 354 ms, sys: 34.5 ms, total: 389 ms
Wall time: 28.4 s


Extract all files within the zip file:

In [63]:
# Only the first uploaded file is considered; it is unpacked into the
# current working directory when its name ends with ".zip".
uploaded_names = list(uploaded.keys())
if uploaded_names:
    zip_file = uploaded_names[0]
    if not zip_file.endswith(".zip"):
        print("Not a zip file")
    else:
        ## Extract all contents
        with ZipFile(zip_file, 'r') as archive:
            archive.extractall()

## Scan files

In [64]:
## Keywords identifying each group of workbooks: a file belongs to a group
## when the keyword occurs anywhere in its file name.
file_type = ['depreciation', 'factories', 'fuels', 'investedK', 'mandays', 'nva_totalstock']
## Template for other data sets: file_type = ['group_1', ...]

## Scan all xls files in the current directory.
## glob() returns matches in arbitrary order, so sort them to keep the
## processing order (and therefore the output row order) reproducible.
xl_files = sorted(glob('*.xls'))

## Map every keyword to the list of matching file names.
## The list is stored directly so that the name `files` (the google.colab
## upload helper imported earlier) is not overwritten by this cell.
sheets = {}

for ftype in file_type:
    sheets[ftype] = [f for f in xl_files if ftype in f]

## Extract and reformat tables

In [65]:
def extract_tables(files):
    """Read, reformat and stack the tables of several ``.xls`` files.

    Parameters
    ----------
    files : list of str
        Paths of the files to process.

    Returns
    -------
    pandas.DataFrame
        The reformatted tables of all files concatenated in input order
        (the per-file row index is kept as is).

    Raises
    ------
    ValueError
        If ``files`` is empty.
    RuntimeError
        If any file cannot be read, parsed or reformatted. The message names
        the offending file and the original exception is chained, so the
        real cause is not hidden behind a failure in ``pd.concat``.
    """
    if not files:
        raise ValueError("No .xls files were supplied to extract_tables")

    clean_tables = []

    for xl_file in tqdm(files, unit='.xls'):
        try:
            ## Parse html content
            html_root, html_string = read_xls_file(xl_file)

            ## Convert data to table. The markup is wrapped in a file-like
            ## object because passing literal HTML to read_html is deprecated.
            if isinstance(html_string, bytes):
                html_buffer = BytesIO(html_string)
            else:
                html_buffer = StringIO(html_string)
            df = pd.read_html(html_buffer)[0]

            ## Reformat the table
            clean_tables.append(reformat_table(df, html_root))

        except Exception as e:
            # Stop at the first bad file: silently skipping it would leave
            # unexplained gaps in the combined table.
            raise RuntimeError(
                "Failed to extract table from {!r}: {}".format(xl_file, e)
            ) from e

    return pd.concat(clean_tables)

In [66]:
%%time
sheet_tables = {}
for k,v in sheets.items():
    table = extract_tables(v)
    sheet_tables[k] = table

100%|██████████| 8/8 [00:00<00:00, 19.13.xls/s]
100%|██████████| 8/8 [00:00<00:00, 19.58.xls/s]
100%|██████████| 8/8 [00:00<00:00, 18.33.xls/s]
100%|██████████| 8/8 [00:00<00:00, 17.77.xls/s]
100%|██████████| 8/8 [00:00<00:00, 17.65.xls/s]
100%|██████████| 8/8 [00:00<00:00, 17.90.xls/s]

CPU times: user 2.68 s, sys: 7.05 ms, total: 2.69 s
Wall time: 2.69 s





In [69]:
# Inspect the combined table built from the files matching 'depreciation'.
sheet_tables['depreciation']

Unnamed: 0,Depreciation,Products and By-products,Value of Gross Output,Net Income,Profits,year,ind
0,156.0,9240.0,9440.0,,,1973 - 1974,376 - Manufacture of ...
1,27.0,343.0,972.0,,,1973 - 1974,377 - Manufacture of ...
2,1.0,20.0,22.0,,,1973 - 1974,378 - Manufacture of ...
3,32.0,1120.0,1193.0,,,1973 - 1974,379 - Manufacture of ...
4,217.0,4577.0,4763.0,,,1973 - 1974,380 - Manufacture of ...
...,...,...,...,...,...,...,...
379,32.0,154.0,469.0,73.0,0.0,1988 - 1989,"246 - Printing, dyeing ..."
380,20267.0,372096.0,391434.0,34096.0,-5474.0,1988 - 1989,"247 - Spinning, weaving ..."
381,3403.0,24899.0,66524.0,6047.0,-435.0,1988 - 1989,"248 - Printing, dyeing ..."
382,603.0,14456.0,15354.0,2095.0,1690.0,1988 - 1989,249 - Silk and synthetic ...


## Refactor final tables

In [73]:
# Start from the first group's table and left-join every other group onto it
# using the shared ('year', 'ind') key columns.
group_tables = list(sheet_tables.values())
df_final = group_tables[0]
for right_table in group_tables[1:]:
    df_final = pd.merge(
        left=df_final,
        right=right_table,
        on=['year', 'ind'],
        how='left'
    )

Unnamed: 0,Depreciation,Products and By-products,Value of Gross Output,Net Income,Profits,year,ind,Number of Factories,Fixed Capital,Working Capital,...,Number of Mandays - Employees,Total Persons Engaged,Wages and Salaries - Workers,PF and Other Benefits,Total Emoluments,Net Value Added,Gross Value Added,Net Fixed Capital Formation,Gross Fixed Capital Formation,Additions to Total Stock
0,156.0,9240.0,9440.0,,,1973 - 1974,376 - Manufacture of ...,464.0,1446.0,1280.0,...,,,654.0,,981.0,1608.0,1764.0,,,
1,27.0,343.0,972.0,,,1973 - 1974,377 - Manufacture of ...,4.0,341.0,919.0,...,,,133.0,,366.0,784.0,811.0,,,
2,1.0,20.0,22.0,,,1973 - 1974,378 - Manufacture of ...,14.0,6.0,5.0,...,,,2.0,,3.0,4.0,5.0,,,
3,32.0,1120.0,1193.0,,,1973 - 1974,379 - Manufacture of ...,137.0,308.0,229.0,...,,,110.0,,180.0,309.0,341.0,,,
4,217.0,4577.0,4763.0,,,1973 - 1974,380 - Manufacture of ...,334.0,2420.0,1695.0,...,,,614.0,,1064.0,1459.0,1676.0,,,


In [74]:
# Move the key columns to the front; all other columns keep their order.
key_columns = ['year', 'ind']
variable_columns = [col for col in df_final.columns if col not in key_columns]
index_columns = key_columns + variable_columns
df_final = df_final[index_columns]

In [76]:
# Spot-check five randomly chosen rows of the merged table.
df_final.sample(5)

Unnamed: 0,year,ind,Depreciation,Products and By-products,Value of Gross Output,Net Income,Profits,Number of Factories,Fixed Capital,Working Capital,...,Number of Mandays - Employees,Total Persons Engaged,Wages and Salaries - Workers,PF and Other Benefits,Total Emoluments,Net Value Added,Gross Value Added,Net Fixed Capital Formation,Gross Fixed Capital Formation,Additions to Total Stock
3134,1982 - 1983,242 - Wool spinning and ...,12.0,683.0,787.0,86.0,,31.0,172.0,476.0,...,597.0,2153.0,57.0,5.0,82.0,112.0,124.0,17.0,29.0,20.0
2960,1977 - 1978,250 - Jute and mesta ...,1.0,407.0,467.0,32.0,,71.0,20.0,47.0,...,200.0,,8.0,,18.0,37.0,38.0,,,
2997,1978 - 1979,250 - Jute and mesta ...,3.0,751.0,891.0,4.0,,79.0,413.0,260.0,...,300.0,,24.0,,44.0,41.0,44.0,,,
76,1976 - 1977,383 - Manufacture of ...,10.0,247.0,534.0,198.0,,618.0,123.0,129.0,...,1000.0,,132.0,,152.0,222.0,232.0,,,
2801,1973 - 1974,250 - Jute and mesta ...,1.0,381.0,397.0,,,293.0,20.0,39.0,...,,,11.0,,18.0,26.0,28.0,,,


## Export the table

In [None]:
# Write the merged table to an Excel workbook in the working directory,
# leaving out the DataFrame index.
df_final.to_excel('01_Annual Survey of Industries-National_Level.xlsx', index=False)