<a href="https://colab.research.google.com/github/kavyajeetbora/ETL_wages/blob/master/Annual_Survey_of_Industries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
import pandas as pd
from zipfile import ZipFile
from glob import glob
from tqdm import tqdm
from lxml import etree

from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


## Functions

In [17]:
def read_xls_file(filename):
    with open(filename, 'r') as xl_file:
        lines = xl_file.readlines()
        raw_html = "".join(lines)
        table_html = "".join(lines[1:])

    parser = etree.HTMLParser()
    html_root = etree.fromstring(raw_html, parser)
    table_root = etree.fromstring(table_html, parser)
    html_string = etree.tostring(table_root, pretty_print=True, method="html")

    return html_root, html_string

def extract_text(elems):
    elem_text = []
    for elem in elems:
        if elem is not None:
            if elem.text is not None:
                elem_text.append(elem.text)
    return elem_text


def reformat_table(df):

    industry_elems = html_root.findall(r'.//tbody[4]/td')
    industries = extract_text(industry_elems)

    state_elems = html_root.findall(r'.//tbody[5]/td')
    state = extract_text(state_elems)[0]

    variable_elems = html_root.findall(r'.//tbody[6]/td')
    variables = extract_text(variable_elems)
    variables = pd.Series(variables).unique()

    ## Extract the time duration
    years = df['Year'].copy().reset_index(drop=True)
    year_vals = years.loc[years.index.repeat(len(industries))].reset_index(drop=True)

    ## Now prepare the dataframe
    input_vals = df.iloc[:,1:].copy()
    input_vals = input_vals.values.ravel(order='C').reshape(-1,len(variables))
    input_vals = pd.DataFrame(input_vals)
    input_vals.columns = variables

    ind_vals = industries*input_vals.shape[0]/len(industries)

    #assign year and ind
    input_vals['year'] = year_vals
    input_vals["ind"] = ind_vals
    input_vals['state'] = state

    return input_vals

## Reformat Tables

Upload the XL file and replace the `filepath ` value with the filename

In [3]:
zip_file = '/gdrive/MyDrive/10states_allsectors_1980_2020.zip'
with ZipFile(zip_file, 'r') as myzip:
    myzip.extractall()

In [4]:
xl_files = glob('*.xls')
print(len(xl_files))

160


In [10]:
dfx = pd.read_html(html_string)[0]
dfx

Unnamed: 0,Year,(Rs Lakh),(Rs Lakh).1,-,-.1,-.2,-.3,(Rs Lakh).2,(Rs Lakh).3,(Rs Lakh).4,...,-.94,-.95,(Rs Lakh).71,(Rs Lakh).72,(Rs Lakh).73,-.96,-.97,-.98,-.99,(Rs Lakh).74
0,1998 - 1999,4.0,-72.0,82.0,55.0,55.0,0.0,24.0,,,...,,,,,,,,,,
1,1999 - 2000,,,,,,,,0.0,-15.0,...,,,,,,,,,,
2,2000 - 2001,11.0,-2.0,140.0,99.0,88.0,11.0,42.0,,,...,,,,,,,,,,
3,2001 - 2002,6.0,30.0,158.0,117.0,99.0,18.0,47.0,,,...,,,,,,,,,,
4,2002 - 2003,23.0,13.0,145.0,109.0,97.0,12.0,49.0,,,...,,,,,,,,,,
5,2003 - 2004,42.0,110.0,177.0,137.0,126.0,11.0,76.0,,,...,,,,,,,,,,
6,2004 - 2005,4.0,-69.0,164.0,115.0,112.0,3.0,64.0,,,...,,,,,,,,,,
7,2005 - 2006,25.0,177.0,242.0,178.0,166.0,12.0,107.0,,,...,,,,,,,,,,
8,2006 - 2007,62.0,222.0,178.0,127.0,120.0,7.0,95.0,,,...,,,,,,,,,,
9,2007 - 2008,63.0,68.0,236.0,177.0,142.0,35.0,118.0,,,...,,,,,,,,,,


In [18]:
%%time

errors = []
clean_tables = []

for xl_file in tqdm(xl_files, unit='.xls'):

    try:
        ## Parse html content
        html_root, html_string = read_xls_file(xl_file)

        ## Convert data to table
        df = pd.read_html(html_string)[0]

        ## Reformat the table
        reformatted_table = reformat_table(df)
        clean_tables.append(reformatted_table)

    except Exception as e:
        errors.append({"Error": e, "file": xl_file})

100%|██████████| 160/160 [00:17<00:00,  8.97.xls/s]

CPU times: user 16.8 s, sys: 830 ms, total: 17.7 s
Wall time: 17.9 s





[]

In [19]:
df = pd.concat(clean_tables).reset_index(drop=True)
print(df.shape)
df.sample(5)

ValueError: No objects to concatenate

In [7]:
df.to_excel('Annual Survey of Industries.xlsx', index=False)