<a href="https://colab.research.google.com/github/kavyajeetbora/ETL_wages/blob/master/Annual_Survey_of_Industries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [48]:
from glob import glob
from io import BytesIO
from zipfile import ZipFile

import pandas as pd
from lxml import etree
from tqdm import tqdm

## Functions

In [92]:
def read_xls_file(filename):
    """Read a text file containing HTML markup and parse it twice with lxml.

    The file is opened in text mode and parsed with ``etree.HTMLParser``,
    i.e. it is treated as HTML rather than as a binary Excel workbook.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(html_root, html_string)`` where ``html_root`` is the element tree
        parsed from the complete file content, and ``html_string`` is the
        pretty-printed HTML serialisation (as returned by ``etree.tostring``)
        of the tree parsed from the content *without its first line*.
    """
    with open(filename, 'r') as handle:
        content_lines = handle.readlines()

    full_markup = "".join(content_lines)
    # The first line is left out of the markup that is later turned into a table.
    # NOTE(review): presumably a header line that confuses table parsing -- confirm.
    markup_without_first_line = "".join(content_lines[1:])

    html_parser = etree.HTMLParser()
    html_root = etree.fromstring(full_markup, html_parser)
    table_tree = etree.fromstring(markup_without_first_line, html_parser)

    html_string = etree.tostring(table_tree, pretty_print=True, method="html")
    return html_root, html_string

def extract_text(elems):
    """Return the ``.text`` of each element in ``elems``, in order.

    Entries that are ``None`` and elements whose ``.text`` is ``None`` are
    skipped; empty strings are kept.
    """
    return [
        node.text
        for node in elems
        if node is not None and node.text is not None
    ]


def reformat_table(df, html_root):
    """Reshape a wide table into one row per (year, industry) combination.

    Labels are taken from the text of the ``td`` cells that ``html_root``
    returns for ``.//tbody[4]/td`` (industries), ``.//tbody[5]/td`` (state,
    first cell only) and ``.//tbody[6]/td`` (variables).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'Year'`` column; every column after the first one is
        treated as a data value.
    html_root : element tree
        Parsed document exposing ``findall``.

    Returns
    -------
    pandas.DataFrame
        One column per distinct variable (in order of first appearance), plus
        ``'year'``, ``'ind'`` and ``'state'`` columns.
    """
    def cell_texts(path):
        # Text of every cell matched by `path`, ignoring cells without text
        return extract_text(html_root.findall(path))

    # Distinct industry labels in sorted order
    industries = sorted(set(cell_texts(r'.//tbody[4]/td')))
    # The state is the first labelled cell (IndexError if there is none)
    state = cell_texts(r'.//tbody[5]/td')[0]
    # Distinct variable labels, keeping the order of first appearance
    variables = pd.Series(cell_texts(r'.//tbody[6]/td')).unique()

    ## Extract the time duration: every year label is repeated once per industry
    years = df['Year'].copy().reset_index(drop=True)
    repeated_positions = years.index.repeat(len(industries))
    year_vals = years.loc[repeated_positions].reset_index(drop=True)

    ## Now prepare the dataframe: flatten the data columns row by row and cut
    ## the flat sequence into rows of len(variables) values each.
    # NOTE(review): assumes the data columns are grouped by (sorted) industry
    # with the variables contiguous inside each group -- confirm against the files.
    flat_values = df.iloc[:, 1:].copy().values.ravel(order='C')
    input_vals = pd.DataFrame(flat_values.reshape(-1, len(variables)))
    input_vals.columns = variables

    # assign year, industry (the industry list tiled once per year) and state
    input_vals['year'] = year_vals
    input_vals["ind"] = industries * len(years)
    input_vals['state'] = state

    return input_vals

## Upload Zip File

Upload a zip file containing all the `.xls` files:

In [85]:
%%time
# Optional clean-up, left disabled: delete all files in the content folder
# !rm -r /content/*

# Upload through google.colab; the result is stored in `uploaded`, and the
# next cell uses its first key as the name of the zip archive to extract.
from google.colab import files
uploaded = files.upload()

Saving 10states_allsectors_1980_2020.zip to 10states_allsectors_1980_2020.zip
CPU times: user 427 ms, sys: 47.5 ms, total: 475 ms
Wall time: 29.3 s


Extract all files within the zip file:

In [86]:
## Only the first uploaded file is considered; nothing happens when no file was uploaded
uploaded_names = list(uploaded.keys())
if uploaded_names:
    zip_file = uploaded_names[0]
    if not zip_file.endswith(".zip"):
        print("Not a zip file")
    else:
        ## Extract all contents into the current working directory
        with ZipFile(zip_file, 'r') as archive:
            archive.extractall()

## Scan files

In [93]:
## Keywords used to group the files: a file belongs to a group when the
## keyword occurs anywhere in its name.
file_type = ['factories', 'gross', 'mandays', 'rent']
## Template for other groupings: file_type = ['group_1', ...]

## Scan all xls files in the current directory. The names are sorted because
## glob() returns them in arbitrary, filesystem-dependent order, and that order
## would otherwise decide the row order of the combined tables.
xl_files = sorted(glob('*.xls'))

## Map each keyword to the list of matching file names. A comprehension is used
## so that no loop variable is left behind -- in particular the name `files`,
## which would otherwise rebind the `google.colab.files` module imported above.
sheets = {
    ftype: [xl_file for xl_file in xl_files if ftype in xl_file]
    for ftype in file_type
}

## Extract and reformat tables

In [94]:
def extract_tables(files):
    """Parse each file in ``files`` and stack the reformatted tables.

    Every file is read with ``read_xls_file``, its table markup is converted
    to a DataFrame with ``pandas.read_html`` (first table only) and reshaped
    with ``reformat_table``.

    Parameters
    ----------
    files : list of str
        Paths of the files to process.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file tables. Each file keeps its own
        row index, so index values repeat across files.

    Raises
    ------
    ValueError
        If ``files`` is empty (there is nothing to concatenate).
    RuntimeError
        If a file cannot be parsed or reformatted. The message names the file
        and the original exception is chained as ``__cause__``.
    """
    if not files:
        raise ValueError("No files to extract tables from")

    clean_tables = []

    for xl_file in tqdm(files, unit='.xls'):
        try:
            ## Parse html content
            html_root, html_string = read_xls_file(xl_file)

            ## Convert data to table. The serialised markup is wrapped in a
            ## file-like object because passing literal HTML to read_html is
            ## deprecated in recent pandas versions.
            df = pd.read_html(BytesIO(html_string))[0]

            ## Reformat the table
            clean_tables.append(reformat_table(df, html_root))

        except Exception as e:
            # Stop at the first failure and report which file caused it,
            # instead of failing later inside pd.concat with an unrelated error.
            raise RuntimeError(
                f"Failed to extract table from {xl_file!r}: {e}"
            ) from e

    return pd.concat(clean_tables)

In [95]:
%%time
# Build one combined table per keyword: each key of `sheets` is mapped to the
# DataFrame that `extract_tables` returns for that key's list of files.
# Note: the loop variables `k`, `v` (a list of file names) and `table` remain
# defined in the notebook namespace after this cell has run.
sheet_tables = {}
for k,v in sheets.items():
    table = extract_tables(v)
    sheet_tables[k] = table

100%|██████████| 40/40 [00:05<00:00,  6.88.xls/s]
100%|██████████| 40/40 [00:05<00:00,  7.17.xls/s]
100%|██████████| 40/40 [00:05<00:00,  7.62.xls/s]
100%|██████████| 40/40 [00:05<00:00,  7.60.xls/s]


CPU times: user 21.4 s, sys: 62.7 ms, total: 21.5 s
Wall time: 22 s


## Merge the tables into the final dataset

In [120]:
## Preview the combined table of the last file type. It is looked up explicitly:
## a bare `v` depends on which loop ran last, and after the cell that builds
## `sheet_tables` it holds a list of file names rather than a table.
sheet_tables[file_type[-1]]

Unnamed: 0,Rent Paid,Interest Paid,Depreciation,Net Income,Net Value Added,Gross Value Added,Net Fixed Capital Formation,year,ind,state
0,,,,,,,,1998 - 1999,383 - materials recovery,Chhattisgarh
1,,,,,,,,1998 - 1999,"581 - publishing of books, periodicals and oth...",Chhattisgarh
2,1162.0,5647.0,3936.0,134.0,6942.0,10878.0,3635.0,1998 - 1999,Others,Chhattisgarh
3,7624.0,85006.0,53656.0,226834.0,319464.0,373120.0,19704.0,1998 - 1999,Total,Chhattisgarh
4,,,,,,,,1999 - 2000,383 - materials recovery,Chhattisgarh
...,...,...,...,...,...,...,...,...,...,...
1020,49.0,5534.0,6034.0,24543.0,30126.0,36160.0,3009.0,2021 - 2022,"162 - manufacture of products of wood, cork, s...",Kerala
1021,35.0,912.0,1653.0,40903.0,41849.0,43502.0,-173.0,2021 - 2022,170 - manufacture of paper and paper products,Kerala
1022,2.0,3089.0,13718.0,53122.0,56213.0,69932.0,-9912.0,2021 - 2022,181 - printing and service activities related ...,Kerala
1023,,,,,,,,2021 - 2022,182 - reproduction of recorded media,Kerala


In [125]:
## Columns that identify a row in every sheet table
merge_keys = ['year', 'ind', 'state']

## Start from the first sheet table and left-join every other one onto it,
## so only (year, ind, state) combinations present in the first table are kept.
first_key = list(sheet_tables.keys())[0]
df_final = sheet_tables[first_key]
for k,v in sheet_tables.items():
    if k == first_key:
        continue
    df_final = pd.merge(
        left=df_final,
        right=v,
        on=merge_keys,
        how='left'
    )

## 'Rent Paid' is present in two sheet tables, so the merge produces
## 'Rent Paid_x' and 'Rent Paid_y'. Coalesce them (take _x, fall back to _y)
## instead of adding them: the sum doubles the value when both sheets report it
## and becomes NaN when only one of them does.
## NOTE(review): assumes both columns carry the same variable -- confirm.
df_final['Rent Paid'] = df_final['Rent Paid_x'].combine_first(df_final['Rent Paid_y'])
df_final = df_final.drop(['Rent Paid_x', 'Rent Paid_y'], axis=1)

## Remove rows that are identical in every column
df_final = df_final.drop_duplicates()
print(df_final.shape)

(29704, 29)


In [126]:
## Put the identifying columns first, followed by all remaining columns in
## their current order.
index_columns = ['year', 'state', 'ind']
variable_columns = [col for col in df_final.columns if col not in index_columns]
index_columns = index_columns + variable_columns
df_final = df_final[index_columns]

## Check for duplicates

In [113]:
# Number of rows that repeat an earlier row in every column
# (0 is expected because drop_duplicates() was applied when the table was built).
df_final.duplicated().sum()

0

In [107]:
# df_final[['year','ind', 'state']].value_counts()

In [108]:
# df_final.loc[
#     (df_final['ind']=='142 - manufacture of articles of fur') &
#     (df_final['year']=='2003 - 2004') &
#     (df_final['state'] == 'Kerala')
# ]

## Export the table

In [114]:
## Write the final table to an Excel workbook, leaving out the row index
output_file = '01_Annual Survey of Industries-State Level.xlsx'
df_final.to_excel(output_file, index=False)