<a target="_blank" href="https://colab.research.google.com/github/kavyajeetbora/ETL_wages/blob/master/ETL_wages_data.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
# @title Setup the environment and Upload files

# @markdown Run this only once otherwise restart the session
!pip install aspose-cells
import  jpype
jpype.startJVM()
from asposecells.api import Workbook
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import os
from google.colab import files

def convert_xl2xlsx(filename):
    '''
    Convert xls file to xlsx
    '''
    if os.path.isfile(filename):
        workbook = Workbook(filename)
        export_filename = f"{os.path.basename(filename)}.xlsx"

        if os.path.isfile(export_filename):
            os.remove(export_filename)

        workbook.save(export_filename)
        return export_filename
    else:
        return None

def preprocess_data(xlsx_file):
    '''
    Preprocessing the dataframe:
    - setting the row id for table header
    - Setting the year column as table index
    - Limiting the table to the last year, current 18th row number
    '''
    df = pd.read_excel(xlsx_file, header=[4,5], nrows=18)
    df = df.drop(index=[0,1],axis=0).reset_index(drop=True)
    df = df.set_index(df.columns[0])
    df.index.names = ['Year']
    # df.tail()
    return df

def unpivot_table(df):
    '''
    Unpivot the table

    '''
    df_out = {'year':[],'sector':[]}
    sectors = []
    years = []

    for cols in df.columns:
        i1,i2 = map(str.strip, cols)
        values = list(df.loc[:,cols].values)

        if i1 not in df_out.keys():
            df_out.setdefault(i1,values)
        else:
            df_out[i1].extend(values)

        i2 = i2.split("-")[0].strip()
        if i2 not in sectors:
            df_out['sector'].extend([i2]*len(values))
            df_out['year'].extend(df.index)
        sectors.append(i2)

    df_out = pd.DataFrame(df_out)
    return df_out

def post_process_df(df):
    df_out = df.copy()
    df_out = df_out.replace(r'^\s*$', float('nan'), regex=True)
    for col in df_out.columns[2:]:
        df_out[col] = df_out[col].astype(float)
    df_out['sector']=df_out['sector'].astype(int)
    return df_out

Collecting aspose-cells
  Downloading aspose-cells-24.3.0.tar.gz (14.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.3/14.3 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting JPype1>=1.2.1 (from aspose-cells)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: aspose-cells
  Building wheel for aspose-cells (setup.py) ... [?25l[?25hdone
  Created wheel for aspose-cells: filename=aspose_cells-24.3.0-py3-none-any.whl size=14226572 sha256=229434df03fda22149f091f0f8a12fed2ff685d933f978a94d432c9e8e7310f0
  Stored in directory: /root/.cache/pip/wheels/bf/10/47/f3d06fe998b7fc334dbd7561c7ba6204e8ae612a35d7850cb4
Successfully built aspose-cells
Installing collected packages: JPype1, aspose-cells
Succes

In [2]:
# @title 1. Upload files
## Clear all existing files
_ = [os.remove(x) for x in os.listdir() if os.path.isfile(x)]

# Prompt user to upload a folder
uploaded = files.upload()

Saving 200_224_wages_1970.xls to 200_224_wages_1970.xls
Saving 250_274_wages.xls to 250_274_wages.xls
Saving 275_299_wages.xls to 275_299_wages.xls
Saving 300_324_wages.xls to 300_324_wages.xls
Saving 325_349_wages.xls to 325_349_wages.xls
Saving 350_374_wages.xls to 350_374_wages.xls
Saving 375_rest_wages.xls to 375_rest_wages.xls


In [3]:
# @title 2. Process the data
## ETL

xl_files = glob("*.xls")

dfs = []
for xl_file in tqdm(xl_files, unit=' xl file'):
    try:
        xlsx_file = convert_xl2xlsx(xl_file)

        df = preprocess_data(xlsx_file)
        df_out = unpivot_table(df)
        df_out=post_process_df(df_out)
        dfs.append(df_out)
    except Exception as e:
        print(xl_file,e)

  0%|          | 0/7 [00:00<?, ? xl file/s]

200_224_wages_1970.xls header index 4 exceeds maximum index 0 of data.


In [4]:
# @title 3. Display Output
df_final = pd.concat(dfs)
df_final.sample(5)

Unnamed: 0,year,sector,Fixed Capital,Number of Workers,Wages and Salaries - Workers,Gross Value Added,Wages and Salaries - Total
278,1979 - 1980,741,2961.0,7644.0,233.0,1194.0,
182,1979 - 1980,266,58.0,503.0,24.0,111.0,
240,1973 - 1974,410,1508.0,2877.0,135.0,587.0,
202,1983 - 1984,288,276.0,1758.0,113.0,407.0,
302,1987 - 1988,273,823.0,2701.0,202.0,858.0,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [5]:
# @title 4. Export the file
## Clear all existing files once again

_ = [os.remove(x) for x in os.listdir() if os.path.isfile(x)]
df_final.to_excel("output.xlsx")
files.download('output.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

References

1. [Apose cells](https://products.aspose.com/cells/python-java/conversion/xls-to-xlsx/)
2. [Google forms](https://colab.research.google.com/notebooks/forms.ipynb)