<a href="https://colab.research.google.com/github/kavyajeetbora/ETL_wages/blob/master/ETL_wages_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Setup the environment and Upload files

# @markdown Run this cell only once; to run it again, restart the session first.
!pip install aspose-cells
import  jpype
jpype.startJVM()
from asposecells.api import Workbook
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import os
from google.colab import files

def convert_xl2xlsx(filename):
    """Convert a spreadsheet file to ``.xlsx`` via Aspose.Cells.

    The output name is the source's base name with ``.xlsx`` appended
    (``data.xls`` becomes ``data.xls.xlsx``), so it is resolved relative
    to the current working directory. A file already present under that
    name is deleted before saving.

    Args:
        filename: Path of the spreadsheet to convert.

    Returns:
        The name of the saved ``.xlsx`` file, or ``None`` when
        ``filename`` is not an existing regular file.
    """
    if not os.path.isfile(filename):
        return None

    source_book = Workbook(filename)
    target_name = f"{os.path.basename(filename)}.xlsx"

    # Clear out a leftover from an earlier conversion before saving.
    if os.path.isfile(target_name):
        os.remove(target_name)

    source_book.save(target_name)
    return target_name

def preprocess_data(xlsx_file):
    """Load one converted workbook into a year-indexed table.

    The sheet is read with rows 4 and 5 (0-based) as a two-level column
    header and at most 18 data rows. The first two data rows are then
    discarded and the first column becomes the index, named ``Year``.

    Args:
        xlsx_file: Path (or anything ``pd.read_excel`` accepts) of the
            workbook to read.

    Returns:
        The trimmed DataFrame indexed by ``Year``.
    """
    raw = pd.read_excel(xlsx_file, header=[4, 5], nrows=18)

    # Rows labelled 0 and 1 are dropped; renumber what remains from zero.
    trimmed = raw.drop(index=[0, 1]).reset_index(drop=True)

    first_column = trimmed.columns[0]
    table = trimmed.set_index(first_column)
    table.index.names = ['Year']
    return table

def unpivot_table(df):
    """Reshape a wide, two-level-header table into long format.

    Each column label of ``df`` must be a pair ``(top, sub)``. The
    stripped ``top`` label becomes an output column name. The part of
    ``sub`` before the first ``"-"`` (stripped) becomes the value of the
    ``sector`` column. ``df.index`` supplies the ``year`` column.

    The first time a sector is encountered, one ``year``/``sector`` row is
    emitted per index entry; later columns for the same sector only append
    their values to their own ``top`` column. The rows therefore line up
    only if every ``top`` group lists the same sectors in the same order.

    Args:
        df: DataFrame with two-level column labels. Labels may be strings
            or other scalars (they are converted with ``str``).

    Returns:
        A new DataFrame with columns ``year``, ``sector`` and one column
        per distinct ``top`` label, in first-seen order.
    """
    long_data = {'year': [], 'sector': []}
    seen_sectors = set()

    for label, column in df.items():
        # str() first: a header cell holding a bare number (e.g. a sector
        # code) arrives as int/float, which str.strip cannot handle.
        top, sub = (str(part).strip() for part in label)
        values = list(column.values)

        long_data.setdefault(top, []).extend(values)

        sector = sub.split("-")[0].strip()
        if sector not in seen_sectors:
            seen_sectors.add(sector)
            long_data['sector'].extend([sector] * len(values))
            long_data['year'].extend(df.index)

    return pd.DataFrame(long_data)

def post_process_df(df):
    """Return a cleaned, numerically typed copy of an unpivoted table.

    Cells that are empty or whitespace-only strings become NaN, every
    column after the first two is cast to float, and ``sector`` is cast
    to int. The input frame is left untouched.

    Args:
        df: DataFrame whose first two columns are followed by the value
            columns, and which has a ``sector`` column.

    Returns:
        The converted copy.
    """
    blank_cell = r'^\s*$'
    cleaned = df.copy().replace(blank_cell, float('nan'), regex=True)

    # Value columns are selected by position: everything past the first two.
    for value_col in cleaned.columns[2:]:
        cleaned[value_col] = cleaned[value_col].astype(float)

    cleaned['sector'] = cleaned['sector'].astype(int)
    return cleaned

In [None]:
# @title 1. Upload files
## Clear all existing files
# Removes every regular file in the current working directory
# (sub-directories are left alone). A plain loop is used because the
# deletions are side effects; no result list is needed.
for stale_file in os.listdir():
    if os.path.isfile(stale_file):
        os.remove(stale_file)

# Prompt the user to upload files
uploaded = files.upload()

In [None]:
# @title 2. Process the data
## ETL

# glob() yields matches in arbitrary (filesystem-dependent) order; sorting
# makes the processing order, and so the row order of the combined result,
# reproducible between runs.
xl_files = sorted(glob("*.xls"))

dfs = []
for xl_file in tqdm(xl_files, unit=' xl file'):
    try:
        xlsx_file = convert_xl2xlsx(xl_file)

        df = preprocess_data(xlsx_file)
        df_out = unpivot_table(df)
        df_out=post_process_df(df_out)
        dfs.append(df_out)
    except Exception as e:
        # Best-effort batch: report the file that failed and carry on with
        # the rest; a failed file contributes nothing to `dfs`.
        print(xl_file,e)

In [None]:
# @title 3. Display Output
# Stack the per-file tables from step 2 into a single frame.
df_final = pd.concat(dfs)
# Preview five randomly chosen rows; kept as the cell's last expression so
# the notebook renders it.
df_final.sample(5)

In [4]:
# @title 4. Export the file
## Clear all existing files once again

# Removes every regular file in the current working directory so that only
# the export written below remains. A plain loop is used because the
# deletions are side effects; no result list is needed.
for stale_file in os.listdir():
    if os.path.isfile(stale_file):
        os.remove(stale_file)
df_final.to_excel("output.xlsx")
files.download('output.xlsx')

References

1. [Aspose.Cells](https://products.aspose.com/cells/python-java/conversion/xls-to-xlsx/)
2. [Google forms](https://colab.research.google.com/notebooks/forms.ipynb)