In [None]:
import os
import glob
import pickle
import pandas as pd
from tqdm.notebook import tqdm

# Folder that receives the pickled output, and folder scanned for workbooks.
data_dir = "data"
xls_dir = "xls"

# Sheet-name prefixes (compared against the lower-cased sheet name) that
# decide which sheets are loaded and which dictionary they end up in.
raw_prefix = "test"
processed_prefix = "ind"

# Make sure the output folder exists before anything is written into it.
os.makedirs(data_dir, exist_ok=True)

# Every ".xls" file sitting directly inside xls_dir.
xls_pattern = os.path.join(xls_dir, "*.xls")
xls_paths = glob.glob(xls_pattern)

# Containers for the loaded sheets, one per sheet family.
raw = {}
processed = {}

# Walk every workbook and sort its sheets into `raw` (names starting with
# raw_prefix) and `processed` (names starting with processed_prefix).
# Each sheet is stored under a zero-padded sample id computed from the
# workbook number and the number at the end of the sheet name.
for file_path in tqdm(xls_paths, desc="Files"):
    # The workbook number is the run of digits that ends the file name
    # (extension and directories stripped), so multi-digit numbers such as
    # "...10.xls" and dots elsewhere in the path are handled correctly.
    file_stem = os.path.splitext(os.path.basename(file_path))[0]
    trailing_digits = file_stem[len(file_stem.rstrip("0123456789")):]
    if not trailing_digits:
        raise ValueError(
            f"Cannot derive a workbook number from file name {file_path!r}"
        )
    xls_file_id = int(trailing_digits)

    # Open the workbook once, reuse the handle for every sheet, and close it
    # when done instead of re-opening the file for each read.
    with pd.ExcelFile(file_path) as xls_file:
        sheet_names = [
            sheet_name
            for sheet_name in xls_file.sheet_names
            if sheet_name.lower().startswith((raw_prefix, processed_prefix))
        ]
        for sheet_name in tqdm(sheet_names, desc="Sheets", leave=False):
            try:
                # The last three characters of the sheet name must be the
                # sheet number; otherwise int() raises and the sheet is skipped.
                sheet_id = int(sheet_name[-3:])
                # Offset by 50 per preceding workbook.
                # NOTE(review): presumably each workbook holds 50 samples — confirm.
                sample_id = f'{(sheet_id + 50 * (xls_file_id - 1)):03d}'
                if sheet_name.lower().startswith(raw_prefix):
                    # First spreadsheet row is the header; the row right after
                    # it is skipped while reading, and the first remaining
                    # row is dropped as well.
                    sample_df = pd.read_excel(
                        xls_file, sheet_name=sheet_name, header=0, skiprows=[1]
                    )
                    raw[sample_id] = sample_df.iloc[1:].reset_index(drop=True)
                elif sheet_name.lower().startswith(processed_prefix):
                    sample_df = pd.read_excel(
                        xls_file, sheet_name=sheet_name, header=0
                    )
                    # The first data row is dropped.
                    # NOTE(review): it apparently carries per-sample info in
                    # its last 11 columns (an earlier, disabled line read
                    # them) — confirm before relying on this.
                    processed[sample_id] = sample_df.iloc[1:].reset_index(drop=True)
            except ValueError:
                print(f"Skipping sheet {sheet_name} in file {file_path}")


# Persist both dictionaries as pickle files inside data_dir, processed
# sheets first and raw sheets second.
processed_path = f'{data_dir}/processed.pkl'
with open(processed_path, 'wb') as processed_out:
    pickle.dump(processed, processed_out)

raw_path = f'{data_dir}/raw.pkl'
with open(raw_path, 'wb') as raw_out:
    pickle.dump(raw, raw_out)