In [1]:
import pandas as pd
import numpy as np
import janitor
import pickle
import os
import operator
import functools
import itertools
from typing import Callable

import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Local pickle cache of the raw databases.
raw_databases_path = './data/raw_databases.pkl'

# Check the cache file itself rather than listing ./data/: os.listdir raises
# FileNotFoundError when the directory does not exist yet (e.g. fresh clone).
if os.path.isfile(raw_databases_path):
    # CAUTION: pickle.load can execute arbitrary code. Only load a cache file
    # that was produced by this project.
    with open(raw_databases_path, 'rb') as file:
        databases = pickle.load(file)
else:
    # Imported lazily: the loader is only needed when no cache is available.
    from quickstart.loader import XlsxDriveLoader

    # Original notes: the Drive folder is hardcoded in the loader module, the
    # first load takes roughly 3 minutes, and the result is then stored under
    # data/ (presumably by the loader itself -- not visible here, TODO confirm).
    Loads = XlsxDriveLoader()
    databases = Loads.content

In [3]:
# Bind each of the four database names to its own variable.
# The assignment is positional, so it relies on the key order of `databases`
# and fails if the mapping does not hold exactly four entries.
(
    db_name_contaminantes,
    db_name_meteoorologicas,
    db_namea_notaciones,
    db_name_estaciones,
) = databases.keys()


def get_col_names(df: pd.core.frame.DataFrame) -> list:
    """Return the column labels of ``df`` as a sorted list."""
    return sorted(df.columns)


def shared_structure(db_name: str, source: dict = None) -> list:
    """Return the names of the sheets in a database that share the same fields.

    Sheets are grouped by their sorted column labels, wherever they appear in
    the mapping (they do not need to be adjacent). Exactly one group containing
    more than one sheet is expected.

    Parameters
    ----------
    db_name : str
        Key of the database (a mapping of sheet name -> DataFrame).
    source : dict, optional
        Mapping of database name -> sheets. Defaults to the notebook-level
        ``databases`` when omitted.

    Returns
    -------
    list
        Sheet names of the single shared structure, in their original order.

    Raises
    ------
    ValueError
        If there is not exactly one group of sheets with identical columns.
    """
    sheets = (databases if source is None else source)[db_name]

    # Group by a hashable key so that non-consecutive sheets with the same
    # columns still end up together (itertools.groupby only merges neighbours).
    sheets_by_columns = {}
    for sheet_name, sheet in sheets.items():
        key = tuple(get_col_names(sheet))
        sheets_by_columns.setdefault(key, []).append(sheet_name)

    shared = [names for names in sheets_by_columns.values() if len(names) > 1]

    if len(shared) != 1:
        # Fail loudly instead of returning a placeholder that breaks callers
        # later with an unrelated error.
        raise ValueError(
            f"Expected exactly one shared structure in {db_name!r}, "
            f"found {len(shared)}"
        )
    return shared[0]

In [4]:
def _stack_shared_sheets(db_name):
    """Concatenate the sheets of ``db_name`` selected by shared_structure()."""
    sheets = databases[db_name]
    return pd.concat([sheets[sheet] for sheet in shared_structure(db_name)])


# One stacked frame per database (raw, not yet pivoted).
contaminantes_ = _stack_shared_sheets(db_name_contaminantes)

meteorologicas_ = _stack_shared_sheets(db_name_meteoorologicas)

estaciones_ = _stack_shared_sheets(db_name_estaciones)

In [5]:
# Columns kept from the raw frames: the two pivot keys first, then the
# value columns (noted in the original as the San Pedro ids).
fields = [
    'parametro',
    'Fecha',
    'SO2',
    'SO2 b',
]

def munge(df: pd.core.frame.DataFrame, fields: list) -> pd.core.frame.DataFrame:
    """Pivot a long frame so each (value field, parametro) pair becomes a column.

    Parameters
    ----------
    df : DataFrame
        Frame containing at least the columns listed in ``fields``.
    fields : list
        Column labels to keep. Must include 'Fecha' (used as the index) and
        'parametro' (used as the column level); every other label is pivoted
        as a value column.

    Returns
    -------
    DataFrame
        Frame indexed by 'Fecha' whose column names were passed through
        ``janitor.clean_names`` with ``strip_underscores='r'``.

    Raises
    ------
    KeyError
        If a label in ``fields`` is not a column of ``df``.
    ValueError
        Raised by ``DataFrame.pivot`` when ('Fecha', 'parametro') pairs repeat.
    """
    # Derive the value columns from ``fields`` instead of hard-coding them, so
    # the selection and the pivot cannot drift apart.
    value_fields = [name for name in fields if name not in ('Fecha', 'parametro')]

    return (
        df
        .loc[:, fields]
        .pivot(index='Fecha', columns='parametro', values=value_fields)
        # NOTE(review): the original had an "Apply dictionary into df"
        # placeholder here; no such step is implemented.
        .pipe(janitor.clean_names, strip_underscores='r')
        # NOTE(review): the original had a "Fix dtypes" placeholder here;
        # dtypes are left as produced by the pivot.
    )

In [6]:
# Run the same select/pivot/clean pipeline over both concatenated frames.
contaminantes = munge(df=contaminantes_, fields=fields)
meteorologicas = munge(df=meteorologicas_, fields=fields)

In [7]:
# meteorologicas lacks a Timestamp that contaminantes has:
# list the index values found only in contaminantes.
set(contaminantes.index) - set(meteorologicas.index)

{Timestamp('2019-05-15 20:00:00')}

In [8]:
# Outer merge on the index so rows whose index value appears in only one
# of the two frames are kept.
df = pd.merge(
    contaminantes,
    meteorologicas,
    how='outer',
    left_index=True,
    right_index=True,
)

In [9]:
# Display the outer-merged contaminantes/meteorologicas frame.
df

Unnamed: 0_level_0,so2,so2,so2,so2,so2,so2,so2,so2,so2,so2,...,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b
parametro,co,no,no,no2,no2,nox,nox,o3,o3,pm10,...,prs,rainf,rh,rh,sr,tout,wdr,wdr,wsr,wsr
Fecha,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-01-01 00:00:00,1.1,,17.0,,20.1,,37.0,,6.0,163.0,...,,,,,l,,,,,
2017-01-01 01:00:00,1.05,,19.5,,25.1,,44.5,,5.0,117.0,...,,,,,l,,,,,
2017-01-01 02:00:00,0.91,,16.7,,22.5,,39.0,,5.0,46.0,...,,,,,l,,,,,
2017-01-01 03:00:00,0.75,,12.5,,16.8,,29.2,,8.0,36.0,...,,,,,l,,,,,
2017-01-01 04:00:00,0.59,,6.5,,6.7,,13.1,,12.0,25.0,...,,,,,l,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-30 19:00:00,0.67,4.4,,,11.3,,15.7,,17.0,36.0,...,,,,,,,,,,
2021-06-30 20:00:00,0.66,,3.9,,10.7,,14.6,,16.0,41.0,...,,,,,,,,,,
2021-06-30 21:00:00,0.64,,3.4,,11.7,,15.1,,15.0,16.0,...,,,,,l,,,,,
2021-06-30 22:00:00,0.61,,3.4,,9.7,,13.1,,16.0,31.0,...,,,,,l,,,,,


In [10]:
# Persist the merged frame to disk as a pickle.
with open('data/dataframe.pkl', 'wb') as handle:
    pickle.dump(df, handle)