In [1]:
import pandas as pd
import numpy as np
import janitor
import unidecode
import pickle
import os
import operator
import functools
import itertools
from typing import Callable

import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Load the raw workbooks, preferring the local pickle cache when it exists.
# Checking the file directly (instead of listing ./data/) also works when the
# data directory has not been created yet.
if os.path.isfile('./data/raw_databases.pkl'):
    # NOTE(review): pickle.load can execute arbitrary code; only load caches
    # that were produced by this project.
    with open('./data/raw_databases.pkl', 'rb') as file:
        databases = pickle.load(file)
    print("Loaded data from .pkl file")
else:
    # Imported lazily: the loader is only needed when there is no cache.
    from quickstart.loader import XlsxDriveLoader

    Loads = XlsxDriveLoader()  # Drive Folder is hardcoded in module, since this is not prone to change
    databases = Loads.content  # Process takes approximately ~3 minutes to run for the first time.
                               # Then the file will be stored in the data/

Loaded data from .pkl file


In [3]:
# Workbook names, in the order the loader stored them. Exactly four keys are
# expected; the last one is not used in this notebook.
db_name_contaminantes, db_name_meteoorologicas, db_name_anotaciones, _ = tuple(
    databases.keys()
)


def get_col_names(df: pd.DataFrame) -> list:
    """Return the column labels of ``df`` as a sorted list.

    Sorting makes the result independent of column order, so two frames with
    the same fields compare equal.
    """
    return sorted(df.columns)

def shared_structure(db_name: str) -> list:
    """Return the sheets of workbook ``db_name`` that contain the same fields.

    Sheets are read from ``databases[db_name]`` and grouped by their sorted
    column names. Grouping does not depend on sheet order, so sheets with the
    same layout are found even when an unrelated sheet sits between them.

    Parameters
    ----------
    db_name : str
        Key of the workbook in ``databases``.

    Returns
    -------
    list
        Sheet names sharing the single common layout, in workbook order.

    Raises
    ------
    ValueError
        If the workbook does not hold exactly one layout shared by two or
        more sheets.
    """
    sheets_by_layout = {}
    for sheet_name, sheet in databases[db_name].items():
        layout = tuple(get_col_names(sheet))  # hashable grouping key
        sheets_by_layout.setdefault(layout, []).append(sheet_name)

    shared = [names for names in sheets_by_layout.values() if len(names) > 1]

    # Expecting there is only one shared structure across the DataFrames
    if len(shared) != 1:
        raise ValueError(
            f"Expected exactly one shared sheet structure in {db_name!r}, "
            f"found {len(shared)}"
        )
    return shared[0]

In [4]:
# Stack every sheet that shares the common column layout into a single raw
# frame per workbook.
raw_contaminantes = pd.concat(
    [
        databases[db_name_contaminantes][sheet_name]
        for sheet_name in shared_structure(db_name_contaminantes)
    ]
)

raw_meteorologicas = pd.concat(
    [
        databases[db_name_meteoorologicas][sheet_name]
        for sheet_name in shared_structure(db_name_meteoorologicas)
    ]
)

In [5]:
# Dictionary for categorical values: maps each flag code listed in the
# 'LEEME' sheet to True when its normalised 'Hora' text equals 'valida'.

identifier = (
    databases[db_name_anotaciones]['LEEME']
    .loc[:, ['Flag', 'Hora']]
    .set_index('Flag')
    .dropna(axis='index')
    .squeeze()
    # Strip accents, padding and case so the comparison below is robust
    .apply(lambda string: unidecode.unidecode(string).strip().lower())
    .apply(lambda validity: validity == 'valida')
    .to_dict()
)
# Added after to_dict() because Series.append no longer exists in pandas >= 2.0
identifier['x'] = False  # Record does not appear on DataFrame


def f_identifier(dict_):
    """Return the validity stored for flag code ``dict_``.

    Codes that are not present in ``identifier`` are treated as valid (True).
    """
    return identifier.get(dict_, True)

# databases[db_name_anotaciones]['LEEME']


In [6]:
# Columns kept from the raw sheets.
fields = [
    'parametro',
    'Fecha',
    # San Pedro identifiers
    'SO2',
    'SO2 b',
]


def munge(df: pd.DataFrame, fields: list) -> pd.DataFrame:
    """Reshape a raw long-format sheet into one column per (value, factor).

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame holding at least the columns named in ``fields``.
    fields : list
        Columns to keep. Must include 'parametro' and 'Fecha'; every other
        entry is pivoted as a value column.

    Returns
    -------
    pd.DataFrame
        Frame indexed by 'date' with (value, factor) columns and cleaned
        column names. Numeric columns keep only strictly positive readings;
        all other columns are mapped through ``f_identifier``.
    """
    renamed = {'parametro': 'factor', 'Fecha': 'date'}
    # Pivot whatever value columns the caller asked for instead of a fixed pair
    value_columns = [name for name in fields if name not in renamed]

    def clean_field(field: pd.Series) -> pd.Series:
        """Mask non-positive numbers; translate flag codes to validity."""
        if pd.api.types.is_numeric_dtype(field):
            # NOTE(review): `> 0` also drops exact zeros, not only negatives
            # - confirm that zero readings are meant to be discarded.
            return field.where(field > 0, pd.NA)
        return field.map(f_identifier)

    return (
        df
        .loc[:, fields]
        .pipe(janitor.rename_columns, new_column_names=renamed)
        .pipe(janitor.process_text, column_name='factor', string_function='strip')
        .pivot(index='date', columns='factor', values=value_columns)
        .convert_dtypes()
        .apply(clean_field)
        .pipe(janitor.clean_names, strip_underscores='r')
    )


In [7]:
# Apply the same reshaping to both raw frames.
contaminantes = munge(df=raw_contaminantes, fields=fields)
meteorologicas = munge(df=raw_meteorologicas, fields=fields)

In [8]:
# Timestamps present in contaminantes but missing from meteorologicas
set(contaminantes.index) - set(meteorologicas.index)

{Timestamp('2019-05-15 20:00:00')}

In [9]:
# Outer join on the date index so timestamps found in only one frame are kept.
df_ = contaminantes.join(meteorologicas, how='outer')

In [10]:
# Preview the joined frame of readings and their validity flags.
df_

Unnamed: 0_level_0,so2,so2,so2,so2,so2,so2,so2,so2,so2_b,so2_b,...,so2,so2,so2,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b
factor,co,no,no2,nox,o3,pm10,pm2_5,so2,co,no,...,tout,wdr,wsr,prs,rainf,rh,sr,tout,wdr,wsr
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-01-01 00:00:00,1.1,17.0,20.1,37.0,6,163,,5.0,True,True,...,20.68,253,4.0,True,True,True,False,True,True,True
2017-01-01 01:00:00,1.05,19.5,25.1,44.5,5,117,,3.9,True,True,...,21.59,244,5.4,True,True,True,False,True,True,True
2017-01-01 02:00:00,0.91,16.7,22.5,39.0,5,46,,3.5,True,True,...,20.94,260,4.4,True,True,True,False,True,True,True
2017-01-01 03:00:00,0.75,12.5,16.8,29.2,8,36,,3.4,True,True,...,20.99,257,3.7,True,True,True,False,True,True,True
2017-01-01 04:00:00,0.59,6.5,6.7,13.1,12,25,,3.2,True,True,...,20.89,84,2.0,True,True,True,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-30 19:00:00,0.67,4.4,11.3,15.7,17,36,14.0,3.5,True,True,...,22.78,69,27.1,True,True,True,True,True,True,True
2021-06-30 20:00:00,0.66,3.9,10.7,14.6,16,41,7.0,3.6,True,True,...,23.57,74,97.5,True,True,True,True,True,True,True
2021-06-30 21:00:00,0.64,3.4,11.7,15.1,15,16,7.0,3.4,True,True,...,22.89,79,117.0,True,True,True,False,True,True,True
2021-06-30 22:00:00,0.61,3.4,9.7,13.1,16,31,6.0,3.1,True,True,...,22.72,89,117.7,True,True,True,False,True,True,True


In [11]:
# Summary statistics for every column, one row per column.
df_.describe(include="all").transpose()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Unnamed: 0_level_1,factor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
so2,co,29973.0,,,,1.679246,1.025919,0.01,0.8,1.5,2.39,5.79
so2,no,31723.0,,,,9.118173,8.916451,0.5,3.2,8.2,11.0,161.0
so2,no2,32607.0,,,,11.03015,7.8801,0.1,5.7,8.8,14.3,82.3
so2,nox,32626.0,,,,19.729188,13.361774,0.5,12.3,16.9,23.3,198.2
so2,o3,37936.0,,,,25.389788,19.092307,1.0,10.0,22.0,35.0,153.0
so2,pm10,36946.0,,,,60.021545,35.906347,2.0,37.0,52.0,75.0,714.0
so2,pm2_5,20458.0,,,,18.928187,13.211943,2.0,9.0,16.0,25.0,156.0
so2,so2,29466.0,,,,4.205708,2.24159,0.5,2.9,3.6,4.8,67.4
so2_b,co,39394.0,2.0,True,30073.0,,,,,,,
so2_b,no,39394.0,2.0,True,31704.0,,,,,,,


In [12]:
# Replace so2 records with missing values where the matching so2_b flag is False.
# The mask is applied to the whole block at once: both sub-frames carry the
# same factor columns, so `where` aligns them label by label and the column
# dtypes are preserved (no row-wise apply, no dtype repair step needed).

df = (
    df_['so2']
    .where(df_['so2_b'].astype('bool'))
    .convert_dtypes()
)

In [13]:
# Summary statistics of the masked readings, one row per factor.
df.describe().transpose()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
co,29930.0,1.680393,1.026013,0.01,0.8,1.5,2.39,5.79
no,31699.0,9.11936,8.919003,0.5,3.2,8.2,11.0,161.0
no2,32588.0,11.030517,7.881935,0.1,5.7,8.8,14.3,82.3
nox,32604.0,19.730637,13.365013,0.5,12.3,16.9,23.3,198.2
o3,37917.0,25.395284,19.093992,1.0,10.0,22.0,35.0,153.0
pm10,36919.0,60.000433,35.892953,2.0,37.0,52.0,75.0,714.0
pm2_5,20455.0,18.930229,13.211835,2.0,9.0,16.0,25.0,156.0
so2,29447.0,4.205593,2.241888,0.5,2.9,3.6,4.8,67.4
prs,38527.0,713.374127,1.658729,702.4,712.3,713.2,714.3,738.9
rainf,1199.0,0.210325,1.385718,0.01,0.01,0.02,0.06,17.99


In [14]:
# Persist the cleaned frame.
with open('data/dataframe.pkl', 'wb') as pkl_file:
    pickle.dump(df, pkl_file)