In [1]:
import pandas as pd
import numpy as np
import janitor
import pickle
import os
import operator
import functools
import itertools
from typing import Callable

import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Load the raw databases, preferring the local pickle cache over the slow Drive download.
# The cache file is tested directly: the previous os.listdir('./data/') check raised
# FileNotFoundError whenever the data directory did not exist (e.g. on a fresh checkout).
if not os.path.isfile('./data/raw_databases.pkl'):
    from quickstart.loader import XlsxDriveLoader  # only needed on a cache miss

    Loads = XlsxDriveLoader()  # Drive Folder is hardcoded in module, since this is not prone to change
    databases = Loads.content  # Process takes approximately ~3 minutes to run for the first time.
                               # Then the file will be stored in the data/
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache this project wrote itself.
    with open('./data/raw_databases.pkl', 'rb') as file:
        databases = pickle.load(file)
    print("Loaded data from .pkl file")

Loaded data from .pkl file


In [3]:
# Bind each database name to its own variable. This relies on `databases` holding
# exactly four entries, in this order; anything else raises ValueError on unpacking.
[
    db_name_contaminantes,
    db_name_meteoorologicas,
    db_name_notaciones,
    db_name_estaciones,
] = databases.keys()


def get_col_names(df: pd.DataFrame) -> list:
    """Return the column labels of ``df`` as a sorted list.

    Sorting makes the result independent of the column order in the frame, so two
    frames with the same set of columns yield equal lists.

    Args:
        df: Frame whose column labels are read.

    Returns:
        A new ``list`` with the column labels in ascending order.
    """
    return sorted(df.columns)

def shared_structure(db_name: str, source=None) -> list:
    """Return the names of the sheets in a database that share the same fields.

    Sheets are grouped by their sorted column names, wherever they sit in the
    workbook. Exactly one group containing more than one sheet is expected.

    Args:
        db_name: Key of the database (a dict of sheet name -> DataFrame) to inspect.
        source: Optional mapping of database name -> dict of DataFrames. When
            omitted, the module-level ``databases`` mapping is used.

    Returns:
        The sheet names of the single shared structure, in sheet order.

    Raises:
        ValueError: If zero or several groups of sheets share a column layout.
    """
    sheets = (databases if source is None else source)[db_name]

    # Bucket sheet names by column signature. A dict is used instead of
    # itertools.groupby because groupby only merges *consecutive* equal keys and
    # would miss matching sheets that are separated by a differently shaped one.
    sheets_by_columns = {}
    for sheet_name, frame in sheets.items():
        signature = tuple(get_col_names(frame))  # tuple: lists are not hashable
        sheets_by_columns.setdefault(signature, []).append(sheet_name)

    shared = [names for names in sheets_by_columns.values() if len(names) > 1]

    # Fail loudly here rather than handing a placeholder to the caller.
    if len(shared) != 1:
        raise ValueError(
            f"Expected exactly one group of sheets sharing the same fields in "
            f"{db_name!r}, found {len(shared)}"
        )
    return shared[0]

In [4]:
# Stack every sheet that shares the common column layout into one raw frame per database.
raw_contaminantes = pd.concat(
    [
        databases[db_name_contaminantes][sheet_name]
        for sheet_name in shared_structure(db_name_contaminantes)
    ]
)

raw_meteorologicas = pd.concat(
    [
        databases[db_name_meteoorologicas][sheet_name]
        for sheet_name in shared_structure(db_name_meteoorologicas)
    ]
)

# db_name_estaciones is not used: it did not include information about our corresponding zone

In [5]:
# Columns kept from the raw sheets: 'parametro' and 'Fecha', plus the San Pedro identifiers.
fields = ['parametro', 'Fecha'] + ['SO2', 'SO2 b']

def munge(df: pd.core.frame.DataFrame, fields: list) -> pd.core.frame.DataFrame:
    """Reshape a raw long-format frame into one row per date.

    The frame is cut down to ``fields``, 'parametro' and 'Fecha' are renamed to
    'molecule' and 'date', the 'molecule' text is stripped, and the remaining
    fields are pivoted so each (field, molecule) pair becomes a column.

    Args:
        df: Raw frame containing at least the columns listed in ``fields``.
        fields: Columns to keep. Must include 'parametro' and 'Fecha'; every
            other entry is treated as a value column to pivot.

    Returns:
        A frame indexed by 'date' with (field, molecule) columns, inferred
        nullable dtypes, and column names cleaned by ``janitor.clean_names``.

    Raises:
        KeyError: If a name in ``fields`` is not a column of ``df``.
        ValueError: Raised by ``pivot`` when a (date, molecule) pair is duplicated.
    """
    key_columns = {'parametro': 'molecule', 'Fecha': 'date'}
    # Derive the value columns from `fields` instead of hard-coding one station's
    # identifiers, so the function works for whatever columns the caller selects.
    value_columns = [name for name in fields if name not in key_columns]
    return (
        df
        .loc[:, fields]
        .pipe(janitor.rename_columns, new_column_names=key_columns)
        .pipe(janitor.process_text, column_name='molecule', string_function='strip')
        .pivot(index='date', columns='molecule', values=value_columns)
        .convert_dtypes()
        # TODO: map the categorical codes through a lookup dictionary here
        .pipe(janitor.clean_names, strip_underscores='r')
    )

In [6]:
# Apply the same reshaping to both raw frames, keeping the same set of fields.
contaminantes = munge(df=raw_contaminantes, fields=fields)
meteorologicas = munge(df=raw_meteorologicas, fields=fields)

In [7]:
# Timestamps that appear in contaminantes but are missing from meteorologicas
set(contaminantes.index) - set(meteorologicas.index)

{Timestamp('2019-05-15 20:00:00')}

In [8]:
# Outer join on the date index, so a timestamp present in only one frame is still kept.
df = contaminantes.join(meteorologicas, how="outer")

In [9]:
# Display the joined frame; the rendered output abbreviates the middle rows and columns with "...".
df

Unnamed: 0_level_0,so2,so2,so2,so2,so2,so2,so2,so2,so2_b,so2_b,...,so2,so2,so2,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b,so2_b
molecule,co,no,no2,nox,o3,pm10,pm2_5,so2,co,no,...,tout,wdr,wsr,prs,rainf,rh,sr,tout,wdr,wsr
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-01-01 00:00:00,1.1,17.0,20.1,37.0,6,163,,5.0,,,...,20.68,253,4.0,,,,l,,,
2017-01-01 01:00:00,1.05,19.5,25.1,44.5,5,117,,3.9,,,...,21.59,244,5.4,,,,l,,,
2017-01-01 02:00:00,0.91,16.7,22.5,39.0,5,46,,3.5,,,...,20.94,260,4.4,,,,l,,,
2017-01-01 03:00:00,0.75,12.5,16.8,29.2,8,36,,3.4,,,...,20.99,257,3.7,,,,l,,,
2017-01-01 04:00:00,0.59,6.5,6.7,13.1,12,25,,3.2,,,...,20.89,84,2.0,,,,l,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-30 19:00:00,0.67,4.4,11.3,15.7,17,36,14.0,3.5,,,...,22.78,69,27.1,,,,,,,
2021-06-30 20:00:00,0.66,3.9,10.7,14.6,16,41,7.0,3.6,,,...,23.57,74,97.5,,,,,,,
2021-06-30 21:00:00,0.64,3.4,11.7,15.1,15,16,7.0,3.4,,,...,22.89,79,117.0,,,,l,,,
2021-06-30 22:00:00,0.61,3.4,9.7,13.1,16,31,6.0,3.1,,,...,22.72,89,117.7,,,,l,,,


In [10]:
# Summary statistics for every column, numeric or not, shown with one row per column.
df.describe(include="all").transpose()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Unnamed: 0_level_1,molecule,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
so2,co,29974.0,,,,1.345601,57.773094,-9999.0,0.8,1.5,2.39,5.79
so2,no,31724.0,,,,8.802698,56.892955,-9999.0,3.2,8.2,11.0,161.0
so2,no2,32615.0,,,,10.720868,55.985114,-9999.0,5.7,8.8,14.3,82.3
so2,nox,32627.0,,,,19.42212,57.052338,-9999.0,12.3,16.9,23.3,198.2
so2,o3,37937.0,,,,25.12555,54.893801,-9999.0,10.0,22.0,35.0,153.0
so2,pm10,36947.0,,,,59.74929,63.465377,-9999.0,37.0,52.0,75.0,714.0
so2,pm2_5,20461.0,,,,17.459354,122.015373,-9999.0,9.0,16.0,25.0,156.0
so2,so2,29468.0,,,,3.866082,58.315631,-9999.0,2.9,3.6,4.8,67.4
so2_b,co,9445.0,6.0,a,9278.0,,,,,,,
so2_b,no,7693.0,6.0,l,4476.0,,,,,,,


It looks like only the `so2_b` columns contain categorical variables.

Next:
- Replace $-9999.0$ values with `np.nan`
- Map categorical data with a dictionary
- One-hot encode
- Imputation, outlier analysis, and standardization/normalization
- Begin with statistical analysis

In [11]:
# Persist the joined frame so later notebooks can load it without redoing the steps above.
with open("data/dataframe.pkl", "wb") as file:
    pickle.dump(df, file)