In [1]:
import pandas as pd
import numpy as np
import janitor
import unidecode
import pickle
import os
import operator
import functools
import itertools
from typing import Callable


In [2]:
# Load the workbook collection: reuse the local pickle when it is present,
# otherwise fetch the spreadsheets through the Drive loader.
if 'databases.pkl' in os.listdir('./raw-data/'):
    # NOTE: pickle.load can execute arbitrary code; only open files produced by this project.
    with open('./raw-data/databases.pkl', 'rb') as file:
        databases = pickle.load(file)
    print("Loaded data from .pkl file")
else:
    # Imported here so the loader module is only required when the pickle is missing.
    from quickstart.loader import XlsxDriveLoader

    # The Drive folder is hardcoded in the loader module, since it is not prone to change.
    Loads = XlsxDriveLoader()
    # The first run takes roughly 3 minutes. The original note says the file is then
    # stored under data/, while this cell looks in raw-data/ - TODO confirm the location.
    databases = Loads.content
    

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=438184844244-uhk0l93iq5rfack31hmfbp1ted62o9o3.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.
Data was correctly allocated in memory


In [3]:
list(databases.keys())


['Anotaciones sobre banderas de contaminantes.xlsx',
 'BD_Tec_Banderas_2018_2021_3Estaciones.xlsx',
 'BD_Tec_Banderas_meteo.xlsx',
 'BD_Tec_banderas_contaminantes.xlsx']

In [4]:
# Bind each workbook name from `databases` (order as listed in the previous output).
# The second key (the 3-station workbook) is not used in the cells shown, so it gets
# its own throwaway name instead of being assigned to - and immediately overwritten
# in - the meteorological variable.
# NOTE: this relies on the key order of `databases`; a different order would silently
# swap the workbooks.
(db_name_anotaciones, _db_name_tres_estaciones,
 db_name_meteoorologicas, db_name_contaminantes) = databases.keys()


def get_col_names(df: pd.core.frame.DataFrame) -> list:
    """Return the column labels of ``df`` as a sorted list.

    Sorting makes the result independent of column order, so two frames with
    the same fields compare equal.

    Args:
        df: DataFrame whose column labels are read.

    Returns:
        A new ``list`` (not a pandas ``Index``) with the sorted column labels.
    """
    return sorted(df.columns)


def shared_structure(db_name: str, workbooks: dict = None) -> list:
    """Return the sheet names of one workbook whose DataFrames contain the same fields.

    Sheets are bucketed by their sorted column labels. Exactly one bucket holding
    more than one sheet is expected; its sheet names are returned in the order
    the sheets appear in the workbook.

    Args:
        db_name: Key of the workbook inside ``workbooks``.
        workbooks: Mapping ``{workbook name: {sheet name: DataFrame}}``. Defaults
            to the notebook-level ``databases`` mapping.

    Returns:
        List of sheet names that share a single column layout.

    Raises:
        ValueError: If zero or several groups of sheets share a layout, so there
            is no single unambiguous set of sheets to combine.
    """
    sheets = (databases if workbooks is None else workbooks)[db_name]

    # A dict is used instead of itertools.groupby because groupby only merges
    # *adjacent* equal keys; sheets with the same layout that are separated by a
    # different sheet would otherwise land in separate groups.
    layouts = {}
    for sheet_name, frame in sheets.items():
        layouts.setdefault(tuple(sorted(frame.columns)), []).append(sheet_name)

    shared = [names for names in layouts.values() if len(names) > 1]

    if len(shared) != 1:
        # Expecting only one shared structure across the DataFrames.
        raise ValueError(
            f"Expected exactly one group of sheets sharing a column layout in "
            f"{db_name!r}, found {len(shared)}"
        )
    return shared[0]


In [5]:
# Stack, per workbook, every sheet that shares the common column layout.
raw_contaminantes = pd.concat(
    [
        databases[db_name_contaminantes][sheet_name]
        for sheet_name in shared_structure(db_name_contaminantes)
    ]
)

raw_meteorologicas = pd.concat(
    [
        databases[db_name_meteoorologicas][sheet_name]
        for sheet_name in shared_structure(db_name_meteoorologicas)
    ]
)


In [6]:
# Lookup table for the categorical flags: flag code -> True when the flag means "valid"

identifier = (
    databases[db_name_anotaciones]['LEEME']
    [['Flag', 'Hora']]
    .set_index('Flag')
    .dropna()
    .squeeze()
    # ASCII-fold, trim and lowercase each description; only 'valida' counts as valid
    .apply(lambda label: unidecode.unidecode(label).strip().lower() == 'valida')
    .to_dict()
)

identifier['x'] = False  # Record does not appear on DataFrame


def f_identifier(dict_):
    """Look up a flag in ``identifier``; flags missing from the table map to True.

    Note that, despite its name, ``dict_`` is the key being looked up, not a dict.
    """
    is_valid = identifier.get(dict_, True)
    return is_valid

# Dictionary for measurement units
# Built from a fixed region of the 'Hoja1' sheet; the positional indices below depend
# on the current layout of that sheet and must be revisited if the sheet changes.

measurement_units = (
    databases[db_name_anotaciones]['Hoja1']
    .iloc[23:41, [0, 2]]  # positional rows 23-40, first and third columns
    .dropna()
    .drop(33)  # removes the row labelled 33 - reason not visible here, TODO confirm
    .pivot(columns='Notas a considerar:', values='Unnamed: 2')
    .pipe(janitor.clean_names, remove_special=True)
    .mode()  # presumably collapses the sparse pivot to one value per column - verify
    .squeeze()
    .to_dict()
)

# NOTE(review): this file goes to data/ while the other pickles in this notebook
# use raw-data/ - confirm this is intended.
with open('data/measurement-units.pkl', 'wb') as file:
    pickle.dump(measurement_units, file) # required for plotting



In [7]:
# San Pedro identifier; named `station_id` so the builtin `id()` is not shadowed.
station_id = 'SO2'
# The station's own column plus its companion column suffixed with ' b'
# (presumably the validity flag - see how the ' b' columns are mapped in munge()).
ids = [station_id, f'{station_id} b']

# Columns handed to munge(): the two key columns followed by the station columns.
fields = ['parametro', 'Fecha', *ids]


def munge(df: pd.core.frame.DataFrame, fields: list):
    """Reshape a raw long-format frame into one row per date for the requested columns.

    Steps: keep ``fields``; rename 'parametro' -> 'factor' and 'Fecha' -> 'date';
    strip whitespace from 'factor'; pivot so each (value column, factor) pair
    becomes a column indexed by date; blank non-positive numeric values; map the
    non-numeric columns through ``f_identifier``; clean the column names.

    Args:
        df: Raw frame containing at least the columns listed in ``fields``.
        fields: Column names to keep. Must include 'parametro' and 'Fecha';
            every other entry is pivoted as a value column.

    Returns:
        The pivoted and cleaned DataFrame, indexed by 'date'.
    """
    # Derive the value columns from the argument rather than from the notebook-level
    # `ids`, so the function only depends on what the caller passes in.
    value_columns = [name for name in fields if name not in ('parametro', 'Fecha')]

    return (
        df
        .loc[:, fields]
        .pipe(janitor.rename_columns, new_column_names={'parametro': 'factor',
                                                        'Fecha': 'date'})
        .pipe(janitor.process_text, column_name='factor', string_function='strip')
        .pivot(index='date', columns='factor', values=value_columns)
        .convert_dtypes(convert_integer=False)
        .apply(lambda field: field.where(field > 0, pd.NA) if pd.api.types.is_numeric_dtype(field)
               else field.map(f_identifier))  # Numeric fields: blank every value <= 0
                                              # (zeros included, not only negatives);
                                              # other fields: map flags with a dict
        .pipe(janitor.clean_names, strip_underscores='r')
    )


In [8]:
# Apply the same reshaping to both raw frames.
contaminantes, meteorologicas = (
    munge(raw_contaminantes, fields),
    munge(raw_meteorologicas, fields),
)


In [9]:
# Timestamps present in `contaminantes` but absent from `meteorologicas`
set(contaminantes.index) - set(meteorologicas.index)


{Timestamp('2019-05-15 20:00:00')}

One timestamp that is present in `contaminantes` is missing from `meteorologicas`.

In [10]:
# Outer join on the date index keeps timestamps that appear in only one of the frames.
df_ = contaminantes.join(meteorologicas, how='outer')


In [11]:
# The first column level is expected to hold exactly two labels (the unpacking fails
# otherwise): presumably the measurement label and its flag label - TODO confirm order.
vals, flags = df_.columns.droplevel(1).unique()

# Keep a measurement only where the matching flag is truthy, then cast to float.
# NOTE(review): drop_duplicates() compares row values and ignores the date index, so
# distinct timestamps with identical values (e.g. all-missing rows) collapse into
# one - confirm this is intended for a time series.
df = (
    df_
    .apply(lambda row: row[vals].where(row[flags].astype('bool')), axis=1) # check validity
    .replace({pd.NA: np.nan}) # .astype() does not operate when having different dtypes for
                              # missing values
    .astype(float)
    .drop_duplicates()
)


Dropping outliers is not as straightforward as it may be in other types of analysis, since we are trying to understand when these spikes in pollution occur. Likewise, imputing missing values takes more than replacing `pd.NA`s with the mean/median of every column, because we are dealing with a time series. Data visualization comes next, to understand the behaviour of the data and apply the most adequate procedures for outliers and missing values.

In [12]:
# Persist the San Pedro frame to disk as a pickle.
with open('raw-data/san-pedro-201701-202106.pkl', mode='wb') as file:
    pickle.dump(df, file)


Repeat procedure for region: Centro

In [13]:
# NOTE(review): this cell repeats the San Pedro steps verbatim with another station
# code; a function taking the station code would remove the duplication.
# NOTE(review): `id` shadows the builtin id() for the rest of the notebook.
id = 'CE' # Centro identifier
ids = [f'{id}', f'{id} b'] 

fields = ['parametro', 'Fecha']
fields.extend(ids)

contaminantes = munge(raw_contaminantes, fields)
meteorologicas = munge(raw_meteorologicas, fields)

# Outer join keeps timestamps that appear in only one of the two frames.
df_ = contaminantes.join(other=meteorologicas,
                         how='outer')

# The first column level is expected to hold exactly two labels (the unpacking fails
# otherwise): presumably the measurement label and its flag label - TODO confirm order.
vals, flags = df_.columns.droplevel(1).unique()

# Keep a measurement only where the matching flag is truthy, then cast to float.
# NOTE(review): drop_duplicates() compares row values and ignores the date index -
# confirm that collapsing identical rows is intended for a time series.
df = (
    df_
    .apply(lambda row: row[vals].where(row[flags].astype('bool')), axis=1) # check validity
    .replace({pd.NA: np.nan}) # .astype() does not operate when having different dtypes for
                              # missing values
    .astype(float)
    .drop_duplicates()
)


In [14]:
# Persist the Centro frame to disk as a pickle.
with open('raw-data/centro-201701-202106.pkl', mode='wb') as file:
    pickle.dump(df, file)
