# Pandas Data Aggregation

`.groupby()`, `.agg()`

KPI Library: https://bernardmarr.com/kpi-library/

In [None]:
import numpy as np
import pandas as pd

__As always, we start by exploring our dataset__

[Original dataset](https://datos.madrid.es/portal/site/egob/menuitem.c05c1f754a33a9fbe4b2e4b284f1a5a0/?vgnextoid=67663c0a55e16710VgnVCM1000001d4a900aRCRD&vgnextchannel=374512b9ace9f310VgnVCM100000171f5a0aRCRD&vgnextfmt=default)

In [None]:
%%time

# Load the main dataset into `parking`.
# NOTE(review): both readers are called without a path, and pandas requires
# one, so this cell raises as written -- TODO fill in the dataset location.
parking = pd.read_csv()
#parking = pd.read_parquet()

In [None]:
# Dimension tables, read from local CSV files.
# NOTE(review): presumably these map district / neighbourhood codes to
# their names -- confirm against the files.

distritos = pd.read_csv('./datasets/distritos.csv')
barrios = pd.read_csv('./datasets/barrios.csv')

In [None]:
# Summarise columns, dtypes and non-null counts, then preview the first rows.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print(), which would add a stray "None" line to the output.
parking.info()
parking.head()

In [None]:
# Summarise columns, dtypes and non-null counts, then preview the first rows.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print(), which would add a stray "None" line to the output.
distritos.info()
distritos.head()

In [None]:
# Summarise columns, dtypes and non-null counts, then preview the first rows.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print(), which would add a stray "None" line to the output.
barrios.info()
barrios.head()

In [None]:
# Count the missing values in each column of `parking`.
parking.isnull().sum()

__Time span__

In [None]:
%%time

# Convert 'fecha_operacion' to datetimes in place, passing the exact format
# so pandas does not have to infer it.
# NOTE(review): assumes the column holds strings such as
# "2020-01-31 13:45:00" -- confirm against the loaded data.
parking['fecha_operacion'] = pd.to_datetime(parking['fecha_operacion'], format="%Y-%m-%d %H:%M:%S")

In [None]:
# Time span covered by the data: latest minus earliest 'fecha_operacion'.
parking['fecha_operacion'].max() - parking['fecha_operacion'].min()

__Data Manipulation__

In [None]:
%%time

p_distritos = pd.merge(parking, distritos)
p_distritos.head()

In [None]:
# Always check the results!!!
# Take the 'distrito' values of the rows whose district code is 5 and list
# the distinct names found.

just_checking = p_distritos.loc[p_distritos['cod_distrito'].isin([5]), 'distrito']
just_checking.unique()

---

## The `df.groupby()` object

In [None]:
%%time

groupby_object = p_distritos.groupby(['distrito'])
groupby_object

## The `.agg()` method

In [None]:
%%time

total_distritos = groupby_object[['minutos_tique',
                                  'importe_tique']]\
                  .sum()#.sort_values(by='importe_tique',
                                     #ascending=False).reset_index()
total_distritos

---

In [None]:
# Another aggregation: per-district mean of the two ticket columns.

media_distritos = (
    p_distritos
    .groupby(['distrito'])[['minutos_tique', 'importe_tique']]
    .mean()
    # Optional follow-up, left disabled:
    # .sort_values(by='importe_tique', ascending=False).reset_index()
)
media_distritos

In [None]:
# Multiple agg functions applied to every selected column.
# The result has two-level columns, (column, statistic), which is why the
# sort key below is a tuple.

stats_distritos = (
    p_distritos
    .groupby(['distrito'])[['minutos_tique', 'importe_tique']]
    .agg(['min', 'max', 'mean', 'median', 'std'])
    .sort_values(by=('importe_tique', 'median'), ascending=False)
)
stats_distritos

In [None]:
# Row labels of the aggregated frame (the 'distrito' group keys).
stats_distritos.index

In [None]:
# Column labels of the aggregated frame: (column, statistic) pairs.
stats_distritos.columns

In [None]:
# Multiple agg functions for different columns (named aggregation):
# each keyword names an output column and maps it to a
# (source column, function) pair.

multistats_distritos = (
    p_distritos
    .groupby(['distrito'])[['minutos_tique', 'importe_tique']]
    .agg(
        mean_importe=('importe_tique', 'mean'),
        std_importe=('importe_tique', 'std'),
        median_minutos=('minutos_tique', 'median'),
    )
    .sort_values(by='median_minutos', ascending=False)
)
multistats_distritos

In [None]:
# NumPy agg function (e.g.: https://numpy.org/doc/stable/reference/generated/numpy.ptp.html)
# np.ptp is "peak to peak": the maximum minus the minimum of each group.

numpy_distritos = (
    p_distritos
    .groupby(['distrito'])[['importe_tique']]
    .agg(np.ptp)
    .sort_values(by='importe_tique', ascending=False)
)
numpy_distritos

In [None]:
#Custom agg function

def custom(col, *, horas=None):
    """Format the total of ``col`` as cents per hour.

    Sums ``col``, multiplies the total by 100, divides it by a number of
    hours and rounds the result to two decimals.

    Args:
        col: Values to total (a Series when called through ``.agg``).
        horas: Number of hours to divide by. When omitted, the module-level
            ``minutos`` variable is used, exactly as before. Despite its
            name, that variable is assigned total minutes / 60 further down
            the file, and it must exist before the first call.

    Returns:
        str: The rounded value followed by ``' centimos/hora'``.
    """
    # Fall back to the global only when the caller gives no explicit divisor,
    # so existing `.agg(custom)` calls keep working unchanged.
    divisor = minutos if horas is None else horas
    return f'{round((np.sum(col) * 100) / divisor, 2)} centimos/hora'

In [None]:
# Total of 'minutos_tique' over the whole frame, divided by 60.
# NOTE: the name says "minutos", but after the division the value is
# presumably in hours; the name is kept because `custom` reads this global.
# NOTE(review): every district's amount is divided by this overall total,
# not by that district's own hours -- confirm this is the intended KPI.
minutos = p_distritos['minutos_tique'].sum() / 60

# `custom` returns strings, so a plain sort would order them
# lexicographically ('9.5 ...' above '10.2 ...'). Sort on the leading number
# instead, while keeping the formatted strings in the result.
custom_distritos = (
    p_distritos
    .groupby(['distrito'])[['importe_tique']]
    .agg(custom)
    .sort_values(
        by='importe_tique',
        ascending=False,
        key=lambda textos: textos.str.split().str[0].astype(float),
    )
)
custom_distritos

---

### Now it's your turn to practice: perform the same exercise by _'Barrio'_...

![Image](./img/etl_pandas_agg_01.jpg)