<a href="https://colab.research.google.com/github/jmbarrios/THC-Python/blob/main/20220120_Altair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import altair as alt

In [2]:
penguin_data = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv')

In [3]:
penguin_data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [4]:
chart = alt.Chart(data=penguin_data)

In [7]:
chart.mark_circle().encode(
    x = 'bill_length_mm',
    y = 'bill_depth_mm',
    color = 'species'
)

In [10]:
chart.mark_circle().encode(
    alt.X('bill_length_mm', 
          type='quantitative', 
          title='Bill length (mm)', 
          scale=alt.Scale(domain=(30, 65))
          ),
    alt.Y('bill_depth_mm', 
          type='quantitative', 
          title='Bill depth (mm)', 
          scale=alt.Scale(domain=(10, 25))
          ),
    alt.Color('species',
              type='nominal',
              title='Species')
)

In [17]:
chart.mark_bar().encode(
    alt.Y('bill_length_mm', type='quantitative', aggregate='mean', title='Mean value of bill length'),
    alt.X('species', type='nominal', title='Species', sort='y'),
    alt.Color('species', type='nominal', legend=None)
)

# Contaminación en la CDMX

In [19]:
contaminacion_2021 = pd.read_csv('http://www.aire.cdmx.gob.mx/opendata/anuales_horarios_gz/contaminantes_2021.csv.gz',
                                 skiprows=10)

In [20]:
contaminacion_2021.head()

Unnamed: 0,date,id_station,id_parameter,value,unit
0,01/01/2021 01:00,ACO,CO,,15
1,01/01/2021 01:00,ACO,NO,,1
2,01/01/2021 01:00,ACO,NO2,,1
3,01/01/2021 01:00,ACO,NOX,,1
4,01/01/2021 01:00,ACO,O3,,1


In [28]:
PED_O3_levels = contaminacion_2021[
                                   (contaminacion_2021['id_station'] == 'PED') & 
                                   (contaminacion_2021['id_parameter'] == 'O3')].copy()

In [29]:
PED_O3_levels.head()

Unnamed: 0,date,id_station,id_parameter,value,unit
112,01/01/2021 01:00,PED,O3,8.0,1
373,01/01/2021 02:00,PED,O3,9.0,1
634,01/01/2021 03:00,PED,O3,16.0,1
895,01/01/2021 04:00,PED,O3,17.0,1
1156,01/01/2021 05:00,PED,O3,12.0,1


In [30]:
PED_O3_levels.dtypes

date             object
id_station       object
id_parameter     object
value           float64
unit              int64
dtype: object

In [31]:
d = PED_O3_levels['date']

In [32]:
d

112        01/01/2021 01:00
373        01/01/2021 02:00
634        01/01/2021 03:00
895        01/01/2021 04:00
1156       01/01/2021 05:00
                 ...       
2292507    31/12/2021 20:00
2292769    31/12/2021 21:00
2293031    31/12/2021 22:00
2293293    31/12/2021 23:00
2293555    31/12/2021 24:00
Name: date, Length: 8760, dtype: object

In [34]:
pd.to_datetime(d, format='%d/%m/%Y %H:%M')

ValueError: ignored

In [35]:
fecha_erronea = '01/01/2021 24:00'

In [45]:
def fix_24hr(dt):
    dt_fix = dt[:-5] + '00' + dt[-3:]
    return dt_fix

In [46]:
fix_24hr('01/01/2021 24:00')

'01/01/2021 00:00'

In [47]:
def fix_24hr_mine(dt):
    date, _ = dt.split(' ')
    return f'{date} 00:00'

In [48]:
fix_24hr_mine('01/01/2021 24:00')

'01/01/2021 00:00'

In [49]:
?d.where

In [50]:
d.str.contains('24:00')

112        False
373        False
634        False
895        False
1156       False
           ...  
2292507    False
2292769    False
2293031    False
2293293    False
2293555     True
Name: date, Length: 8760, dtype: bool

In [53]:
d.where(~d.str.contains('24:00'), lambda x: x.apply(fix_24hr))

112        01/01/2021 01:00
373        01/01/2021 02:00
634        01/01/2021 03:00
895        01/01/2021 04:00
1156       01/01/2021 05:00
                 ...       
2292507    31/12/2021 20:00
2292769    31/12/2021 21:00
2293031    31/12/2021 22:00
2293293    31/12/2021 23:00
2293555    31/12/2021 00:00
Name: date, Length: 8760, dtype: object

In [54]:
pd.to_datetime(
    d.where(~d.str.contains('24:00'), lambda x: x.apply(fix_24hr)), 
    format='%d/%m/%Y %H:%M')

112       2021-01-01 01:00:00
373       2021-01-01 02:00:00
634       2021-01-01 03:00:00
895       2021-01-01 04:00:00
1156      2021-01-01 05:00:00
                  ...        
2292507   2021-12-31 20:00:00
2292769   2021-12-31 21:00:00
2293031   2021-12-31 22:00:00
2293293   2021-12-31 23:00:00
2293555   2021-12-31 00:00:00
Name: date, Length: 8760, dtype: datetime64[ns]

In [55]:
PED_O3_levels['date'] = pd.to_datetime(
    d.where(~d.str.contains('24:00'), lambda x: x.apply(fix_24hr)), 
    format='%d/%m/%Y %H:%M')

In [58]:
PED_O3_levels.tail()

Unnamed: 0,date,id_station,id_parameter,value,unit
2292507,2021-12-31 20:00:00,PED,O3,16.0,1
2292769,2021-12-31 21:00:00,PED,O3,28.0,1
2293031,2021-12-31 22:00:00,PED,O3,19.0,1
2293293,2021-12-31 23:00:00,PED,O3,12.0,1
2293555,2021-12-31 00:00:00,PED,O3,6.0,1


In [57]:
PED_O3_levels.dtypes

date            datetime64[ns]
id_station              object
id_parameter            object
value                  float64
unit                     int64
dtype: object

In [52]:
def square(x):
    return x**2

square_2 = lambda x: x**2

print(square(2))
print(square(4))
print(square(3))

print(square_2(2))
print(square_2(4))
print(square_2(3))

4
16
9
4
16
9
