In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
import glob

all_dfs = []

for one_filename in glob.glob('../data/*,*.csv'):
    print(f'Loading {one_filename}...')
    city, state = one_filename.removeprefix('../data/').removesuffix('.csv').split(',')
    one_df = pd.read_csv(one_filename, 
                     usecols=[1, 2, 19],
                     names=['max_temp', 'min_temp', 'precipMM'],
                     header=0)
    one_df['city'] = city.replace('+', ' ').title()
    one_df['state'] = state.upper()
    all_dfs.append(one_df)

df = pd.concat(all_dfs)
df.head()

Loading ../data/san+francisco,ca.csv...
Loading ../data/new+york,ny.csv...
Loading ../data/springfield,ma.csv...
Loading ../data/boston,ma.csv...
Loading ../data/springfield,il.csv...
Loading ../data/albany,ny.csv...
Loading ../data/los+angeles,ca.csv...
Loading ../data/chicago,il.csv...


Unnamed: 0,max_temp,min_temp,precipMM,city,state
0,13,8,0.0,San Francisco,CA
1,13,8,0.0,San Francisco,CA
2,13,8,0.0,San Francisco,CA
3,13,8,0.0,San Francisco,CA
4,13,8,0.0,San Francisco,CA


# Beyond 1

Implement the first version of `has_multiple_readings_at_least`, which just takes a single argument (`df`), but with `lambda`.

In [5]:
(
    df
    .groupby(['city', 'state'])
    .filter(lambda df_: df_.loc[df_['precipMM'] > 15, 'precipMM'].count() >= 3)
    [['city', 'state']]
    .drop_duplicates()
)

Unnamed: 0,city,state
0,New York,NY
0,Boston,MA
0,Los Angeles,CA


# Beyond 2

Implement the second version of `has_multiple_readings_at_least`, which just takes a three arguments (`df`, `min_mm`, and `times`), but with `lambda`.

In [9]:
# We need to pass all arguments as positional, meaning that we'll need to pass 
# False explicitly before min_mm and times

(
    df
    .groupby(['city', 'state'])
    .filter(lambda df_, min_mm, times: df_.loc[df_['precipMM'] > min_mm, 'precipMM'].count() >= times, min_mm=15, times=3)
    [['city', 'state']]
    .drop_duplicates()
)

Unnamed: 0,city,state
0,New York,NY
0,Boston,MA
0,Los Angeles,CA


# Beyond 3

Implement our transformation, but replacing `proportion_of_city_precip` with a `lambda`. Then find the readings that represented the greatest proportion of rainfall for each city.

In [10]:
df['precip_pct'] = (df
                    .groupby('city')['precipMM']
                    .transform(lambda s: s/s.sum())
                   )

df.groupby(['city', 'state'])['precip_pct'].max() 

city           state
Albany         NY       0.029228
Boston         MA       0.048302
Chicago        IL       0.057257
Los Angeles    CA       0.059242
New York       NY       0.055149
San Francisco  CA       0.056509
Springfield    IL       0.030977
               MA       0.023459
Name: precip_pct, dtype: float64