In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
from pathlib import Path

all_dfs = []

for one_filename in Path('../data').glob('*,*.csv'):
    print(f'Loading {one_filename}...')
    city, state = (
        Path(one_filename)
        .stem
        .removesuffix('.csv')
        .split(',')
    )
    one_df = pd.read_csv(one_filename, 
                     usecols=[1, 2, 19],
                     names=['max_temp', 'min_temp', 'precipMM'],
                     header=0)
    one_df['city'] = city.replace('+', ' ').title()
    one_df['state'] = state.upper()
    all_dfs.append(one_df)

df = pd.concat(all_dfs)
df.head()

Loading ../data/albany,ny.csv...
Loading ../data/boston,ma.csv...
Loading ../data/chicago,il.csv...
Loading ../data/los+angeles,ca.csv...
Loading ../data/new+york,ny.csv...
Loading ../data/san+francisco,ca.csv...
Loading ../data/springfield,il.csv...
Loading ../data/springfield,ma.csv...


Unnamed: 0,max_temp,min_temp,precipMM,city,state
0,-2,-8,0.0,Albany,NY
1,-2,-8,0.0,Albany,NY
2,-2,-8,0.0,Albany,NY
3,-2,-8,0.0,Albany,NY
4,-2,-8,0.0,Albany,NY


In [9]:
df

Unnamed: 0,max_temp,min_temp,precipMM,city,state,precip_pct
0,-2,-8,0.0,Albany,NY,0.0
1,-2,-8,0.0,Albany,NY,0.0
2,-2,-8,0.0,Albany,NY,0.0
3,-2,-8,0.0,Albany,NY,0.0
4,-2,-8,0.0,Albany,NY,0.0
...,...,...,...,...,...,...
723,5,-2,0.0,Springfield,MA,0.0
724,5,-2,0.0,Springfield,MA,0.0
725,5,-2,0.0,Springfield,MA,0.0
726,5,-2,0.0,Springfield,MA,0.0


In [12]:
# Which cities had, on at least 3 occasions, precipitation of 15 mm or more?

# Technique 1: Using a function without extra arguments
def has_multiple_readings_at_least(mini_df):
    return mini_df.loc[
        mini_df['precipMM'] >= 15,
        'precipMM'
        ].count() >= 3
    
(
    df
    .groupby(['city', 'state'])
    .filter(has_multiple_readings_at_least)
    [['city', 'state']]
    .drop_duplicates()
)


Unnamed: 0,city,state
0,Boston,MA
0,Los Angeles,CA
0,New York,NY


In [5]:
# Technique 2: Using a function with extra arguments

def has_multiple_readings_at_least(mini_df, min_mm, times):
    return mini_df.loc[
        mini_df['precipMM'] >= min_mm,
        'precipMM'
        ].count() >= times

(
    df
    .groupby(['city', 'state'])
    .filter(has_multiple_readings_at_least,
            min_mm=15,
            times=3)
    [['city', 'state']]
    .drop_duplicates()
)

Unnamed: 0,city,state
0,Boston,MA
0,Los Angeles,CA
0,New York,NY


In [6]:
# Find cities that had at least 3 measurements of 10 mm precipitation or more, when the temperature was below 0 Celsuius.

def has_multiple_readings_at_least(mini_df, min_mm, times):
    return mini_df.loc[
        ((mini_df['precipMM'] >= min_mm) &
         (mini_df['min_temp'] <= 0)),
        'precipMM'
        ].count() >= times

(
    df
    .groupby(['city', 'state'])
    .filter(has_multiple_readings_at_least, min_mm=10, times=3)
    [['city', 'state']]
    .drop_duplicates()
)


Unnamed: 0,city,state
0,Albany,NY
0,Boston,MA
0,New York,NY


In [7]:
# For each precipitation measurement, calculate the proportion of that city's total precipitation.
def proportion_of_city_precip(s):
    return s / s.sum()

df['precip_pct'] = (
    df
    .groupby('city')['precipMM']
    .transform(proportion_of_city_precip)
)

In [8]:
df.groupby(['city', 'state'])['precip_pct'].max()

city           state
Albany         NY       0.029228
Boston         MA       0.048302
Chicago        IL       0.057257
Los Angeles    CA       0.059242
New York       NY       0.055149
San Francisco  CA       0.056509
Springfield    IL       0.030977
               MA       0.023459
Name: precip_pct, dtype: float64