In [9]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [10]:
# Load min-temp and max-temp data from multiple cities
# Turn all of that data into a single data frame with state, city, date, min, and max temp

import glob

all_dfs = [] 

for one_filename in glob.glob('../data/*,*.csv'): 
    print(f'Loading {one_filename}...')

    city, state = (
        one_filename
        .removeprefix('../data/')
        .removesuffix('.csv')
        .split(',')
    )


    one_df = (
        pd
        .read_csv(one_filename,
                  usecols=[0, 1, 2], 
                  names=['date_time',
                         'max_temp',
                         'min_temp'], 
                  header=0) 
        .assign(city=city.replace('+', ' ').title(), 
                state=state.upper()) 
    )

    all_dfs.append(one_df) 

df = pd.concat(all_dfs) 

df.head()

Loading ../data/san+francisco,ca.csv...
Loading ../data/new+york,ny.csv...
Loading ../data/springfield,ma.csv...
Loading ../data/boston,ma.csv...
Loading ../data/springfield,il.csv...
Loading ../data/albany,ny.csv...
Loading ../data/los+angeles,ca.csv...
Loading ../data/chicago,il.csv...


Unnamed: 0,date_time,max_temp,min_temp,city,state
0,2018-12-11 00:00:00,13,8,San Francisco,CA
1,2018-12-11 03:00:00,13,8,San Francisco,CA
2,2018-12-11 06:00:00,13,8,San Francisco,CA
3,2018-12-11 09:00:00,13,8,San Francisco,CA
4,2018-12-11 12:00:00,13,8,San Francisco,CA


# Beyond 1

Run "describe" on the minimum and maximum temperature for each state-city combination

In [11]:
# Grouping by state-city combinations, get the min and max temperatures
# Then apply the `describe` method, which returns a data frame
df.groupby(['state', 'city'])[['min_temp', 'max_temp']].apply(DataFrame.describe)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min_temp,max_temp
state,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,Los Angeles,count,728.000000,728.000000
CA,Los Angeles,mean,10.637363,17.054945
CA,Los Angeles,std,2.705200,2.708640
CA,Los Angeles,min,4.000000,12.000000
CA,Los Angeles,25%,9.000000,15.000000
...,...,...,...,...
NY,New York,min,-14.000000,-12.000000
NY,New York,25%,-4.000000,2.000000
NY,New York,50%,0.000000,4.000000
NY,New York,75%,2.000000,7.000000


# Beyond 2

Running `describe` works, but by default, we only see the first and last few rows from each result. Using `pd.set_option` to change the value of `display_max_rows`, make it possible to see all of the results in Jupyter, then reset the option to 10 rows.

In [12]:
pd.set_option('display.max_rows',1000)
df.groupby(['state', 'city'])[['min_temp', 'max_temp']].apply(DataFrame.describe)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min_temp,max_temp
state,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,Los Angeles,count,728.0,728.0
CA,Los Angeles,mean,10.637363,17.054945
CA,Los Angeles,std,2.7052,2.70864
CA,Los Angeles,min,4.0,12.0
CA,Los Angeles,25%,9.0,15.0
CA,Los Angeles,50%,11.0,16.0
CA,Los Angeles,75%,12.0,19.0
CA,Los Angeles,max,17.0,23.0
CA,San Francisco,count,728.0,728.0
CA,San Francisco,mean,8.252747,12.604396


In [13]:
pd.set_option('display.max_rows',10)

# Beyond 3

What is the average difference in temperature (i.e., max - min) for each of the cities in our data set?

In [14]:
# We'll use lambda to calculate max-min for each value in the group, and then get the mean of those values
df.groupby(['state', 'city'])[['min_temp', 'max_temp']].apply(lambda g: np.mean(g.max() - g.min()) )

state  city         
CA     Los Angeles      12.0
       San Francisco     8.0
IL     Chicago          34.0
       Springfield      35.5
MA     Boston           26.0
       Springfield      28.5
NY     Albany           26.5
       New York         26.5
dtype: float64