In [1]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Load the per-city temperature CSVs and stack them into a single data frame
# with date_time, max_temp, min_temp, city, and state columns.
#
# Each file is named '<city>,<state>.csv', with '+' standing in for spaces in
# the city name (for example 'san+francisco,ca.csv').

csv_filenames = glob.glob('../data/*,*.csv')

# Fail with a message that names the pattern; otherwise pd.concat([]) below
# raises a far less helpful "No objects to concatenate".
if not csv_filenames:
    raise FileNotFoundError("No files match '../data/*,*.csv'")

all_dfs = []

for one_filename in csv_filenames:
    print(f'Loading {one_filename}...')

    # Path.stem drops the directory and the '.csv' suffix whichever path
    # separator glob returned, leaving just '<city>,<state>'.
    city, state = Path(one_filename).stem.split(',')

    one_df = (
        pd
        .read_csv(one_filename,
                  usecols=[0, 1, 2],       # only the first three columns are kept
                  names=['date_time',
                         'max_temp',
                         'min_temp'],
                  header=0)                # replace the file's own header row with `names`
        .assign(city=city.replace('+', ' ').title(),
                state=state.upper())
    )

    all_dfs.append(one_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every file contributes its own 0..k-1 labels, so index labels repeat once
# per city and label-based lookups such as df.loc[0] return several rows.
df = pd.concat(all_dfs, ignore_index=True)

Loading ../data/san+francisco,ca.csv...
Loading ../data/new+york,ny.csv...
Loading ../data/springfield,ma.csv...
Loading ../data/boston,ma.csv...
Loading ../data/springfield,il.csv...
Loading ../data/albany,ny.csv...
Loading ../data/los+angeles,ca.csv...
Loading ../data/chicago,il.csv...


In [3]:
# Sanity check: does every (state, city) pair start at the same timestamp?
# Take the earliest date_time per pair and sort the results, so a pair with a
# different start would land at one end of the listing.
# NOTE: date_time is still a string column here (the result has dtype object);
# the comparison is chronological only as long as every value uses the same
# zero-padded 'YYYY-MM-DD HH:MM:SS' layout.
(
    df
    .groupby(['state', 'city'])['date_time']
    .min()
    .sort_values()
)

state  city         
CA     Los Angeles      2018-12-11 00:00:00
       San Francisco    2018-12-11 00:00:00
IL     Chicago          2018-12-11 00:00:00
       Springfield      2018-12-11 00:00:00
MA     Boston           2018-12-11 00:00:00
       Springfield      2018-12-11 00:00:00
NY     Albany           2018-12-11 00:00:00
       New York         2018-12-11 00:00:00
Name: date_time, dtype: object

In [4]:
# Same check for the end of the period: latest date_time per (state, city),
# sorted so a pair that stops at a different time lands at one end.
(
    df
    .groupby(['state', 'city'])['date_time']
    .max()
    .sort_values()
)

state  city         
CA     Los Angeles      2019-03-11 21:00:00
       San Francisco    2019-03-11 21:00:00
IL     Chicago          2019-03-11 21:00:00
       Springfield      2019-03-11 21:00:00
MA     Boston           2019-03-11 21:00:00
       Springfield      2019-03-11 21:00:00
NY     Albany           2019-03-11 21:00:00
       New York         2019-03-11 21:00:00
Name: date_time, dtype: object

In [5]:
# Coldest reading per city over the period: the smallest min_temp within each
# (state, city) group. Grouping on state as well keeps same-named cities in
# different states (e.g. the two Springfields) separate.
(
    df
    .groupby(['state', 'city'])['min_temp']
    .min()
)

state  city         
CA     Los Angeles       4
       San Francisco     3
IL     Chicago         -28
       Springfield     -25
MA     Boston          -14
       Springfield     -20
NY     Albany          -19
       New York        -14
Name: min_temp, dtype: int64

In [6]:
# Warmest reading per *state* over the period: group on state alone, so each
# state's cities are pooled, and take the largest max_temp.
(
    df
    .groupby('state')['max_temp']
    .max()
)

state
CA    23
IL    16
MA    17
NY    15
Name: max_temp, dtype: int64