## Setup

In [1]:
import pandas as pd
import numpy as np

import requests
from io import StringIO
import string

import utils_grouping as utils

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

%load_ext autoreload
%autoreload 2

## EXERCISE 32. Multicity temperatures

### 32.1 Main exercise

In [5]:
# san+francisco,ca.csv, new+york,ny.csv, springfield,ma.csv, boston,ma.csv,
# springfield,il.csv, albany,ny.csv, los+angeles,ca.csv, and chicago,il.csv
wdm = utils.WeatherDataManager()
weather = wdm.weather
weather.head()

Unnamed: 0,date_time,max_temp,min_temp,city,state
0,2018-12-11 00:00:00,13,8,San Francisco,CA
1,2018-12-11 03:00:00,13,8,San Francisco,CA
2,2018-12-11 06:00:00,13,8,San Francisco,CA
3,2018-12-11 09:00:00,13,8,San Francisco,CA
4,2018-12-11 12:00:00,13,8,San Francisco,CA


In [11]:
# Does the data for each city and state start and end at (roughly) the same time?
weather.groupby(['city', 'state'])['date_time'].min()

city           state
Albany         NY      2018-12-11
Boston         MA      2018-12-11
Chicago        IL      2018-12-11
Los Angeles    CA      2018-12-11
New York       NY      2018-12-11
San Francisco  CA      2018-12-11
Springfield    IL      2018-12-11
               MA      2018-12-11
Name: date_time, dtype: datetime64[ns]

In [12]:
weather.groupby(['city', 'state'])['date_time'].max()

city           state
Albany         NY      2019-03-11 21:00:00
Boston         MA      2019-03-11 21:00:00
Chicago        IL      2019-03-11 21:00:00
Los Angeles    CA      2019-03-11 21:00:00
New York       NY      2019-03-11 21:00:00
San Francisco  CA      2019-03-11 21:00:00
Springfield    IL      2019-03-11 21:00:00
               MA      2019-03-11 21:00:00
Name: date_time, dtype: datetime64[ns]

In [15]:
# What is the lowest minimum temperature recorded for each city in the data set?
weather.groupby(['city'])['min_temp'].min()

city
Albany          -19
Boston          -14
Chicago         -28
Los Angeles       4
New York        -14
San Francisco     3
Springfield     -25
Name: min_temp, dtype: int64

In [16]:
# What is the highest maximum temperature recorded in each STATE in the data set?
weather.groupby(['state'])['max_temp'].max()

state
CA    23
IL    16
MA    17
NY    15
Name: max_temp, dtype: int64

In [20]:
weather.groupby(['state', 'city'])['max_temp'].max()

state  city         
CA     Los Angeles      23
       San Francisco    15
IL     Chicago           9
       Springfield      16
MA     Boston           17
       Springfield      15
NY     Albany           13
       New York         15
Name: max_temp, dtype: int64

### 32.3 Beyond 1

In [25]:
weather.groupby(['state', 'city'])[['min_temp', 'max_temp']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,min_temp,max_temp
state,city,Unnamed: 2_level_1,Unnamed: 3_level_1
CA,Los Angeles,10.637363,17.054945
CA,San Francisco,8.252747,12.604396
IL,Chicago,-5.076923,-0.736264
IL,Springfield,-4.857143,2.076923
MA,Boston,-3.142857,2.868132
MA,Springfield,-6.032967,1.472527
NY,Albany,-5.956044,0.362637
NY,New York,-1.054945,4.208791


### 32.3 Beyond 2

In [26]:
weather

Unnamed: 0,date_time,max_temp,min_temp,city,state
0,2018-12-11 00:00:00,13,8,San Francisco,CA
1,2018-12-11 03:00:00,13,8,San Francisco,CA
2,2018-12-11 06:00:00,13,8,San Francisco,CA
3,2018-12-11 09:00:00,13,8,San Francisco,CA
4,2018-12-11 12:00:00,13,8,San Francisco,CA
...,...,...,...,...,...
5819,2019-03-11 09:00:00,3,-2,Chicago,IL
5820,2019-03-11 12:00:00,3,-2,Chicago,IL
5821,2019-03-11 15:00:00,3,-2,Chicago,IL
5822,2019-03-11 18:00:00,3,-2,Chicago,IL


In [32]:
pd.set_option('display.max_rows',100)
weather.head(100)

Unnamed: 0,date_time,max_temp,min_temp,city,state
0,2018-12-11 00:00:00,13,8,San Francisco,CA
1,2018-12-11 03:00:00,13,8,San Francisco,CA
2,2018-12-11 06:00:00,13,8,San Francisco,CA
3,2018-12-11 09:00:00,13,8,San Francisco,CA
4,2018-12-11 12:00:00,13,8,San Francisco,CA
5,2018-12-11 15:00:00,13,8,San Francisco,CA
6,2018-12-11 18:00:00,13,8,San Francisco,CA
7,2018-12-11 21:00:00,13,8,San Francisco,CA
8,2018-12-12 00:00:00,15,11,San Francisco,CA
9,2018-12-12 03:00:00,15,11,San Francisco,CA


In [33]:
pd.set_option('display.max_rows',10)
weather.head(100)

Unnamed: 0,date_time,max_temp,min_temp,city,state
0,2018-12-11 00:00:00,13,8,San Francisco,CA
1,2018-12-11 03:00:00,13,8,San Francisco,CA
2,2018-12-11 06:00:00,13,8,San Francisco,CA
3,2018-12-11 09:00:00,13,8,San Francisco,CA
4,2018-12-11 12:00:00,13,8,San Francisco,CA
...,...,...,...,...,...
95,2018-12-22 21:00:00,14,9,San Francisco,CA
96,2018-12-23 00:00:00,14,10,San Francisco,CA
97,2018-12-23 03:00:00,14,10,San Francisco,CA
98,2018-12-23 06:00:00,14,10,San Francisco,CA


## EXERCISE 33. SAT scores, revisited

### 33.1 Main Exercise

In [41]:
file_name = 'data/sat-scores.csv'
use_cols=['Year', 'State.Code', 'Total.Math', 
        'Family Income.Less than 20k.Math', 
        'Family Income.Between 20-40k.Math', 
        'Family Income.Between 40-60k.Math', 
        'Family Income.Between 60-80k.Math',
        'Family Income.Between 80-100k.Math',
        'Family Income.More than 100k.Math']

scores = pd.read_csv(file_name, usecols=use_cols)

# Rearrange the columns
scores = scores[['Year', 'State.Code', 'Total.Math',
                'Family Income.Less than 20k.Math',
                'Family Income.Between 20-40k.Math',
                'Family Income.Between 40-60k.Math',
                'Family Income.Between 60-80k.Math',
                'Family Income.Between 80-100k.Math',
                'Family Income.More than 100k.Math']]

scores.head()

Unnamed: 0,Year,State.Code,Total.Math,Family Income.Less than 20k.Math,Family Income.Between 20-40k.Math,Family Income.Between 40-60k.Math,Family Income.Between 60-80k.Math,Family Income.Between 80-100k.Math,Family Income.More than 100k.Math
0,2005,AL,559,462,513,539,550,566,588
1,2005,AK,519,464,492,517,513,528,541
2,2005,AZ,530,485,498,520,524,534,554
3,2005,AR,552,489,513,543,553,570,572
4,2005,CA,522,451,477,506,521,535,566


In [42]:
# 2 Rename the income-related column names to something shorter. I recommend
# income<20k, 20k<income<40k, 40k<income<60k, 60k<income<80k, 80k<income
# 100k, and income>100k
columns_new={
    'Family Income.Less than 20k.Math': 'income<20k',
    'Family Income.Between 20-40k.Math': '20k<income<40k',
    'Family Income.Between 40-60k.Math': '40k<income<60k',
    'Family Income.Between 60-80k.Math': '60k<income<80k',
    'Family Income.Between 80-100k.Math': '80k<income<100k',
    'Family Income.More than 100k.Math': 'income>100k'
}

scores = scores.rename(columns=columns_new)
scores.head()

Unnamed: 0,Year,State.Code,Total.Math,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
0,2005,AL,559,462,513,539,550,566,588
1,2005,AK,519,464,492,517,513,528,541
2,2005,AZ,530,485,498,520,524,534,554
3,2005,AR,552,489,513,543,553,570,572
4,2005,CA,522,451,477,506,521,535,566


In [38]:
# Set a format for floats
pd.options.display.float_format = '{:,.2f}'.format

In [44]:
# 3 Find the average SAT math score for each income level, grouped and then
# sorted by year
scores.groupby('Year').mean(numeric_only=True).sort_index()

Unnamed: 0_level_0,Total.Math,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005,535.65,427.60,488.65,522.67,536.08,548.94,572.17
2006,537.48,461.02,502.92,523.77,534.90,550.46,572.52
2007,535.34,457.92,494.85,519.49,533.19,545.70,565.17
2008,535.98,478.64,523.62,547.47,549.19,557.64,564.57
2009,540.80,482.06,527.82,550.98,553.94,565.33,585.78
...,...,...,...,...,...,...,...
2011,533.23,460.45,494.89,513.42,528.66,541.85,563.25
2012,533.60,458.77,492.06,512.45,525.77,538.30,557.32
2013,532.62,469.36,490.13,511.38,520.32,537.40,556.34
2014,534.28,459.42,497.64,514.94,527.17,543.13,555.43


In [55]:
# 4 For each year in the data set, determine how much better each income group
# did, on average, than the next-poorer group of students. Do you see (just by
# looking at the data) any income group that did worse, in any year, than the
# next-poorer students?
columns_income = ['income<20k', '20k<income<40k',
                  '40k<income<60k', '60k<income<80k',
                  '80k<income<100k', 'income>100k']

scores.groupby('Year')[columns_income].mean().T.pct_change()

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
income<20k,,,,,,,,,,,
20k<income<40k,0.14,0.09,0.08,0.09,0.09,0.05,0.07,0.07,0.04,0.08,0.1
40k<income<60k,0.07,0.04,0.05,0.05,0.04,0.05,0.04,0.04,0.04,0.03,0.05
60k<income<80k,0.03,0.02,0.03,0.0,0.01,0.02,0.03,0.03,0.02,0.02,0.03
80k<income<100k,0.02,0.03,0.02,0.02,0.02,0.03,0.02,0.02,0.03,0.03,0.03
income>100k,0.04,0.04,0.04,0.01,0.04,0.04,0.04,0.04,0.04,0.02,0.04


In [61]:
# 5 Which income bracket, on average, had the greatest advantage over the next-
# poorer income bracket?
pct_change = scores.groupby('Year')[columns_income].mean().T.pct_change()
pct_change.T.mean().sort_values(ascending=False)

20k<income<40k    0.08
40k<income<60k    0.05
income>100k       0.03
80k<income<100k   0.03
60k<income<80k    0.02
income<20k         NaN
dtype: float64

### Beyond 1

In [65]:
pct_change.T

Unnamed: 0_level_0,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2005,,0.14,0.07,0.03,0.02,0.04
2006,,0.09,0.04,0.02,0.03,0.04
2007,,0.08,0.05,0.03,0.02,0.04
2008,,0.09,0.05,0.00,0.02,0.01
2009,,0.09,0.04,0.01,0.02,0.04
...,...,...,...,...,...,...
2011,,0.07,0.04,0.03,0.02,0.04
2012,,0.07,0.04,0.03,0.02,0.04
2013,,0.04,0.04,0.02,0.03,0.04
2014,,0.08,0.03,0.02,0.03,0.02


In [66]:
pct_change.T.describe()

Unnamed: 0,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
count,0.0,11.0,11.0,11.0,11.0,11.0
mean,,0.08,0.05,0.02,0.03,0.03
std,,0.03,0.01,0.01,0.0,0.01
min,,0.04,0.03,0.0,0.02,0.01
25%,,0.07,0.04,0.02,0.02,0.04
50%,,0.08,0.04,0.02,0.02,0.04
75%,,0.09,0.05,0.03,0.03,0.04
max,,0.14,0.07,0.03,0.03,0.04


### Beyond 2

In [67]:
scores.head()

Unnamed: 0,Year,State.Code,Total.Math,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
0,2005,AL,559,462,513,539,550,566,588
1,2005,AK,519,464,492,517,513,528,541
2,2005,AZ,530,485,498,520,524,534,554
3,2005,AR,552,489,513,543,553,570,572
4,2005,CA,522,451,477,506,521,535,566


In [71]:
# Which five states have the greatest gap in SAT math scores between the richest
# and poorest students?

# Add a new column to the scores DataFrame that contains the difference
# between the income>100k and income<20k columns
scores['gap_100k_20k'] = scores['income>100k'] - scores['income<20k']

scores.groupby('State.Code')['gap_100k_20k'].mean().sort_values(ascending=False).head()

State.Code
ND   341.91
WY   246.45
DC   208.82
SD   157.00
MS   140.00
Name: gap_100k_20k, dtype: float64

### Beyond 3

In [None]:
# You analyzed math scores. If you perform the same analysis on verbal SAT
# scores, will you similarly see that wealthier students generally do better than
# poorer students? Do any income brackets do worse than the next-poorer
# bracket?

In [74]:
# Read the verbal SAT scores data
file_name = 'data/sat-scores.csv'
use_cols=['Year', 'State.Code', 'Total.Verbal',
        'Family Income.Less than 20k.Verbal',
        'Family Income.Between 20-40k.Verbal',
        'Family Income.Between 40-60k.Verbal',
        'Family Income.Between 60-80k.Verbal',
        'Family Income.Between 80-100k.Verbal',
        'Family Income.More than 100k.Verbal']
scores_verbal = pd.read_csv(file_name, usecols=use_cols)

#Rename the columns to something shorter
columns_new_verbal = {
    'Family Income.Less than 20k.Verbal': 'income<20k',
    'Family Income.Between 20-40k.Verbal': '20k<income<40k',
    'Family Income.Between 40-60k.Verbal': '40k<income<60k',
    'Family Income.Between 60-80k.Verbal': '60k<income<80k',
    'Family Income.Between 80-100k.Verbal': '80k<income<100k',
    'Family Income.More than 100k.Verbal': 'income>100k'
}
scores_verbal = scores_verbal.rename(columns=columns_new_verbal)

# Rearrange the columns
scores_verbal = scores_verbal[['Year', 'State.Code', 'Total.Verbal',
                               'income<20k', '20k<income<40k',
                               '40k<income<60k', '60k<income<80k',
                               '80k<income<100k', 'income>100k']]

scores_verbal.head()

Unnamed: 0,Year,State.Code,Total.Verbal,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
0,2005,AL,567,474,527,551,564,577,590
1,2005,AK,523,467,500,522,519,534,544
2,2005,AZ,526,474,495,518,523,533,546
3,2005,AR,563,486,526,555,570,580,589
4,2005,CA,504,421,458,494,511,525,551


In [75]:
scores_verbal.groupby('Year')[columns_income].mean().T.pct_change()

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
income<20k,,,,,,,,,,,
20k<income<40k,0.09,0.11,0.1,0.1,0.1,0.06,0.09,0.09,0.06,0.1,0.12
40k<income<60k,0.05,0.04,0.05,0.04,0.04,0.05,0.04,0.04,0.05,0.03,0.05
60k<income<80k,0.02,0.02,0.02,0.0,0.01,0.02,0.03,0.03,0.02,0.04,0.03
80k<income<100k,0.02,0.02,0.02,0.01,0.02,0.02,0.02,0.02,0.03,0.02,0.02
income>100k,0.03,0.03,0.03,0.01,0.02,0.03,0.04,0.03,0.03,0.02,0.04
