In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
filename = '../data/sat-scores.csv'

df = pd.read_csv(filename,
                usecols=['Year', 'State.Code', 'Total.Math', 
                         'Family Income.Less than 20k.Math', 
                         'Family Income.Between 20-40k.Math', 
                         'Family Income.Between 40-60k.Math', 
                         'Family Income.Between 60-80k.Math',
                         'Family Income.Between 80-100k.Math',
                         'Family Income.More than 100k.Math'])
df.head()

Unnamed: 0,Year,State.Code,Total.Math,Family Income.Between 20-40k.Math,Family Income.Between 40-60k.Math,Family Income.Between 60-80k.Math,Family Income.Between 80-100k.Math,Family Income.Less than 20k.Math,Family Income.More than 100k.Math
0,2005,AL,559,513,539,550,566,462,588
1,2005,AK,519,492,517,513,528,464,541
2,2005,AZ,530,498,520,524,534,485,554
3,2005,AR,552,513,543,553,570,489,572
4,2005,CA,522,477,506,521,535,451,566


In [3]:
# Rename the income-related column names
df = df.rename(columns={'Family Income.Less than 20k.Math':'income<20k',
                'Family Income.Between 20-40k.Math':'20k<income<40k',
                'Family Income.Between 40-60k.Math':'40k<income<60k',
                'Family Income.Between 60-80k.Math':'60k<income<80k',
                'Family Income.Between 80-100k.Math':'80k<income<100k',
                'Family Income.More than 100k.Math':'income>100k'})


# Beyond 1

Calculate descriptive statistics for all of the changes in income brackets.  Where do we see the largest difference between income brackets?

In [4]:
change = df.groupby('Year')[['income<20k',
                      '20k<income<40k',
                      '40k<income<60k',
                      '60k<income<80k',
                      '80k<income<100k',
                      'income>100k']].mean().T.pct_change() 

# largest is for the wealthiest students, whose average scores are 
# far higher than any other income bracket.

change.T.describe() 

Unnamed: 0,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
count,0.0,11.0,11.0,11.0,11.0,11.0
mean,,0.083929,0.04526,0.020744,0.025247,0.034399
std,,0.026723,0.009055,0.008745,0.004821,0.008947
min,,0.04426,0.034768,0.003136,0.015391,0.012418
25%,,0.073665,0.04145,0.019374,0.023645,0.035291
50%,,0.083207,0.043872,0.023743,0.024947,0.036175
75%,,0.094456,0.045532,0.026016,0.028681,0.039508
max,,0.142793,0.069618,0.029694,0.032817,0.042319


# Beyond 2

Which five states have the greatest gap in SAT math scores between the richest and poorest students?

In [5]:
df['rich_poor_diff'] = df['income>100k'] - df['income<20k']

df.groupby('State.Code')['rich_poor_diff'].mean().sort_values(ascending=False).head()

State.Code
ND    341.909091
WY    246.454545
DC    208.818182
SD    157.000000
MS    140.000000
Name: rich_poor_diff, dtype: float64

# Beyond 3

We analyzed math scores. If we perform the same analysis on verbal SAT scores, will we similarly see that wealthier students generally do better than poorer students?  Are there any income brackets that do worse than the next-poorer bracket?

In [6]:
filename = '../data/sat-scores.csv'

df = pd.read_csv(filename,
                usecols=['Year', 'State.Code', 'Total.Verbal', 
                         'Family Income.Less than 20k.Verbal', 
                         'Family Income.Between 20-40k.Verbal', 
                         'Family Income.Between 40-60k.Verbal', 
                         'Family Income.Between 60-80k.Verbal',
                         'Family Income.Between 80-100k.Verbal',
                         'Family Income.More than 100k.Verbal'])

df.columns = ['Year', 'State.Code', 'Total.Verbal',
                      'income<20k',
                      '20k<income<40k',
                      '40k<income<60k',
                      '60k<income<80k',
                      '80k<income<100k',
                      'income>100k',
                      ]

df.head()

Unnamed: 0,Year,State.Code,Total.Verbal,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
0,2005,AL,567,527,551,564,577,474,590
1,2005,AK,523,500,522,519,534,467,544
2,2005,AZ,526,495,518,523,533,474,546
3,2005,AR,563,526,555,570,580,486,589
4,2005,CA,504,458,494,511,525,421,551


In [7]:

change = df.groupby('Year')[['income<20k',
                      '20k<income<40k',
                      '40k<income<60k',
                      '60k<income<80k',
                      '80k<income<100k',
                      'income>100k']].mean().T.pct_change() 

change[change <= 0].dropna()

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
80k<income<100k,-0.165165,-0.16959,-0.166568,-0.141209,-0.152243,-0.143227,-0.160203,-0.15622,-0.13885,-0.162701,-0.18397
