In [2]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [42]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

In [41]:
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import colorlover as cl
from IPython.display import HTML


# Use seaborn's 'darkgrid' theme for any matplotlib/seaborn figures drawn below.
sns.set_style('darkgrid')

# Request 'retina' (high-resolution) output from the inline backend.
%config InlineBackend.figure_format = 'retina'
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

In [5]:
# Load the three raw inputs: ACT results, SAT results, and the ACS income
# ranking export (the "_with_ann" file, which carries an extra descriptive row).
act = pd.read_csv(filepath_or_buffer='../data/act.csv')
sat = pd.read_csv(filepath_or_buffer='../data/sat.csv')
income = pd.read_csv(
    filepath_or_buffer='../data/ACS_11_1YR_R1901.US01PRF/ACS_11_1YR_R1901.US01PRF_with_ann.csv'
)

In [6]:
# Participation is read as text such as "5%". Strip the percent sign AND cast
# to int so the column is actually numeric (stripping alone leaves strings).
# This mirrors the numeric conversion applied to the ACT participation column.
sat.Participation = sat.Participation.str.replace("%", "").astype(int)

In [7]:
# Convert ACT participation from text such as "65%" to a float percentage.
act['Participation'] = act['Participation'].map(
    lambda pct_text: float(pct_text.replace("%", ""))
)

In [8]:
# Remove the row labelled 1 from the income table, in place.
# NOTE(review): the row labelled 0 still holds column descriptions
# ("Id", "Geography", "Dollar", ...), so the value columns remain text —
# confirm whether that row should be dropped as well.
income.drop(labels=[1], axis='index', inplace=True)

In [9]:
# Preview the first five rows of the income table.
income.head(n=5)

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,GRT_STUB.target-geo-id,GRT_STUB.target-geo-id2,GRT_STUB.rank-label,GRT_STUB.display-label,EST,MOE
0,Id,Id2,Geography,Target Geo Id,Target Geo Id2,Rank,Geographical Area,Dollar,Margin of Error
2,0100000US,,United States,0400000US24,24,1,Maryland,70004,804
3,0100000US,,United States,0400000US02,02,2,Alaska,67825,1948
4,0100000US,,United States,0400000US34,34,3,New Jersey,67458,721
5,0100000US,,United States,0400000US09,09,4,Connecticut,65753,854


In [10]:
# Map the raw ACS column names to readable ones; only these two are renamed.
# NOTE(review): the target name says "Average" — confirm that the EST column
# of this ACS table is a mean and not a median.
income_column_rename = dict([
    ('GRT_STUB.display-label', 'State'),
    ('EST', 'Average State Income'),
])

income.rename(columns=income_column_rename, inplace=True)

In [11]:
# Write the renamed income table to income.csv in the working directory,
# leaving the index out of the file.
income.to_csv(path_or_buf='income.csv', index=False)

In [12]:
# Read the saved file back to check the round trip; the frame is only
# displayed here, not bound to a name.
pd.read_csv(filepath_or_buffer='income.csv')

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,GRT_STUB.target-geo-id,GRT_STUB.target-geo-id2,GRT_STUB.rank-label,State,Average State Income,MOE
0,Id,Id2,Geography,Target Geo Id,Target Geo Id2,Rank,Geographical Area,Dollar,Margin of Error
1,0100000US,,United States,0400000US24,24,1,Maryland,70004,804
2,0100000US,,United States,0400000US02,02,2,Alaska,67825,1948
3,0100000US,,United States,0400000US34,34,3,New Jersey,67458,721
4,0100000US,,United States,0400000US09,09,4,Connecticut,65753,854
5,0100000US,,United States,0400000US11,11,5,District of Columbia,63124,2407
6,0100000US,,United States,0400000US25,25,6,Massachusetts,62859,902
7,0100000US,,United States,0400000US33,33,7,New Hampshire,62647,1415
8,0100000US,,United States,0400000US51,51,8,Virginia,61882,507
9,0100000US,,United States,0400000US15,15,9,Hawaii,61821,1035


In [13]:
# Preview the income table in its stored order.
# DataFrame.sort_index returns a new frame rather than sorting in place, so a
# bare `income.sort_index(ascending=False)` call has no effect on `income`.
# Assign the result to a new name if a reversed view is actually needed.
income.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,GRT_STUB.target-geo-id,GRT_STUB.target-geo-id2,GRT_STUB.rank-label,State,Average State Income,MOE
0,Id,Id2,Geography,Target Geo Id,Target Geo Id2,Rank,Geographical Area,Dollar,Margin of Error
2,0100000US,,United States,0400000US24,24,1,Maryland,70004,804
3,0100000US,,United States,0400000US02,02,2,Alaska,67825,1948
4,0100000US,,United States,0400000US34,34,3,New Jersey,67458,721
5,0100000US,,United States,0400000US09,09,4,Connecticut,65753,854


In [14]:
# Remove the unnecessary 'Unnamed: 0' column (in place).
del sat['Unnamed: 0']

In [15]:
# Attach two-letter postal codes to the SAT table.
# The assignment is positional: it assumes `sat` has exactly these 51 rows in
# this (alphabetical by state name) order — TODO confirm against the data.
sat['colname'] = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL",
    "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
    "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",
    "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",
    "WY",
]

In [16]:
# Rename the placeholder 'colname' column to 'code', the name the map cells use.
sat_column_rename = dict(colname='code')

sat.rename(columns=sat_column_rename, inplace=True)

In [17]:
# Write the cleaned SAT table to sat.csv in the working directory,
# leaving the index out of the file.
sat.to_csv(path_or_buf='sat.csv', index=False)

In [18]:
# Read the saved file back to check the round trip; the frame is only
# displayed here, not bound to a name.
pd.read_csv(filepath_or_buffer='sat.csv')

Unnamed: 0,State,Participation,Evidence-Based Reading and Writing,Math,Total,code
0,Alabama,5,593,572,1165,AL
1,Alaska,38,547,533,1080,AK
2,Arizona,30,563,553,1116,AZ
3,Arkansas,3,614,594,1208,AR
4,California,53,531,524,1055,CA
5,Colorado,11,606,595,1201,CO
6,Connecticut,100,530,512,1041,CT
7,Delaware,100,503,492,996,DE
8,District of Columbia,100,482,468,950,DC
9,Florida,83,520,497,1017,FL


In [19]:
# Remove the unnecessary 'Unnamed: 0' column (in place).
del act['Unnamed: 0']

In [20]:
# Remove the National summary row (index label 0) so only states remain.
act.drop(labels=[0], axis='index', inplace=True)

In [21]:
# Attach two-letter postal codes to the ACT table.
# The assignment is positional: it assumes `act` has exactly these 51 rows in
# this (alphabetical by state name) order — TODO confirm against the data.
act['code'] = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL",
    "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
    "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",
    "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",
    "WY",
]

In [22]:
# Write the cleaned ACT table to act.csv in the working directory,
# leaving the index out of the file.
act.to_csv(path_or_buf='act.csv', index=False)

In [23]:
# Read the saved file back to check the round trip; the frame is only
# displayed here, not bound to a name.
pd.read_csv(filepath_or_buffer='act.csv')

Unnamed: 0,State,Participation,English,Math,Reading,Science,Composite,code
0,Alabama,100.0,18.9,18.4,19.7,19.4,19.2,AL
1,Alaska,65.0,18.7,19.8,20.4,19.9,19.8,AK
2,Arizona,62.0,18.6,19.8,20.1,19.8,19.7,AZ
3,Arkansas,100.0,18.9,19.0,19.7,19.5,19.4,AR
4,California,31.0,22.5,22.7,23.1,22.2,22.8,CA
5,Colorado,100.0,20.1,20.3,21.2,20.9,20.8,CO
6,Connecticut,31.0,25.5,24.6,25.6,24.6,25.2,CT
7,Delaware,18.0,24.1,23.4,24.8,23.6,24.1,DE
8,District of Columbia,32.0,24.4,23.5,24.9,23.5,24.2,DC
9,Florida,73.0,19.0,19.4,21.0,19.4,19.8,FL


In [24]:
import plotly.plotly as py
import pandas as pd

# Choropleth map of SAT participation rate by state.
#
# `df` is an alias of `sat`, not a copy, so the 'text' column added below is
# also visible on `sat` in later cells.
df = sat

# Build the hover label with explicit string casts on just the two columns
# involved. Casting every column of the aliased frame to str would silently
# turn the numeric score columns of `sat` into text for the rest of the
# notebook.
df['text'] = (
    df['State'].astype(str)
    + '<br>'
    + 'Participation: ' + df['Participation'].astype(str)
)

data = [dict(
    type='choropleth',
    # Other named colorscales that can be used here: 'Blackbody', 'Bluered',
    # 'Blues', 'Earth', 'Electric', 'Greens', 'Greys', 'Hot', 'Jet', 'Picnic',
    # 'Portland', 'Rainbow', 'RdBu', 'Reds', 'YlGnBu', 'YlOrRd'.
    colorscale='Viridis',
    autocolorscale=False,
    locations=df['code'],                   # two-letter state codes
    z=df['Participation'].astype(float),    # value that drives the colour
    locationmode='USA-states',
    text=df['text'],
    marker=dict(
        line=dict(
            color='rgb(255,255,255)',       # white state borders
            width=2,
        ),
    ),
    colorbar=dict(title="Percentage Rates"),
)]

layout = dict(
    title='SAT Participation Rates by State',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig)

In [25]:
# Make sure the Total score column is integer-typed before sorting/plotting.
sat['Total'] = sat['Total'].astype(int)

In [26]:
# Order the SAT table by Total score, ascending, in place.
sat.sort_values(by='Total', inplace=True)

In [27]:
# Bar chart of the average total SAT score for every state, drawn in the
# current row order of `sat`.
x0 = sat.State
y0 = sat.Total

trace1 = go.Bar(
    x=x0,
    y=y0,
    marker=dict(color='rgb(153, 255, 153)'),
)

data = [trace1]

# The y axis is restricted to 900-1300, so bars do not start at zero.
layout = go.Layout(
    autosize=True,
    title="SAT Composite Score for All States",
    xaxis=dict(title=''),
    yaxis=dict(title='Average Total Score', range=[900, 1300]),
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

In [28]:
# Scatter plot of SAT participation rate (x) against average total score (y),
# one marker per state.

layout = go.Layout(
    autosize=True,
    title="Correlation between Participation Rates and Total SAT Score",
    xaxis={'title': 'Participation Rate'},
    yaxis={'title': 'Average Total Score', 'range': [800, 1400]})

# Cast explicitly so the x axis is numeric even if an earlier cell left the
# participation column as text.
x0 = sat.Participation.astype(float)
y0 = sat.Total

# Colour each marker by its own total score. The colour array must have one
# entry per plotted point and describe the data; random values (which also
# required numpy, never imported in this notebook) carried no information.
trace1 = go.Scatter(x=x0, y=y0,
                    mode='markers',
                    text=sat.State,  # state name shown on hover
                    marker={'color': y0,
                            'colorscale': 'Viridis',
                            'size': 14,
                            'showscale': True,
                            'colorbar': {'title': 'Average Total Score'}})

data = [trace1]

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

In [29]:
# Boolean mask: states whose SAT participation rate is above 50.
participation_over_50 = sat['Participation'].astype(int).gt(50)

In [30]:
# Boolean mask: states whose SAT participation rate is above 75.
participation_over_75 = sat['Participation'].astype(int).gt(75)

In [31]:
# Boolean mask: states whose SAT participation rate is anything other than 100
# (100 is treated here as "mandatory").
sat_not_mandatory = sat['Participation'].astype(int).ne(100)

In [33]:
# Preview the first five states whose participation is not 100.
sat.loc[sat_not_mandatory].head(n=5)

Unnamed: 0,State,Participation,Evidence-Based Reading and Writing,Math,Total,code,text
12,Idaho,93,513,493,1005,ID,Idaho<br>Participation: 93
19,Maine,95,513,499,1012,ME,Maine<br>Participation: 95
9,Florida,83,520,497,1017,FL,Florida<br>Participation: 83
43,Texas,62,513,507,1020,TX,Texas<br>Participation: 62
36,Oklahoma,7,530,517,1047,OK,Oklahoma<br>Participation: 7


In [34]:
# Boolean mask: participation is above 50 but not equal to 100. The integer
# cast is done once and both conditions are applied to the same series.
sat_not_mandatory_and_over_50 = sat['Participation'].astype(int).pipe(
    lambda rate: (rate != 100) & (rate > 50)
)

In [37]:
# Preview the first five states with participation above 50 and not 100.
sat.loc[sat_not_mandatory_and_over_50].head(n=5)

Unnamed: 0,State,Participation,Evidence-Based Reading and Writing,Math,Total,code,text
12,Idaho,93,513,493,1005,ID,Idaho<br>Participation: 93
19,Maine,95,513,499,1012,ME,Maine<br>Participation: 95
9,Florida,83,520,497,1017,FL,Florida<br>Participation: 83
43,Texas,62,513,507,1020,TX,Texas<br>Participation: 62
10,Georgia,61,535,515,1050,GA,Georgia<br>Participation: 61


In [38]:
# Bar chart of average total SAT score, restricted to states whose
# participation rate is above 50 but not 100.

layout = go.Layout(
    autosize=True,
    title="SAT Composite Score for Non-Mandatory States with Participation Rates Higher than 50%",
    xaxis={'title': ''},
    yaxis={'title': 'Average Total Score', 'range': [900, 1100]})

# Apply the SAME mask to both axes. Bars are paired by position, so plotting
# the filtered state names against the unfiltered Total column would label
# each bar with a score belonging to a different state.
x0 = sat.loc[sat_not_mandatory_and_over_50, 'State']
y0 = sat.loc[sat_not_mandatory_and_over_50, 'Total']

trace1 = go.Bar(x=x0, y=y0,
                marker={'color': 'rgb(153, 255, 153)'})

data = [trace1]

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

In [39]:
# Choropleth map of ACT participation rate by state.
#
# `df` is an alias of `act`, not a copy, so the 'text' column added below is
# also visible on `act` in later cells.
df = act

# Build the hover label with explicit string casts on just the two columns
# involved. Casting every column of the aliased frame to str would silently
# turn the numeric columns of `act` (including Participation, which later
# cells plot) into text.
df['text'] = (
    df['State'].astype(str)
    + '<br>'
    + 'Participation: ' + df['Participation'].astype(str)
)

data = [dict(
    type='choropleth',
    # Other named colorscales that can be used here: 'Blackbody', 'Bluered',
    # 'Blues', 'Earth', 'Electric', 'Greens', 'Greys', 'Hot', 'Jet', 'Picnic',
    # 'Portland', 'Rainbow', 'RdBu', 'Reds', 'YlGnBu', 'YlOrRd'.
    colorscale='Viridis',
    autocolorscale=False,
    locations=df['code'],                   # two-letter state codes
    z=df['Participation'].astype(float),    # value that drives the colour
    locationmode='USA-states',
    text=df['text'],
    marker=dict(
        line=dict(
            color='rgb(255,255,255)',       # white state borders
            width=2,
        ),
    ),
    colorbar=dict(title="Percentage Rates"),
)]

layout = dict(
    title='ACT Participation Rates by State',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
    ),
)

fig = dict(data=data, layout=layout)
py.iplot(fig)

In [40]:
# Overlaid histograms comparing how ACT and SAT participation rates are
# distributed across states.

layout = go.Layout(
    autosize=False,
    barmode='overlay',
    title="SAT v. ACT Participation",
    xaxis={'title': 'Percentage'},
    yaxis={'title': 'Count'})

# Cast explicitly to float: earlier cells may have left either participation
# column as text, and the numeric `xbins` below only make sense for numeric
# input.
x0 = act.Participation.astype(float)
x1 = sat.Participation.astype(float)
hist_data = [x0, x1]

# Both traces share the same bins: width 10, starting at 0. The end value of
# 101 (rather than 100) keeps a rate of exactly 100 inside the binned range.
trace1 = go.Histogram(x=x0,
                      autobinx=False,
                      name=("ACT Participation"),
                      marker={'color': 'rgb(255, 255, 102)'},
                      opacity=0.75,
                      xbins={'start': 0, 'end': 101, 'size': 10})

trace2 = go.Histogram(x=x1,
                      autobinx=False,
                      name=("SAT Participation"),
                      marker={'color': 'rgb(0, 204, 204)'},
                      opacity=0.75,
                      xbins={'start': 0, 'end': 101, 'size': 10})

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)