In [13]:
import pandas as pd
import random
import chart_studio.plotly as py
import plotly.graph_objects as go

In [133]:
# Load the program outcomes table; the CSV's first column is used as the row index.
bach_prog = pd.read_csv('full_bach_pdf.csv', index_col=0)
# Number of distinct values in the 'cip_code' column.
print(len(bach_prog['cip_code'].unique()))
# Bare last expression so the notebook displays the frame.
bach_prog

106


Unnamed: 0,cip_code,total_graduates,knowledge_rate,career_outcomes_perc,total_perc_employed_overall,total_perc_employed_full_time,total_perc_employed_part_time,perc_standard_employment_overall,perc_standard_full_time,perc_standard_part_time,...,perc_fellowship_intern_part_time,perc_service,perc_military,perc_continuing_ed,perc_seeking_employment,perc_seeking_continuing_ed,perc_not_seeking,mean_starting_salary,year,perc_still_seeking_overall
0,101,1618,74.8,86.9,75.9,72.9,3.0,68.8,66.8,2.1,...,0.2,0.1,0.2,9.9,11.1,1.9,0.9,45862,2016,
1,109,1758,67.6,85.2,54.9,5.8,4.0,47.8,44.8,3.0,...,0.7,0.0,0.1,29.2,1.0,4.6,1.2,3674,2016,
2,301,3646,62.7,73.4,56.2,46.7,9.5,47.1,39.7,7.3,...,0.6,2.6,0.9,12.1,19.3,6.8,2.1,3699,2016,
3,402,1929,64.7,82.8,57.1,54.5,2.6,5.2,48.4,1.8,...,0.6,0.5,0.3,24.0,15.8,1.3,1.0,42818,2016,
4,501,1649,68.8,83.0,58.7,52.5,6.3,49.6,45.1,4.5,...,0.4,2.9,1.9,17.3,1.6,6.0,2.6,464,2016,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,5209,2662,66.1,81.5,71.8,63.5,8.4,65.1,58.3,6.8,...,0.6,0.3,0.3,7.8,16.4,1.6,1.7,38989,2021,18.1
247,5210,2426,71.8,85.2,7.5,65.2,5.3,65.8,61.4,4.4,...,0.3,0.2,0.7,13.1,11.7,3.0,0.9,562,2021,14.7
248,5213,2764,76.6,89.0,76.1,74.4,1.7,69.7,68.4,1.3,...,0.2,0.0,0.3,11.9,9.9,1.0,0.8,65843,2021,1.9
249,5214,1413,72.1,86.5,76.6,71.7,4.9,7.9,67.3,3.6,...,0.7,0.1,0.3,8.8,12.3,1.0,0.8,49481,2021,13.3


In [134]:
# Average every column over all rows that share a cip_code.
# NOTE(review): 'year' is averaged too (it shows up in the resulting columns),
# which is probably not a meaningful quantity -- confirm before using it.
grouped_bach = bach_prog.groupby('cip_code').mean()
grouped_bach.columns

Index(['total_graduates', 'knowledge_rate', 'career_outcomes_perc',
       'total_perc_employed_overall', 'total_perc_employed_full_time',
       'total_perc_employed_part_time', 'perc_standard_employment_overall',
       'perc_standard_full_time', 'perc_standard_part_time',
       'perc_entrepreneur_overall', 'perc_entrepreneur_full_time',
       'perc_entrepreneur_part_time', 'perc_temp_contract_overall',
       'perc_temp_contract_full_time', 'perc_temp_contract_part_time',
       'perc_freelance_overall', 'perc_freelance_full_time',
       'perc_freelance_part_time', 'perc_fellowship_intern_overall',
       'perc_fellowship_intern_full_time', 'perc_fellowship_intern_part_time',
       'perc_service', 'perc_military', 'perc_continuing_ed',
       'perc_seeking_employment', 'perc_seeking_continuing_ed',
       'perc_not_seeking', 'mean_starting_salary', 'year',
       'perc_still_seeking_overall'],
      dtype='object')

In [135]:
def get_rand_cips(prog_data, cip_count=25, seed=None):
    """Pick a random set of distinct 'cip_code' values from a program table.

    Parameters
    ----------
    prog_data : pandas.DataFrame
        Table containing a 'cip_code' column.
    cip_count : int, default 25
        Number of codes to draw. Capped at the number of distinct codes
        available, so asking for more than exist returns every code once.
    seed : int or None, default None
        When given, the draw uses a private ``random.Random(seed)`` so the
        sample is reproducible. When None, the module-level ``random``
        generator is used (and therefore honours any global ``random.seed``).

    Returns
    -------
    list
        Distinct 'cip_code' values in random order.
    """
    unique_cips = prog_data['cip_code'].unique().tolist()
    rng = random if seed is None else random.Random(seed)

    # Sample WITHOUT replacement: random.choices draws with replacement, which
    # returned repeated codes and so fewer distinct programs than requested.
    draw_size = max(0, min(cip_count, len(unique_cips)))
    return rng.sample(unique_cips, k=draw_size)

def histogram(prog_data, cip_list, perc_variable, nbins=20):
    """Histogram of one column's per-cip_code mean for the selected codes.

    Parameters
    ----------
    prog_data : pandas.DataFrame
        Table containing 'cip_code' and the column named by `perc_variable`.
    cip_list : iterable
        cip_code values to keep; rows with other codes are ignored.
    perc_variable : str
        Name of the column to average per cip_code and plot.
    nbins : int, default 20
        Passed to the histogram trace as ``nbinsx``.

    Returns
    -------
    plotly.graph_objects.Figure
        The histogram figure (not shown; call ``.show()`` on it).
    """
    selected = prog_data[prog_data['cip_code'].isin(cip_list)]

    # Aggregate only the plotted column. Averaging the whole frame is wasted
    # work and raises on any non-numeric column under pandas >= 2.0.
    per_cip_mean = selected.groupby('cip_code')[perc_variable].mean()

    hist = go.Figure(data=[go.Histogram(x=per_cip_mean, nbinsx=nbins)])
    hist.update_layout(
        title="Distribution of "+perc_variable,
        xaxis_title=perc_variable,
        yaxis_title="Count"
    )

    return hist

# Draw one random set of cip_codes; the histogram cells reuse this same sample.
sample_cips = get_rand_cips(bach_prog)

histogram(bach_prog, sample_cips, 'total_perc_employed_overall').show()

In [136]:
# Same sample_cips as the previous histogram, different column.
histogram(bach_prog, sample_cips, 'perc_continuing_ed').show()

In [137]:
def scatter(prog_data, cip_list):
    """Scatter of mean employment vs. mean continuing-education share per cip_code.

    Parameters
    ----------
    prog_data : pandas.DataFrame
        Table containing 'cip_code', 'total_perc_employed_overall' and
        'perc_continuing_ed'.
    cip_list : iterable
        cip_code values to keep; rows with other codes are ignored.

    Returns
    -------
    plotly.graph_objects.Figure
        One marker per cip_code (hover text is the cip_code), plus a dash-dot
        reference line.
    """
    x_col = 'perc_continuing_ed'
    y_col = 'total_perc_employed_overall'

    selected = prog_data[prog_data['cip_code'].isin(cip_list)]

    # Aggregate only the two plotted columns. Averaging the whole frame is
    # wasted work and raises on any non-numeric column under pandas >= 2.0.
    grouped = selected.groupby('cip_code')[[x_col, y_col]].mean()

    scat = go.Figure(data=[go.Scatter(y=grouped[y_col],
                                      x=grouped[x_col],
                                      mode='markers',
                                      hovertext=grouped.index,
                                      hoverinfo='text')])

    # Reference line from (80, 20) to (0, 100): the points where the two
    # plotted percentages add up to 100.
    reference_line = dict(type='line',
                          yref='y', y0=20, y1=100,
                          xref='x', x0=80, x1=0,
                          opacity=0.5,
                          line=dict(color='blue',
                                    width=2,
                                    dash='dashdot'))

    # Label the figure so it can be read on its own, as histogram() does.
    scat.update_layout(
        title=y_col + " vs " + x_col,
        xaxis_title=x_col,
        yaxis_title=y_col,
        shapes=[reference_line]
    )

    return scat

# Draw a fresh sample; this overwrites the sample_cips used by the histogram cells.
sample_cips = get_rand_cips(bach_prog)

# Bare last expression: the notebook renders the returned figure.
scatter(bach_prog, sample_cips)

In [199]:
import plotly.express as px
import math

# Request a larger sample (37 codes) for the salary plots; overwrites sample_cips again.
sample_cips = get_rand_cips(bach_prog, 37)

def salary_mults(prog_data, cip_list, facet_wrap=6):
    """Small-multiples plot of mean starting salary by year, one panel per cip_code.

    Parameters
    ----------
    prog_data : pandas.DataFrame
        Table containing 'cip_code', 'year' and 'mean_starting_salary'.
    cip_list : iterable
        cip_code values to keep; rows with other codes are ignored.
    facet_wrap : int, default 6
        Number of panels per row (passed as ``facet_col_wrap``).

    Returns
    -------
    plotly.graph_objects.Figure
        The faceted figure (not shown; call ``.show()`` on it).
    """
    selected = prog_data[prog_data['cip_code'].isin(cip_list)]

    # Markers are joined by lines in row order, so make sure each panel's rows
    # run chronologically. A stable sort leaves already-ordered input as is.
    selected = selected.sort_values('year', kind='stable')

    fig = px.scatter(selected, x='year', y='mean_starting_salary',
                     facet_col='cip_code', facet_col_wrap=facet_wrap)
    fig.update_traces(mode='lines+markers')

    # Blank the per-panel axis titles; shared labels are added as annotations.
    fig.update_yaxes(title='', nticks=3)
    fig.update_xaxes(title='')

    # Shared x label: centred across the whole figure ('paper'), vertically
    # placed relative to the first y-axis domain.
    fig.add_annotation(xref='paper', yref='y domain',
                       x=0.5, y=-1, text='Year', showarrow=False,
                       font=dict(size=12))

    # Shared y label: rotated, vertically centred on the figure, horizontally
    # placed relative to the first x-axis domain.
    fig.add_annotation(xref='x domain', yref='paper',
                       x=-0.5, y=0.5, text='Mean Starting Salary',
                       showarrow=False, textangle=-90,
                       font=dict(size=12))

    fig.update_layout(uniformtext_minsize=12)

    return fig
    
# Render the small-multiples salary figure for the current sample_cips.
salary_mults(bach_prog, sample_cips).show()

In [106]:
# Not Good

def reg_salary(prog_data, cip_list):
    """Single-axes line chart of mean starting salary by year, one coloured
    spline per cip_code, restricted to the codes in `cip_list`.

    Returns the plotly express figure without showing it.
    """
    in_selection = prog_data['cip_code'].isin(cip_list)
    return px.line(
        prog_data[in_selection],
        x='year',
        y='mean_starting_salary',
        color='cip_code',
        line_shape='spline',
    )

# Render the overlaid version using the sample_cips assigned in an earlier cell.
reg_salary(bach_prog, sample_cips).show()