In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import bar_chart_race as bcr

In [126]:
def build_data(var_to_graph): # This function imports the institution name by default + 1 var you specify. It outputs a formatted dataframe w/institutions as columns
    names = []
    csvs = glob.glob('../data/raw/MERGED*.csv')
    csvs=csvs

    #make a long df
    for proj in csvs:
        frame = pd.read_csv(proj, usecols=['INSTNM', 'UGDS', var_to_graph]).dropna()
        frame['year'] = int(proj[-14:-10])
        names.append(frame)
    
    # unlongify the data
    names = pd.concat(names)
    names = names[names['UGDS'] > 3000]
    x = pd.pivot_table(names, values=var_to_graph, index=['INSTNM'], columns='year', aggfunc='first')
    
    #replace NaN w/0
    x.fillna(0, inplace=True)
    x.sort_values(list(x.columns), inplace=True)
    x[2021] = x[2020] # This adds an extra frame at the end. DELETE if you don't want it.
    x = x.sort_index()
    
    return x.T

In [127]:
def get_top_x(x, df): #gets the top x universities in a given dataframe
    top_x = set()

    for index, row in df.iterrows():
        top_x |= set(row[row > 0].sort_values(ascending=False).head(x).index)

    df = df[top_x]
    return df

In [137]:
df = build_data('OMAWDP6_FTFT')
# df = df.loc[:, df.apply(lambda x: ~x.astype(str).str.contains('Privacy').any())]
df = get_top_x(25, df)

  df = df[top_x]


In [138]:
df

INSTNM,Cornell University,Princeton University,Hacienda La Puente Adult Education,Walden University,Vista College,University of Puerto Rico-Bayamon,University of Pennsylvania,Stanford University,Yale University,Boston College,...,Duke University,University of California-Berkeley,Brown University,University of Virginia-Main Campus,University of Michigan-Ann Arbor,University of Southern California,Rice University,Massachusetts Institute of Technology,Georgetown University,William & Mary
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015,0.9334,0.9694,0.9124,0.0,0.9769,0.3924,0.9577,0.9576,0.9779,0.913,...,0.9423,0.9103,0.9444,0.9307,0.9001,0.9117,0.9111,0.9307,0.9245,0.8993
2016,0.932,0.9678,0.9338,0.3333,0.9485,0.3518,0.9561,0.946,0.962,0.9131,...,0.9485,0.9069,0.9612,0.9416,0.9102,0.9084,0.915,0.913,0.9491,0.9044
2017,0.9361,0.9684,0.907,0.0238,0.8405,0.3408,0.9475,0.938,0.9686,0.9258,...,0.9623,0.9184,0.9597,0.9327,0.9013,0.918,0.9056,0.8991,0.9394,0.8998
2018,0.9388,0.9756,0.0,0.1333,0.0,0.3868,0.9522,0.9402,0.9368,0.9253,...,0.9451,0.9194,0.9552,0.9417,0.9119,0.9155,0.9312,0.9266,0.9426,0.9148
2019,0.9339,0.9731,0.0,1.0,0.0,0.7556,0.958,0.9425,0.9712,0.919,...,0.948,0.912,0.9514,0.9455,0.9158,0.9164,0.9136,0.9384,0.9461,0.9211
2020,0.9452,0.9638,0.0,1.0,0.0,0.9297,0.9549,0.9438,0.9661,0.9188,...,0.9562,0.9158,0.9504,0.9431,0.9193,0.9191,0.9471,0.9416,0.9413,0.9111
2021,0.9452,0.9638,0.0,1.0,0.0,0.9297,0.9549,0.9438,0.9661,0.9188,...,0.9562,0.9158,0.9504,0.9431,0.9193,0.9191,0.9471,0.9416,0.9413,0.9111


In [134]:
#build fast chart
bcr.bar_chart_race(df=df, 
                   bar_label_size=8,
                   tick_label_size=10,
                   title_size='large',
                   period_label={'x': .99, 'y': .1, 'ha': 'right', 'color': 'black', 'size': 20},
                   n_bars = 5, 
                   sort='desc', 
                   steps_per_period=10,
                   period_length=2000,
                   figsize=(8,4.5),
                   dpi=144,
                   cmap='pastel1',
                   title='Which University\'s has highest graduation rates?', 
                   filename='fastTEST.mp4')

# bcr.bar_chart_race(df, bar_label_size=4, tick_label_size=5,
#                    title='COVID-19 Deaths by Country', title_size='smaller')

  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))


In [139]:
# # Additional cleaning cell - depends on output of previous chart
# df['University of Phoenix'] = df['University of Phoenix-Arizona'] + df['University of Phoenix-Online Campus']
# df.drop(["University of Phoenix-Arizona", "University of Phoenix-Online Campus", "University of Phoenix"], 1, inplace=True)

#delete 0 dudes
df = df.replace(0,np.nan).dropna(axis=1,how="any")
df[df.select_dtypes(include=['number']).columns] *= 100

In [140]:
# final chart cell
def make_chart(df, title, filename):
    confirm = input('If you want to spend a lot of time rendering this plot, type "yes"')
    if confirm == 'yes':
        bcr.bar_chart_race(df=df,
                       bar_label_size=16,
                       tick_label_size=20,
                       title_size=25,
                       period_label={'x': .99, 'y': .1, 'ha': 'right', 'color': 'black', 'size': 25},
                       n_bars = 25, 
                       sort='desc', 
                       steps_per_period=30, #frames per period
                       period_length=1000, #length in miliseconds 
                       figsize=(16,9),
                       dpi=300,
                       cmap='antique', # pastel1
                       title=title, 
                       filename=filename+'.mp4')
    else:
        print("understandable, have a nice day.")

In [142]:
make_chart(df, 'What universities have the highest graduation rates in the US?', 'highest_gradRate_16x9_1500_bigFont_antique_25')

If you want to spend a lot of time rendering this plot, type "yes" yes


  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))
