In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import bar_chart_race as bcr

In [2]:
def build_data(var_to_graph): # This function imports the institution name by default + 1 var you specify. It outputs a formatted dataframe w/institutions as columns
    names = []
    csvs = glob.glob('../data/raw/MERGED*.csv')

    #make a long df
    for proj in csvs:
        frame = pd.read_csv(proj, usecols=['INSTNM', var_to_graph]).dropna()
        frame['year'] = int(proj[-14:-10])
        names.append(frame)
    
    # unlongify the data
    names = pd.concat(names)
    x = pd.pivot_table(names, values=var_to_graph, index=['INSTNM'], columns='year', aggfunc=np.sum)
    
    #replace NaN w/0
    x.fillna(0, inplace=True)
    x.sort_values(list(x.columns), inplace=True)
    x[2021] = x[2020] # This adds an extra frame at the end. DELETE if you don't want it.
    x = x.sort_index()
    
    return x.T

In [3]:
def get_top_x(x, df): #gets the top x universities in a given dataframe
    top_x = set()

    for index, row in df.iterrows():
        top_x |= set(row[row > 0].sort_values(ascending=False).head(x).index)

    df = df[top_x]
    return df

In [4]:
df = build_data('UGDS')
df = get_top_x(5, df)

  df = df[top_x]


In [5]:
#build fast chart
bcr.bar_chart_race(df=df, 
                   bar_label_size=8,
                   tick_label_size=10,
                   title_size='large',
                   period_label={'x': .99, 'y': .1, 'ha': 'right', 'color': 'red', 'size': 20},
                   n_bars = 5, 
                   sort='desc', 
                   steps_per_period=10,
                   period_length=100,
                   figsize=(8,4.5),
                   dpi=144,
                   cmap='pastel1',
                   title='Which University\'s enrolled the most students between 1996-2020?', 
                   filename='fastTEST.mp4')

# bcr.bar_chart_race(df, bar_label_size=4, tick_label_size=5,
#                    title='COVID-19 Deaths by Country', title_size='smaller')

  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))


In [6]:
# Additional cleaning cell - depends on output of previous chart
df['University of Phoenix'] = df['University of Phoenix-Arizona'] + df['University of Phoenix-Online Campus']
df.drop(["University of Phoenix-Arizona", "University of Phoenix-Online Campus", "University of Phoenix"], 1, inplace=True)

#delete 0 dudes
df = df.replace(0,np.nan).dropna(axis=1,how="any")

  df.drop(["University of Phoenix-Arizona", "University of Phoenix-Online Campus", "University of Phoenix"], 1, inplace=True)


In [7]:
# final chart cell
def make_chart(df, title, filename):
    confirm = input('If you want to spend a lot of time rendering this plot, type "yes"')
    if confirm == 'yes':
        bcr.bar_chart_race(df=df,
                       bar_label_size=16,
                       tick_label_size=20,
                       title_size=25,
                       period_label={'x': .99, 'y': .1, 'ha': 'right', 'color': 'black', 'size': 25},
                       n_bars = 5, 
                       sort='desc', 
                       steps_per_period=30, #frames per period
                       period_length=1000, #length in miliseconds 
                       figsize=(16,9),
                       dpi=300,
                       cmap='antique', # pastel1
                       title=title, 
                       filename=filename+'.mp4')
    else:
        print("understandable, have a nice day.")

In [8]:
make_chart(df, 'What are the five largest universities in the US?', 'highest_enrollment_16x9_1500_bigFont_antique')

If you want to spend a lot of time rendering this plot, type "yes" yes


  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))
