# Load data, import modules, cleanup data

In [258]:
# Adding the data dictionary to help with labeling information
!wget https://collegescorecard.ed.gov/assets/CollegeScorecardDataDictionary.xlsx &> /dev/null
!echo "Dictionary downloaded"

Dictionary downloaded


In [259]:
from google.colab import drive
from google.colab import files

import os
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
# import seaborn as sbn
# import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.options.display.float_format = ':,.2f'.format


In [260]:
drive.mount("/content/gdrive") # mount google drive, will prompt for permission

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [261]:
# read in dataset 
df = pd.read_csv("gdrive/MyDrive/W200_Project_2/w200_dataset.csv") # from my drive, change path to gdrive/Shareddrives... if not located in MyDrive
ref_df = pd.read_csv("gdrive/MyDrive/W200_Project_2/w200_reference_table.csv") # reference table that contains columns and column descriptions
df_dict = pd.read_excel("./CollegeScorecardDataDictionary.xlsx", sheet_name='Institution_Data_Dictionary')

df = df.drop(columns = 'Unnamed: 0')
ref_df = ref_df.drop(columns = 'Unnamed: 0')

  exec(code_obj, self.user_global_ns, self.user_ns)


## Viewing Options

In [262]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)
pd.options.display.float_format = '{:,.2f}'.format

In [263]:
df.dtypes # exporation of the types in the main dataframe

AANAPII                            float64
ADM_RATE                           float64
ADM_RATE_ALL                       float64
ANNHI                              float64
CCBASIC                            float64
CCSIZSET                           float64
CCUGPROF                           float64
CIP01BACHL                         float64
CIP03BACHL                         float64
CIP04BACHL                         float64
CIP05BACHL                         float64
CIP09BACHL                         float64
CIP10BACHL                         float64
CIP11BACHL                         float64
CIP12BACHL                         float64
CIP13BACHL                         float64
CIP14BACHL                         float64
CIP15BACHL                         float64
CIP16BACHL                         float64
CIP19BACHL                         float64
CIP22BACHL                         float64
CIP23BACHL                         float64
CIP24BACHL                         float64
CIP25BACHL 

## ZIP Cleanup

In [264]:
# FIX ZIP CODES, 5-digit plus 4 digit extension
df['ZIP'] = df['ZIP'].astype(str).apply(lambda x: x[:5])
df['ZIP'].value_counts()


90010    255
02115    250
00961    225
11201    224
10001    219
        ... 
40701      1
42743      1
40065      1
42347      1
53590      1
Name: ZIP, Length: 6031, dtype: int64

## Dictionary Cleanup


In [265]:
df_dict.columns

Index(['NAME OF DATA ELEMENT', 'dev-category', 'developer-friendly name',
       'API data type', 'INDEX', 'VARIABLE NAME', 'VALUE', 'LABEL', 'SOURCE',
       'SHOWN/USE ON SITE', 'NOTES'],
      dtype='object')

In [266]:
# Cleanup dictionary column names
print(df_dict.columns) # before columns

# Changes to the column names to make them easier to access
new_columns = list(map(lambda st: str.replace(st, ' ', "_").lower(), df_dict.columns.tolist()))
new_columns = list(map(lambda st: str.replace(st, r'/', "_").lower(), new_columns))
new_columns = list(map(lambda st: str.replace(st, '-', "_").lower(), new_columns))

print(new_columns) # after column changes

new_col_dict = dict(zip(df_dict.columns.tolist(), new_columns))
df_dict = df_dict.rename(columns = new_col_dict)
for col in ['name_of_data_element', 'dev_category', 'developer_friendly_name', 'api_data_type', 'index', 'variable_name']: 
  # # forward fill values in data dictionary to make accessing information easier
  df_dict[col] = df_dict[col].ffill()
df_dict['value'] = df_dict['value'].replace(" ", np.nan)
df_dict['value'].value_counts(dropna = False)

Index(['NAME OF DATA ELEMENT', 'dev-category', 'developer-friendly name',
       'API data type', 'INDEX', 'VARIABLE NAME', 'VALUE', 'LABEL', 'SOURCE',
       'SHOWN/USE ON SITE', 'NOTES'],
      dtype='object')
['name_of_data_element', 'dev_category', 'developer_friendly_name', 'api_data_type', 'index', 'variable_name', 'value', 'label', 'source', 'shown_use_on_site', 'notes']


NaN    2959
1        27
0        20
2        14
3        12
4         8
5         7
6         7
8         7
7         6
12        5
13        5
11        5
9         5
15        4
-2        4
10        4
33        4
22        4
27        3
14        3
41        3
32        3
31        3
30        3
28        3
42        3
16        3
24        3
23        3
21        3
17        3
18        3
19        3
26        2
60        2
51        2
53        2
54        2
55        2
69        2
29        2
66        2
49        2
78        2
43        2
50        2
64        2
48        2
47        2
34        2
35        2
36        2
37        2
38        2
39        2
40        2
20        2
44        2
45        2
25        2
94        1
93        1
95        1
97        1
92        1
100       1
91        1
89        1
88        1
99        1
1         1
101       1
102       1
103       1
105       1
106       1
107       1
2         1
3         1
84        1
4         1
87        1
56  

## Dictionary conversions

In [267]:
# institution descriptions: 
# removing the institution columns to see which rows are completely empty. These columns have a value in practically every row \
# and make it difficult to identify the empty rows.
# These columns describe the institution
institution_cols = ['INSTNM', 'CONTROL', 'ZIP', 'STABBR', 'CITY', 'REGION', 'PREDDEG','HIGHDEG', 'SCH_DEG', 'NUMBRANCH', 'ENDYEAR', 'MAIN', 'ZIP']


In [268]:
def dictionary_getter(data, column): 
  """ Create dictionary for column based on dataset dictionary """
  values = data[data['variable_name'] == column]['value']
  labels = data[data['variable_name'] == column]['label']
  return dict(zip(values,labels))

In [269]:
def column_dict_convt(data, dictionary_data, column): 
  """ Exchange values encoded as numeric with dictionary values to turn column into categorical variable.
  Provide column name as a string, dictionary as an object and data as an object."""
  dictionary = dictionary_getter(dictionary_data, column)
  data[column] = data[column].map(dictionary)
  # Sanity check on values changed
  print(data[column].value_counts(dropna = False))

In [270]:
value_cols = pd.DataFrame(df_dict[~df_dict['value'].isna()]['variable_name'].value_counts()).reset_index().rename(columns = {'index': 'column'})

In [271]:
value_cols = pd.DataFrame(df_dict[~df_dict['value'].isna()])

In [272]:
cat_cols = df_dict[~df_dict['value'].isna()]['variable_name'].unique().tolist()

In [273]:
value_cols = value_cols[value_cols['variable_name'].isin(ref_df['column_name'].tolist())]

In [274]:
cols = ref_df[ref_df['column_name'].isin(cat_cols)]['column_name']
cols

0        AANAPII
3          ANNHI
4        CCBASIC
5       CCSIZSET
6       CCUGPROF
46       CONTROL
131         HBCU
132      HIGHDEG
133          HSI
136       LOCALE
137      LOCALE2
139         MAIN
169      MENONLY
191        NANTI
240          PBI
291      PREDDEG
292       REGION
293     RELAFFIL
301       TRIBAL
319    WOMENONLY
Name: column_name, dtype: object

In [275]:
for col in cols:
  column_dict_convt(df, value_cols, col)

NaN    163840
No       6020
Yes       166
Name: AANAPII, dtype: int64
NaN    163840
No       6152
Yes        34
Name: ANNHI, dtype: int64
NaN                                                                                         163837
Not applicable                                                                                2243
Master's Colleges & Universities: Larger Programs                                              347
Baccalaureate Colleges: Diverse Fields                                                         290
Special Focus Four-Year: Faith-Related Institutions                                            289
Special Focus Four-Year: Other Health Professions Schools                                      242
Baccalaureate Colleges: Arts & Sciences Focus                                                  237
Special Focus Two-Year: Health Professions                                                     198
Master's Colleges & Universities: Medium Programs                     

In [276]:
df.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
AANAPII,6186.0,2.0,No,6020.0,,,,,,,
ADM_RATE,46901.0,,,,0.69,0.22,0.0,0.56,0.72,0.86,1.0
ADM_RATE_ALL,51480.0,,,,0.69,0.21,0.0,0.56,0.72,0.85,1.0
ANNHI,6186.0,2.0,No,6152.0,,,,,,,
CCBASIC,6189.0,34.0,Not applicable,2243.0,,,,,,,
CCSIZSET,6189.0,19.0,Not applicable,2243.0,,,,,,,
CCUGPROF,6189.0,17.0,Not applicable,2243.0,,,,,,,
CIP01BACHL,163572.0,,,,0.03,0.17,0.0,0.0,0.0,0.0,2.0
CIP03BACHL,163572.0,,,,0.09,0.29,0.0,0.0,0.0,0.0,2.0
CIP04BACHL,163572.0,,,,0.03,0.18,0.0,0.0,0.0,0.0,2.0


In [277]:
df.dtypes.sort_values()

ENDYEAR                              int64
NUMBRANCH                            int64
MD_EARN_WNE_MALE0_P10              float64
PCIP03                             float64
PCIP01                             float64
NPT4_PUB                           float64
NPT4_PROG                          float64
NPT4_PRIV                          float64
NPT4_OTHER                         float64
NPT4_75UP_PUB                      float64
NPT4_75UP_PROG                     float64
NPT4_75UP_PRIV                     float64
NPT4_75UP_OTHER                    float64
NPT4_3075_PUB                      float64
NPT4_3075_PROG                     float64
NPT4_3075_PRIV                     float64
LATITUDE                           float64
NPT4_3075_OTHER                    float64
NPT4_048_PUB                       float64
LONGITUDE                          float64
PCIP04                             float64
PCIP05                             float64
PCIP09                             float64
PCIP10     

In [278]:
df['ENDYEAR'].value_counts()

2014    7869
2013    7862
2015    7766
2012    7746
2016    7666
2011    7470
2017    7238
2010    7217
2018    7112
2009    7055
1997    7007
2008    6971
2007    6951
1998    6934
2006    6899
2019    6807
2005    6747
2002    6725
1999    6702
2020    6694
2004    6673
2001    6654
2003    6652
2000    6609
Name: ENDYEAR, dtype: int64

## Dataset cleanup

In [279]:
pull_list = df.select_dtypes(include = 'object').columns.tolist() # pulling out object types that should be float
print(len(pull_list))
exclude = cols.tolist() + ['INSTNM', 'CITY'] # additional objects that should not be converted to float
for c in exclude: 
  pull_list.remove(c) # removing objects that are true categorical variables

print(len(pull_list))

158
136


In [280]:
# columns to change into float format instead of object because there are values for 
# PrivacySuppressed which are not useful for our analysis and block the column from being evaluated as number
changers = df[pull_list].describe() 


In [281]:
changers = changers.T.reset_index()

In [282]:
# Create list of columns that have privacy suppressed
changers_list = changers[changers['top'] == "PrivacySuppressed"]['index'].tolist() 

Assumptions: Privacy suppressed columns are changed to NaN to allow columns with numerical data to be analyzed as such

In [283]:
# changing suppressed values to NaN and changing type to float
for i in changers_list: 
  df.loc[df[i] == 'PrivacySuppressed', i] = np.nan
  df[i] = df[i].astype(float)

In [284]:
# Check the newly formatted columns
described_df = df.describe(include = 'all')\
.T\
.reset_index()\
.rename(columns = {'index': 'Columns'})

In [285]:
df_dtypes = pd.DataFrame(df.dtypes, columns = ['dtype']).reset_index().rename(columns = {'index': 'Columns'})

In [286]:
described_df = pd.merge(left = described_df, right = df_dtypes, how = 'left', left_on = 'Columns', right_on = 'Columns').reindex(columns = ['Columns', 'dtype'] + described_df.columns.tolist()[1:-1])


In [287]:
described_df.sort_values(['dtype', 'Columns'])

Unnamed: 0,Columns,dtype,count,unique,top,freq,mean,std,min,25%,50%,75%
49,ENDYEAR,int64,170026.0,,,,2008.71,6.87,1997.0,2003.0,2009.0,2015.0
239,NUMBRANCH,int64,170026.0,,,,4.04,13.24,1.0,1.0,1.0,2.0
1,ADM_RATE,float64,46901.0,,,,0.69,0.22,0.0,0.56,0.72,0.86
2,ADM_RATE_ALL,float64,51480.0,,,,0.69,0.21,0.0,0.56,0.72,0.85
7,CIP01BACHL,float64,163572.0,,,,0.03,0.17,0.0,0.0,0.0,0.0
8,CIP03BACHL,float64,163572.0,,,,0.09,0.29,0.0,0.0,0.0,0.0
9,CIP04BACHL,float64,163572.0,,,,0.03,0.18,0.0,0.0,0.0,0.0
10,CIP05BACHL,float64,163572.0,,,,0.08,0.28,0.0,0.0,0.0,0.0
11,CIP09BACHL,float64,163572.0,,,,0.17,0.39,0.0,0.0,0.0,0.0
12,CIP10BACHL,float64,163572.0,,,,0.03,0.16,0.0,0.0,0.0,0.0


In [288]:
# Columns with less than 1% of data filled
described_df[described_df['count'] < (0.01 * len(df))]

Unnamed: 0,Columns,dtype,count,unique,top,freq,mean,std,min,25%,50%,75%
137,LOCALE2,object,0.0,0.0,,,,,,,,
203,NPT41_OTHER,float64,473.0,,,,22388.79,6537.7,3018.0,17748.0,22664.0,26314.0
207,NPT42_OTHER,float64,434.0,,,,23163.26,6493.42,6837.0,18782.25,23150.0,27267.0
211,NPT43_OTHER,float64,401.0,,,,24527.35,6418.85,8261.0,20365.0,24308.0,28238.0
215,NPT44_OTHER,float64,331.0,,,,25223.29,6334.65,9314.0,20854.0,25030.0,28799.5
219,NPT45_OTHER,float64,269.0,,,,25721.33,6614.88,9318.0,20986.0,25450.0,29119.0
223,NPT4_048_OTHER,float64,475.0,,,,22505.71,6542.64,3018.0,17862.5,22723.0,26403.0
227,NPT4_3075_OTHER,float64,443.0,,,,23539.7,6504.68,6837.0,19191.0,23476.0,27477.5
231,NPT4_75UP_OTHER,float64,340.0,,,,25353.94,6345.59,9315.0,20911.75,25269.5,28906.5
235,NPT4_OTHER,float64,476.0,,,,22691.46,6591.86,3018.0,18150.75,22863.5,26597.0


# Exploratory

In [289]:
df.shape

(170026, 321)

In [290]:
df.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
AANAPII,6186.0,2.0,No,6020.0,,,,,,,
ADM_RATE,46901.0,,,,0.69,0.22,0.0,0.56,0.72,0.86,1.0
ADM_RATE_ALL,51480.0,,,,0.69,0.21,0.0,0.56,0.72,0.85,1.0
ANNHI,6186.0,2.0,No,6152.0,,,,,,,
CCBASIC,6189.0,34.0,Not applicable,2243.0,,,,,,,
CCSIZSET,6189.0,19.0,Not applicable,2243.0,,,,,,,
CCUGPROF,6189.0,17.0,Not applicable,2243.0,,,,,,,
CIP01BACHL,163572.0,,,,0.03,0.17,0.0,0.0,0.0,0.0,2.0
CIP03BACHL,163572.0,,,,0.09,0.29,0.0,0.0,0.0,0.0,2.0
CIP04BACHL,163572.0,,,,0.03,0.18,0.0,0.0,0.0,0.0,2.0


In [291]:
df.columns.tolist()

['AANAPII',
 'ADM_RATE',
 'ADM_RATE_ALL',
 'ANNHI',
 'CCBASIC',
 'CCSIZSET',
 'CCUGPROF',
 'CIP01BACHL',
 'CIP03BACHL',
 'CIP04BACHL',
 'CIP05BACHL',
 'CIP09BACHL',
 'CIP10BACHL',
 'CIP11BACHL',
 'CIP12BACHL',
 'CIP13BACHL',
 'CIP14BACHL',
 'CIP15BACHL',
 'CIP16BACHL',
 'CIP19BACHL',
 'CIP22BACHL',
 'CIP23BACHL',
 'CIP24BACHL',
 'CIP25BACHL',
 'CIP26BACHL',
 'CIP27BACHL',
 'CIP29BACHL',
 'CIP30BACHL',
 'CIP31BACHL',
 'CIP38BACHL',
 'CIP39BACHL',
 'CIP40BACHL',
 'CIP41BACHL',
 'CIP42BACHL',
 'CIP43BACHL',
 'CIP44BACHL',
 'CIP45BACHL',
 'CIP46BACHL',
 'CIP47BACHL',
 'CIP48BACHL',
 'CIP49BACHL',
 'CIP50BACHL',
 'CIP51BACHL',
 'CIP52BACHL',
 'CIP54BACHL',
 'CITY',
 'CONTROL',
 'COSTT4_A',
 'COSTT4_P',
 'ENDYEAR',
 'FIRSTGEN_COMP_2YR_TRANS_YR2_RT',
 'FIRSTGEN_COMP_2YR_TRANS_YR3_RT',
 'FIRSTGEN_COMP_2YR_TRANS_YR4_RT',
 'FIRSTGEN_COMP_2YR_TRANS_YR6_RT',
 'FIRSTGEN_COMP_2YR_TRANS_YR8_RT',
 'FIRSTGEN_COMP_4YR_TRANS_YR2_RT',
 'FIRSTGEN_COMP_4YR_TRANS_YR3_RT',
 'FIRSTGEN_COMP_4YR_TRANS_YR4_RT',
 

In [292]:
# Reference dataframe that has the column shorthand with the description
ref_df

Unnamed: 0,column_name,column_description
0,AANAPII,Flag for Asian American Native American Pacifi...
1,ADM_RATE,Admission rate
2,ADM_RATE_ALL,Admission rate for all campuses rolled up to t...
3,ANNHI,Flag for Alaska Native Native Hawaiian serving...
4,CCBASIC,Carnegie Classification -- basic
5,CCSIZSET,Carnegie Classification -- size and setting
6,CCUGPROF,Carnegie Classification -- undergraduate profile
7,CIP01BACHL,"Bachelor's degree in Agriculture, Agriculture ..."
8,CIP03BACHL,Bachelor's degree in Natural Resources And Con...
9,CIP04BACHL,Bachelor's degree in Architecture And Related ...


# Export dataset

In [None]:
# Export data:
df.to_csv('w200_cleaned_df.csv')
!cp w200_cleaned_df.csv /content/gdrive/MyDrive/W200_Project_2/ # add reference table to gdrive

# Subsets of the data set

In [293]:
# Percentages of degrees
pc_degrees = df.filter(regex = 'PCIP*').columns.tolist()

In [294]:
# Institution Columns
institution_cols = ['RELAFFIL', 'ADM_RATE', 'ADM_RATE_ALL', 'WOMENONLY', 'MENONLY', \
                    'NANTI', 'HSI', 'AANAPII', 'TRIBAL', 'ANNHI', 'PBI', 'HBCU', 'CCSIZSET', \
                    'CCUGPROF', 'CCBASIC', 'LONGITUDE', 'LATITUDE', 'LOCALE2', 'LOCALE', 'REGION', \
                    'CONTROL', 'HIGHDEG', 'PREDDEG', 'NUMBRANCH', 'SCH_DEG', 'ZIP', 'STABBR', 'CITY']

In [295]:
# Earnings columns
earnings = df.filter(regex = '.EARN.').columns.tolist()
earnings

['MDEARN_ALL',
 'MDEARN_PD',
 'MD_EARN_WNE_INC1_P10',
 'MD_EARN_WNE_INC1_P6',
 'MD_EARN_WNE_INC1_P8',
 'MD_EARN_WNE_INC2_P10',
 'MD_EARN_WNE_INC2_P6',
 'MD_EARN_WNE_INC2_P8',
 'MD_EARN_WNE_INC3_P10',
 'MD_EARN_WNE_INC3_P6',
 'MD_EARN_WNE_INC3_P8',
 'MD_EARN_WNE_INDEP0_P10',
 'MD_EARN_WNE_INDEP0_P6',
 'MD_EARN_WNE_INDEP0_P8',
 'MD_EARN_WNE_INDEP1_P10',
 'MD_EARN_WNE_INDEP1_P6',
 'MD_EARN_WNE_INDEP1_P8',
 'MD_EARN_WNE_MALE0_P10',
 'MD_EARN_WNE_MALE0_P6',
 'MD_EARN_WNE_MALE0_P8',
 'MD_EARN_WNE_MALE1_P10',
 'MD_EARN_WNE_MALE1_P6',
 'MD_EARN_WNE_MALE1_P8',
 'MD_EARN_WNE_P10',
 'MD_EARN_WNE_P6',
 'MD_EARN_WNE_P8',
 'MN_EARN_WNE_INC1_P10',
 'MN_EARN_WNE_INC1_P6',
 'MN_EARN_WNE_INC2_P10',
 'MN_EARN_WNE_INC2_P6',
 'MN_EARN_WNE_INC3_P10',
 'MN_EARN_WNE_INC3_P6',
 'MN_EARN_WNE_INDEP0_INC1_P10',
 'MN_EARN_WNE_INDEP0_INC1_P6',
 'MN_EARN_WNE_INDEP0_P10',
 'MN_EARN_WNE_INDEP0_P6',
 'MN_EARN_WNE_INDEP1_P10',
 'MN_EARN_WNE_INDEP1_P6',
 'MN_EARN_WNE_MALE0_P10',
 'MN_EARN_WNE_MALE0_P6',
 'MN_EARN_WNE_MAL

In [296]:
# Net price to attend
net_price = df.filter(regex="NPT.").columns.tolist()
net_price

['NPT41_OTHER',
 'NPT41_PRIV',
 'NPT41_PROG',
 'NPT41_PUB',
 'NPT42_OTHER',
 'NPT42_PRIV',
 'NPT42_PROG',
 'NPT42_PUB',
 'NPT43_OTHER',
 'NPT43_PRIV',
 'NPT43_PROG',
 'NPT43_PUB',
 'NPT44_OTHER',
 'NPT44_PRIV',
 'NPT44_PROG',
 'NPT44_PUB',
 'NPT45_OTHER',
 'NPT45_PRIV',
 'NPT45_PROG',
 'NPT45_PUB',
 'NPT4_048_OTHER',
 'NPT4_048_PRIV',
 'NPT4_048_PROG',
 'NPT4_048_PUB',
 'NPT4_3075_OTHER',
 'NPT4_3075_PRIV',
 'NPT4_3075_PROG',
 'NPT4_3075_PUB',
 'NPT4_75UP_OTHER',
 'NPT4_75UP_PRIV',
 'NPT4_75UP_PROG',
 'NPT4_75UP_PUB',
 'NPT4_OTHER',
 'NPT4_PRIV',
 'NPT4_PROG',
 'NPT4_PUB']

In [297]:
# First generation , not first gen columns
first_gen = df.filter(regex = 'FIRST')

There are barber colleges and other program colleges that are worth separating. 

In [298]:
# Institutions that are certificates
df[df['PREDDEG'] == 'Predominantly certificate-degree granting']['INSTNM'].value_counts()

Marinello School of Beauty                           129
Brittany Beauty Academy                               75
Cannella School of Hair Design-Chicago                68
Arthur's Beauty College                               68
Academy of Cosmetology                                62
                                                    ... 
Tillamook Bay Community College                        1
PADJASTYLE DBA ESCARMENT SCH PRAC & TECH FASH DES      1
Florida College of Integrative Medicine                1
ITT Technical Institute-Albany                         1
High-Tech Institute-Atlanta                            1
Name: INSTNM, Length: 6572, dtype: int64

In [299]:
# from google.colab import drive
# drive.mount('/content/drive')

# Analysis

In [300]:
# Use Seaborn to make a bar chart showing the top 10 schools with the most highest PCIP01.
import matplotlib.pyplot as plt
import seaborn as sns

df_main = df[df['MAIN'] == 1]
bar = df_main[['INSTNM','PCIP01']].sort_values(['PCIP01'], ascending = False)
bar.head(10)
plt.figure(figsize=(12,8))
ax = sns.barplot(x='INSTNM', y='PCIP01', data=bar.head(10))
plt.xticks(rotation='vertical')
plt.title('Top 10 Schools by Percentage of degrees awarded in Agriculture')
plt.show()

ValueError: ignored

<Figure size 864x576 with 0 Axes>

In [None]:
plt.figure(figsize=(12,10))
ax = sns.heatmap(bar)
plt.title('Heatmap of Top 10 Schools by Percentage of degrees awarded in Agriculture')
plt.show()

In [None]:
cntrl_types = pd.DataFrame(df['CONTROL'].value_counts()).reset_index().rename(columns= {'index': 'Institution Type', 'CONTROL':'COUNT'})
fig, ax = plt.subplots(1,1)
ax = sns.barplot(x = 'Institution Type', y = 'COUNT', data = cntrl_types)

In [None]:
fig, ax = plt.subplots(1,1)


In [None]:
# Distributions
def hist_graph(data, column):
  if df[column].min() is np.nan:
    return column + ' has no values'
  else: 
    fig, ax = plt.subplots(1,1)
    plt.hist(data[column])
    plt.title('Distribution of: '+ column)
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()

In [None]:
df.ADM_RATE_ALL.isnull().groupby([df['ENDYEAR']]).sum().astype(int).reset_index(name='count')

In [None]:
df_corr = df.corr()

In [None]:
import seaborn as sns

In [None]:
sns.heatmap(df_corr)

In [None]:
from sklearn.preprocessing import StandardScaler

features = df