In [None]:
# these are our usual libraries
# be sure that these load properly
# if they don't, you may have to install the libraries
# recall that you can install them using commands similar to:
# !pip install pandas
# !pip install statsmodels
import pandas as pd
import statsmodels.api as sm

In [None]:
# Load the admissions data set. The path is relative, so the file
# collegeadmissions.csv has to sit in the same directory as this notebook.
df = pd.read_csv(filepath_or_buffer="collegeadmissions.csv")

In [None]:
# Normalise the column headings: trim surrounding whitespace, lower-case them,
# turn spaces and hyphens into underscores, and drop parentheses.
# regex=False makes every replacement a literal substitution; "(" and ")" are
# regex metacharacters, so relying on the version-dependent default is fragile.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('-', '_', regex=False)
)

In [None]:
# there is a lot of data in this csv file
# instead of keeping all of it, let's start by just selecting the information
# that we want to look at by taking the college name so we know what the data
# represents and 11 of the numerical columns and then reducing
# the dataframe to only rows where all 11 observations are available
#
# the first column is:
# Name - the name of the college or university
#
# the other 11 columns are:
# Applicants total - how many applications
# Admissions total - how many admissions
# Enrolled total - how many of the admitted students chose the university or college
# ACT Composite 75th percentile score - ACT Score
# Estimated undergraduate enrollment, total
# Total price for in-state students living on campus 2013-14
# Total price for out-of-state students living on campus 2013-14
# Percent of total enrollment that are White
# Percent of undergraduate enrollment that are women
# Graduation rate - Bachelor degree within 5 years, total
# Percent of freshmen receiving any financial aid

# Keep the college name plus the eleven numeric measures listed above, then
# discard every row that has a missing value in any of the kept columns.
df = df.loc[:, [
    'name',
    'applicants_total',
    'admissions_total',
    'enrolled_total',
    'act_composite_75th_percentile_score',
    'estimated_undergraduate_enrollment,_total',
    'total_price_for_in_state_students_living_on_campus_2013_14',
    'total_price_for_out_of_state_students_living_on_campus_2013_14',
    'percent_of_total_enrollment_that_are_white',
    'percent_of_undergraduate_enrollment_that_are_women',
    'graduation_rate___bachelor_degree_within_5_years,_total',
    'percent_of_freshmen_receiving_any_financial_aid',
]].dropna()

# summary statistics for the rows that remain
df.describe()

In [None]:
# let's start with a correlation matrix
# df still carries the string column 'name'; numeric_only=True restricts the
# calculation to the numeric columns (pandas >= 2.0 raises an error otherwise)
df.corr(numeric_only=True)

In [None]:
# Kernel density estimate of the undergraduate enrollment column, to see how
# school sizes are distributed.
df["estimated_undergraduate_enrollment,_total"].plot(kind="kde")

In [None]:
# Some schools look to be really large or really small.
# Ordering the rows by enrollment (smallest first) puts those extremes at the
# top and bottom of the displayed table.
df.sort_values(by="estimated_undergraduate_enrollment,_total", ascending=True)

In [None]:
# Scatter plot: ACT composite 75th percentile score (x axis) against the
# 5-year bachelor graduation rate (y axis).
df.plot(
    kind="scatter",
    x='act_composite_75th_percentile_score',
    y='graduation_rate___bachelor_degree_within_5_years,_total',
)

In [None]:
# how about an OLS?
# Regress the 5-year graduation rate on the other ten numeric columns.
YVar = df[["graduation_rate___bachelor_degree_within_5_years,_total"]]
XVar = df[['applicants_total', 'admissions_total',
         'enrolled_total',
         'act_composite_75th_percentile_score',
         'estimated_undergraduate_enrollment,_total',
         'total_price_for_in_state_students_living_on_campus_2013_14',
         'total_price_for_out_of_state_students_living_on_campus_2013_14',
         'percent_of_total_enrollment_that_are_white',
         'percent_of_undergraduate_enrollment_that_are_women',
         'percent_of_freshmen_receiving_any_financial_aid']]
# sm.OLS does not include an intercept on its own, so a constant column is
# added to the regressors; without it the fit is forced through the origin,
# which distorts the coefficients, the reported R-squared and the residuals.
LinearModel = sm.OLS(YVar, sm.add_constant(XVar)).fit()
print(LinearModel.summary())

In [None]:
# Kernel density estimate of the fitted model's residuals, to see how they
# are distributed.
LinearModel.resid.plot(kind="kde")

In [None]:
# Put each college or university name next to its residual (the two pieces
# are aligned on the row index), then list the rows ordered by residual.
# The residual column is addressed by the label 0, as in the sort below.
residuals = pd.concat(objs=[df['name'], LinearModel.resid], axis="columns")
residuals.sort_values(by=0)