In [None]:
!pip install --upgrade pip # just in case you need to install or upgrade pip
!pip install statsmodels --upgrade # installing statsmodels if you don't have it already
import pandas as pd # we've seen pandas before
import statsmodels.api as sm # this is the modeling library that we will use

In [None]:
# The csv file named below must be in the same directory as this ipynb.
DATA_FILE = "questselections18.csv"
df = pd.read_csv(DATA_FILE)

In [None]:
# Start with a simple look at the first few rows of the dataframe.
# There are six columns, defined as follows:
#
# id (Ordinal) - unique identifier for each application QUEST received in 2018.
#   Applications received earlier have lower ids.  Some ids are missing because
#   a student may have started an application but didn't submit it.
#
# credits (Ratio) - integer count of the college credits the applicant has when
#   they submit their application.  This may be greater than zero even when a
#   student applies in the fall semester, because of AP or similar credits they
#   bring to the University of Maryland.
#
# gpa (Ratio) - the applicant's current college GPA.
#
# school (Nominal) - one of "CMNS", "Clark" or "Smith".
#
# score (Interval) - average score for the quality of the application, based on
#   alumni and current student reviewers.
#
# interview (Ordinal or Categorical?) - binary variable that is '1' if the
#   applicant is invited to come and interview and '0' if the applicant is not
#   invited.

df.head(n=5)

In [None]:
# descriptive statistics of the quantitative variables
# (count, mean, std, min, quartiles and max per column; with its default
# arguments describe() reports only the numeric columns of the dataframe)
df.describe()

In [None]:
# You might recall that some GPAs are reported as zero (3 of them); this shows
# up as the minimum gpa in the descriptive statistics above.
# Rows where that is the case are dropped before any modeling.
gpa_is_nonzero = df["gpa"] != 0
df = df.loc[gpa_is_nonzero]
df.describe()  # the min gpa should no longer be zero

In [None]:
# let's do an OLS regression of the applicant's score on credits and gpa
#
# statsmodels' OLS does not add an intercept on its own, so a constant column
# is added to the design matrix explicitly.  Without it the fit is forced
# through the origin, which distorts the slope estimates unless the true
# intercept is zero and makes the summary report the uncentered R-squared.
YVar = df[["score"]]  # dependent variable: the score of the applicant
XVar = df[["credits", "gpa"]]  # two independent variables: credits and gpa
# The constant is added only in the call, so XVar itself still holds just the
# two predictors for any later use.
LinearModel = sm.OLS(YVar, sm.add_constant(XVar)).fit()  # fits score ~ const + credits + gpa
print(LinearModel.summary())  # this prints out the results of the linear model

In [None]:
# let's take a look at the correlation matrix of independent variables
# XVar here is the credits/gpa frame built in the OLS cell above; a later cell
# reassigns XVar, so run the notebook top to bottom to get the matrix for
# these two columns
XVar.corr()

In [None]:
# let's now do a logistic regression where we try to predict whether or not the
# applicant gets an interview
#
# As with OLS, statsmodels' Logit does not include an intercept by default, so
# a constant column is added explicitly.  Without it the model is forced to
# predict a probability of 0.5 when all predictors are zero.
YVar = df[["interview"]]  # this is our dependent variable
XVar = df[["score", "credits", "gpa"]]  # these are our independent variables
# The constant is added only in the call, so XVar still holds just the predictors.
LogisticModel = sm.Logit(YVar, sm.add_constant(XVar)).fit()  # fits interview ~ const + score + credits + gpa
print(LogisticModel.summary())  # this prints out the results of the model

In [None]:
# With your team, can you add the school variable as a binary variable and include
# it in the OLS and logistic regressions?  Is it statistically significant?