# DS-SF-27 | Unit Project 3: Basic Machine Learning Modeling

In this project, you will perform a logistic regression on the admissions data we've been working with in Unit Projects 1 and 2.

In [48]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
pd.set_option('display.notebook_repr_html', True)

import statsmodels.formula.api as smf
import seaborn as sns
import math as math

from sklearn import linear_model

%matplotlib inline


In [49]:
df = pd.read_csv(os.path.join('..', '..', 'dataset', 'ucla-admissions.csv'))
df.dropna(inplace = True)

df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.00,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0
...,...,...,...,...
395,0,620.0,4.00,2.0
396,0,560.0,3.04,3.0
397,0,460.0,2.63,2.0
398,0,700.0,3.65,2.0


## Part A.  Frequency Table

> ### Question 1.  Create a frequency table for `prestige` and whether or not an applicant was admitted.

In [50]:
pd.crosstab(df.prestige,
    df.admit,
    rownames = ['Highschool Prestige'],
    colnames = ['Admit'])

Admit,0,1
Highschool Prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,28,33
2.0,95,53
3.0,93,28
4.0,55,12


## Part B.  Variable Transformations

> ### Question 2.  Create a one-hot encoding for `prestige`.

In [51]:
# TODO
c = df.prestige

In [52]:
c

0      3.0
1      3.0
2      1.0
3      4.0
4      4.0
      ... 
395    2.0
396    3.0
397    2.0
398    2.0
399    3.0
Name: prestige, dtype: float64

In [53]:
prestige_df = pd.get_dummies(c, prefix = 'Prestige')

In [54]:
prestige_df

Unnamed: 0,Prestige_1.0,Prestige_2.0,Prestige_3.0,Prestige_4.0
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
395,0.0,1.0,0.0,0.0
396,0.0,0.0,1.0,0.0
397,0.0,1.0,0.0,0.0
398,0.0,1.0,0.0,0.0


> ### Question 3.  How many of these binary variables do we need for modeling?

Answer: At most three. If we know the value of three of the binary variables, then we will know the value of the fourth.

> ### Question 4.  Why are we doing this?

Answer: We do this because we want to treat each prestige score as a discrete feature in our model.

> ### Question 5.  Add all these binary variables in the dataset and remove the now redundant `prestige` feature.

In [55]:
# TODO
prestige_df

Unnamed: 0,Prestige_1.0,Prestige_2.0,Prestige_3.0,Prestige_4.0
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
395,0.0,1.0,0.0,0.0
396,0.0,0.0,1.0,0.0
397,0.0,1.0,0.0,0.0
398,0.0,1.0,0.0,0.0


In [56]:
prestige_df.rename(columns = {'Prestige_1.0': 'Prestige_1',
                              'Prestige_2.0': 'Prestige_2',
                              'Prestige_3.0': 'Prestige_3',
                              'Prestige_4.0': 'Prestige_4'}, inplace = True)

In [57]:
prestige_df

Unnamed: 0,Prestige_1,Prestige_2,Prestige_3,Prestige_4
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
395,0.0,1.0,0.0,0.0
396,0.0,0.0,1.0,0.0
397,0.0,1.0,0.0,0.0
398,0.0,1.0,0.0,0.0


In [58]:
df = df.join([prestige_df])

In [59]:
df.columns

Index([u'admit', u'gre', u'gpa', u'prestige', u'Prestige_1', u'Prestige_2',
       u'Prestige_3', u'Prestige_4'],
      dtype='object')

In [60]:
df.drop('prestige', 1)

Unnamed: 0,admit,gre,gpa,Prestige_1,Prestige_2,Prestige_3,Prestige_4
0,0,380.0,3.61,0.0,0.0,1.0,0.0
1,1,660.0,3.67,0.0,0.0,1.0,0.0
2,1,800.0,4.00,1.0,0.0,0.0,0.0
3,1,640.0,3.19,0.0,0.0,0.0,1.0
4,0,520.0,2.93,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
395,0,620.0,4.00,0.0,1.0,0.0,0.0
396,0,560.0,3.04,0.0,0.0,1.0,0.0
397,0,460.0,2.63,0.0,1.0,0.0,0.0
398,0,700.0,3.65,0.0,1.0,0.0,0.0


## Part C.  Hand calculating odds ratios

Let's develop our intuition about expected outcomes by hand calculating odds ratios.

> ### Question 6.  Create a frequency table for `prestige = 1` and whether or not an applicant was admitted.

In [61]:
# TODO
pd.crosstab(df.Prestige_1,
    df.admit,
    rownames = ['Prestige_1'],
    colnames = ['Admit'])

Admit,0,1
Prestige_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,243,93
1.0,28,33


> ### Question 7.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the most prestigious undergraduate schools.

In [62]:
# TODO
33/61.0

#odds of 33:28

0.5409836065573771

In [63]:
prob_A = 33 / 61.
odds_A = prob_A / (1 - prob_A)

print prob_A
print odds_A

0.540983606557
1.17857142857


> ### Question 8.  Now calculate the odds of admission for undergraduates who did not attend a #1 ranked college.

In [64]:
# TODO
prob_B = 93 / (243+93.)
odds_B = prob_B / (1 - prob_B)

print prob_B
print odds_B
#odds of 93:243 or 31:81

0.276785714286
0.382716049383


> ### Question 9.  Finally, what's the odds ratio?

In [65]:
# TODO              prestige 1       not prestige 1
# admitted           33                  93

# not admitted       28                  243

#(33.0*243)/(28*93)
odds_A/odds_B

3.079493087557604

> ### Question 10.  Write this finding in a sentenance.

Answer: If you are from a prestige 1 high school, you are approx. 3 times more likely to be admitted than if you do not come from a prestige 1 high school

> ### Question 11.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the least prestigious undergraduate schools.  Then calculate their odds ratio of being admitted to UCLA.  Finally, write this finding in a sentenance.

In [66]:
# TODO
pd.crosstab(df.Prestige_4,
    df.admit,
    rownames = ['Prestige_4'],
    colnames = ['Admit'])

Admit,0,1
Prestige_4,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,216,114
1.0,55,12


In [67]:
(12.0*216)/(55*114)

0.4133971291866029

Answer: odds of 12:114 or 6:57.

If you are from a prestige 4 high school you are approx. 0.4 times as likely to be admitted to UCLA than if you are not from a prestige 4 school.

## Part C. Analysis using `statsmodels`

> ### Question 12.  Fit a logistic regression model prediting admission into UCLA using `gre`, `gpa`, and the prestige of the undergraduate schools.  Use the highest prestige undergraduate schools as your reference point.

In [68]:
# TODO
admission = smf.ols(formula = 'admit ~ gre + gpa + Prestige_2 + Prestige_3 + Prestige_4', data = df).fit()

> ### Question 13.  Print the model's summary results.

In [69]:
# TODO
admission.summary()

0,1,2,3
Dep. Variable:,admit,R-squared:,0.099
Model:,OLS,Adj. R-squared:,0.087
Method:,Least Squares,F-statistic:,8.594
Date:,"Mon, 24 Oct 2016",Prob (F-statistic):,9.71e-08
Time:,21:21:46,Log-Likelihood:,-239.02
No. Observations:,397,AIC:,490.0
Df Residuals:,391,BIC:,513.9
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.2377,0.217,-1.095,0.274,-0.665 0.189
gre,0.0004,0.000,1.997,0.047,6.48e-06 0.001
gpa,0.1508,0.064,2.349,0.019,0.025 0.277
Prestige_2,-0.1635,0.068,-2.407,0.017,-0.297 -0.030
Prestige_3,-0.2910,0.070,-4.139,0.000,-0.429 -0.153
Prestige_4,-0.3240,0.079,-4.082,0.000,-0.480 -0.168

0,1,2,3
Omnibus:,152.312,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,50.314
Skew:,0.678,Prob(JB):,1.19e-11
Kurtosis:,1.904,Cond. No.,6070.0


> ### Question 14.  What are the odds ratios of the different features and their 95% confidence intervals?

In [70]:
# TODO

#gre; coeff of 0.0004
print "GRE odds ratio: ", math.exp(0.0004), "; 95% conf interval:", math.exp(6.48e-06), "-", math.exp(0.001)

#gpa; coeff of 0.1508
print "GPA odds ratio: ", math.exp(0.1508), "; 95% conf interval:", math.exp(0.025), "-", math.exp(0.277)

#Prestige_2; coeff of -0.1635 
print "Prestige 2 odds ratio: ", math.exp(-0.1635), "; 95% conf interval:", math.exp(-0.297), "-", math.exp(-0.030)

#Prestige_3; coeff of -0.2910
print "Prestige 3 odds ratio: ", math.exp(-0.2910), "; 95% conf interval:", math.exp(-0.429), "-", math.exp(0.153)

#Prestige_4; coeff of -0.1828
print "Prestige 4 odds ratio: ", math.exp(-0.3240), "; 95% conf interval:", math.exp(-0.480), "-", math.exp(-0.168)


GRE odds ratio:  1.00040008001 ; 95% conf interval: 1.00000648002 - 1.00100050017
GPA odds ratio:  1.16276408201 ; 95% conf interval: 1.02531512052 - 1.31916637103
Prestige 2 odds ratio:  0.849166499002 ; 95% conf interval: 0.743044012362 - 0.970445533549
Prestige 3 odds ratio:  0.747515678018 ; 95% conf interval: 0.651159929181 - 1.16532497894
Prestige 4 odds ratio:  0.72325024238 ; 95% conf interval: 0.618783391806 - 0.845353834685


> ### Question 15.  Interpret the odds ratio for `prestige = 2`.

Answer: A student is approx. 0.15 less likely to get into UCLA if they attended a prestige 2 school vs. a prestige 1 school.

> ### Question 16.  Interpret the odds ratio of `gpa`.

Answer: As GPA increases by one, a student is 1.16 times more likely to be admitted.

> ### Question 17.  Assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [71]:
# TODO

Answer:

## Part D. Moving the model from `statsmodels` to `sklearn`

> ### Question 18.  Let's assume we are satisfied with our model.  Remodel it (same features) using `sklearn`.  When creating the logistic regression model with `LogisticRegression(C = 10 ** 2)`.

In [77]:
# TODO
X = df[["gre","gpa", "Prestige_1", "Prestige_2", "Prestige_3", "Prestige_4"]]
y = df.admit

model = linear_model.LogisticRegression().\
    fit(X, y)

print model.intercept_
print model.coef_
print model.score(X,y)

 [-1.95266678]
[[ 0.00172658  0.19596439  0.35943456 -0.31929464 -0.88821548 -1.10459122]]
0.712846347607


> ### Question 19.  What are the odds ratios for the different variables and how do they compare with the odds ratios calculated with `statsmodels`?

In [78]:
# TODO
print "GRE odds ratio: ", math.exp(0.00178497)
print "GPA odds ratio: ", math.exp(0.19596439)
print "Prestige 1 odds ratio: ", math.exp(0.35943456)
print "Prestige 2 odds ratio: ", math.exp(-0.31929464)
print "Prestige 3 odds ratio: ", math.exp(-0.88821548)
print "Prestige 4 odds ratio: ", math.exp(-1.10459122)

GRE odds ratio:  1.00178656401
GPA odds ratio:  1.21648358558
Prestige 1 odds ratio:  1.43251918187
Prestige 2 odds ratio:  0.726661414242
Prestige 3 odds ratio:  0.411389230414
Prestige 4 odds ratio:  0.331346302303


Answer: They are lower in the sklearn model.

> ### Question 20.  Again assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [79]:
# TODO
predict_Tier_1 = [[800, 4.0, 1,0,0,0]]
predict_Tier_2 = [[800, 4.0, 0,1,0,0]]
predict_Tier_3 = [[800, 4.0, 0,0,1,0]]
predict_Tier_4 = [[800, 4.0, 0,0,0,1]]

print "Tier 1 Prediction:", model.predict(predict_Tier_1)
print "Tier 1 Prob:", model.predict_proba(predict_Tier_1)

print "Tier 2 Prediction:", model.predict(predict_Tier_2)
print "Tier 2 Prob:", model.predict_proba(predict_Tier_2)
5
print "Tier 3 Prediction:", model.predict(predict_Tier_3)
print "Tier 3 Prob:", model.predict_proba(predict_Tier_3)

print "Tier 4 Prediction:", model.predict(predict_Tier_4)
print "Tier 4 Prob:", model.predict_proba(predict_Tier_4)

Tier 1 Prediction: [1]
Tier 1 Prob: [[ 0.36080011  0.63919989]]
Tier 2 Prediction: [0]
Tier 2 Prob: [[ 0.52668364  0.47331636]]
Tier 3 Prediction: [0]
Tier 3 Prob: [[ 0.66279102  0.33720898]]
Tier 4 Prediction: [0]
Tier 4 Prob: [[ 0.70933008  0.29066992]]


Answer: The probabilities for Tier 1, 2, 3, and 4 are 64%, 47%, 34% and 29% respectively.