In [30]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np

Import the admissions data.  Drop missing values and, as we did in the lecture, turn `prestige` into dummy variables, then drop the prestige==4 dummy so that it is the reference level.

In [5]:
# Read the admissions data from the relative datasets directory into a DataFrame.
admissions_csv = '../datasets/admissions.csv'
df = pd.read_csv(admissions_csv)

In [6]:
# Discard every row that contains a missing value.  `df` is rebound to the
# frame returned by dropna() instead of being mutated with inplace=True; the
# resulting `df` is the same.
df = df.dropna()

In [7]:
# Columns carried over unchanged: the response (admit) and the numeric predictors.
cols_to_keep = ["admit", "gre", "gpa"]

# One indicator column per prestige level, named prestige_<level>.
dummy_ranks = pd.get_dummies(df["prestige"], prefix="prestige")

# Leaving out the prestige_4.0 indicator makes prestige 4 the reference level:
# a row with all remaining prestige_* columns equal to 0 is a prestige-4 row.
reference_col = 'prestige_4.0'
df = df[cols_to_keep].join(dummy_ranks.drop(reference_col, axis=1))
df.head()

Unnamed: 0,admit,gre,gpa,prestige_1.0,prestige_2.0,prestige_3.0
0,0,380.0,3.61,0,0,1
1,1,660.0,3.67,0,0,1
2,1,800.0,4.0,1,0,0
3,1,640.0,3.19,0,0,0
4,0,520.0,2.93,0,0,0


Using [sklearn](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html), fit the logistic regression to the data, and print the coefficients and the intercept.



In [8]:
# Predictors are every column except the response; the response is `admit`.
X = df.drop('admit', axis=1)
y = df['admit']

# Estimator with sklearn's default settings.  Left as the last expression so
# the notebook displays the fitted estimator's parameters.
logreg = LogisticRegression()
logreg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# Fitted intercept term.  Single-argument print(...) produces the same output
# on Python 2 and also runs on Python 3 (the bare print statement does not).
print(logreg.intercept_)

[-2.07018745]


In [10]:
# Fitted coefficients, in the column order of X
# (gre, gpa, prestige_1.0, prestige_2.0, prestige_3.0).
# Single-argument print(...) produces the same output on Python 2 and also
# runs on Python 3 (the bare print statement does not).
print(logreg.coef_)

[[  1.58889206e-03   1.84630696e-04   1.16761197e+00   5.26947989e-01
   -3.80822681e-02]]


Did the coefficients match what we saw with statsmodels?  If not, why not?  Look closely at the documentation... what is the default behaviour?

In [11]:
# By default, sklearn adds some regularization.  C is the inverse of the
# regularization strength, so making C very large makes the penalty
# negligible and effectively removes it.
y = df['admit']
X = df.drop('admit', axis=1)

logreg = LogisticRegression(C=1e10)  # 1e10 == 10000000000
logreg.fit(X, y)

# Single-argument print(...) produces the same output on Python 2 and also
# runs on Python 3 (the bare print statement does not).
print(logreg.intercept_)
print(logreg.coef_)

[-5.3857036]
[[ 0.00217157  0.78165799  1.51273846  0.85283151  0.18807633]]


Do they match now?  If there's a small difference, why?

In [12]:
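# logistic regressions are solved by iteratively looking for the optimum solution -- the two
# optimizers use different algorithms and stopping tolerances, so they may stop at slightly
# different points near the minimum (and C is very large, not infinite, so a trace of
# regularization remains)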

Make a prediction of probability of admission for a student with GRE = 700, GPA = 3.8, prestige level 1.  Interpret your result.

*hint*: use the `predict_proba` method... 

In [36]:
# A single applicant as a one-row 2-D array.  Values follow the column order
# of X: gre, gpa, prestige_1.0, prestige_2.0, prestige_3.0.
applicant = np.array([[700, 3.8, 1, 0, 0]])
logreg.predict_proba(applicant)
# student predicted to be 65% likely to be admitted

array([[ 0.35036544,  0.64963456]])

Now use the `predict` method.  How do you think it is making the predictions?  (how does this relate to `predict_proba`)?

In [14]:
# predict expects a 2-D input of shape (n_samples, n_features), so the single
# applicant is passed as a one-row nested list (a bare 1-D list is deprecated
# and rejected by newer scikit-learn versions).  Values follow the column
# order of X: gre, gpa, prestige_1.0, prestige_2.0, prestige_3.0.
logreg.predict([[700, 3.8, 1, 0, 0]])
# if the predicted probability that admit = 1 is > 0.5, the row is classified as 1



array([1])

In [15]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

In [17]:
# base case: 5-fold cross-validation scores for the `logreg` estimator defined earlier
scores = cross_val_score(logreg, X, y, cv=5)
# Mean and standard deviation across the folds.  %-formatting prints the same
# "mean std" text as the Python 2 print statement and also runs on Python 3.
print("%s %s" % (scores.mean(), scores.std()))

0.705360212533 0.022308325047


In [33]:
# lasso (L1 penalty)
# Let LogisticRegressionCV pick C from 200 candidate values.
lasso_cv = LogisticRegressionCV(Cs=200, penalty='l1', solver='liblinear')
lasso_cv.fit(X, y)
best_c = lasso_cv.C_[0]

# The solver is given explicitly, matching the CV search above: the L1 penalty
# is not supported by every solver, so relying on the library default can fail.
lasso = LogisticRegression(penalty='l1', C=best_c, solver='liblinear')

# NOTE: best_c was selected using all rows of X, so these scores are not a
# fully held-out estimate for the tuned model.
scores = cross_val_score(lasso, X, y, cv=5)
# %-formatting prints the same "mean std" text as the Python 2 print
# statement and also runs on Python 3.
print("%s %s" % (scores.mean(), scores.std()))

0.705360212533 0.0192217788094


In [34]:
# ridge (L2 penalty)
# Let LogisticRegressionCV pick C from 20 candidate values.
ridge_cv = LogisticRegressionCV(Cs=20, penalty='l2')
ridge_cv.fit(X, y)
best_c = ridge_cv.C_[0]

ridge = LogisticRegression(penalty='l2', C=best_c)

# NOTE: best_c was selected using all rows of X, so these scores are not a
# fully held-out estimate for the tuned model.
scores = cross_val_score(ridge, X, y, cv=5)
# %-formatting prints the same "mean std" text as the Python 2 print
# statement and also runs on Python 3.
print("%s %s" % (scores.mean(), scores.std()))

0.702828566964 0.0264168672227


In [None]:
# not a huge difference!  
