In [None]:
#imports
import numpy as np
import pandas as pd

from statsmodels.tools import add_constant
from statsmodels.discrete.discrete_model import Logit

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

import matplotlib.pyplot as plt
plt.style.use("ggplot")

from roc_curve import *

## Part 1: ROC Curve

1. Write an ROC curve function to compute the above in `roc_curve.py`.

    See [roc_curve.py](roc_curve.py)

2. Run the above code to verify that it's working correctly.

In [None]:
# Check the ROC implementation on synthetic data. run_fake_data is not defined
# in this notebook; presumably it arrives via the star import from roc_curve — TODO confirm.
run_fake_data()

3. Let's see how the roc curve looks on a real dataset.

In [None]:
# Same check on a real dataset. run_loan_data is not defined in this notebook;
# presumably it arrives via the star import from roc_curve — TODO confirm.
run_loan_data()

## Part 2: Data Exploration: Graduate School Admissions

1. Load in the dataset into pandas: `data/grad.csv`.

In [None]:
# Load the graduate-admissions table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/grad.csv')

In [None]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

2. Use the pandas `describe` method to get some preliminary summary statistics on the data.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

3. Make a bar plot of the percent of applicants from each rank who were accepted.

In [None]:
# Count applicants by admit outcome (rows) and rank (columns), then divide each
# rank column by its own total so the bars show within-rank proportions.
admit = pd.crosstab(df['admit'], df['rank'], rownames=['admit'])
rank_totals = admit.apply(sum)
admit_share = admit / rank_totals
admit_share.plot(kind="bar", figsize=(12, 6));

4. What does the distribution of the GPA and GRE scores look like? Do the distributions differ much?

In [None]:
# One histogram per numeric column; the trailing semicolon hides the returned axes repr.
df.hist(figsize=(12, 8));

   *The distributions of GPA and GRE actually look quite similar: roughly normal, slightly skewed to the left (negative skew), and centered around the means of GPA and GRE computed above. For GPA there is also an anomalous bump near 4.0.*

5. One of the issues with classification can be unbalanced classes. What percentage of the data was admitted? Do you think this will be a problem?

In [None]:
# Fraction of rows carrying each admit label: per-label counts over the row total.
admit_counts = df['admit'].value_counts()
admit_counts / len(df)

   *Classes aren't too imbalanced so you should be fine.
    When dealing with data where the label could potentially be something that is biased one way or the other (such as acceptance, fraud, signups, anything where one label is more preferential to the other or deals with some measure of "success") you should verify. Actually, you should almost always verify.*

## Part 3: Predicting Graduate School Admissions

1. Use statsmodels to fit a Logistic Regression.

In [None]:
# Target vector and raw feature matrix, both as plain NumPy arrays.
y = df['admit'].values
X = df[['gre', 'gpa', 'rank']].values

# statsmodels does not add an intercept by itself, so a constant column is
# placed in front of the features before fitting.
X_const = add_constant(X, prepend=True)

logit_model = Logit(endog=y, exog=X_const).fit()

2. Use the `summary` method to see your results.

In [None]:
# Show the fitted coefficients with their standard errors and p-values.
logit_model.summary()

Note that the p-values are all smaller than 0.05, so we are very happy with this model.

3. Use sklearn's [KFold cross validation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html) to calculate the average accuracy, precision and recall.

In [None]:
# 10-fold cross-validation of a logistic regression on a training split,
# reporting accuracy, precision and recall averaged over the folds.
kfold = KFold(n_splits=10)

accuracies = []
precisions = []
recalls = []

# X and y were built with `.values`, so they (and the split pieces) are NumPy
# arrays. NumPy arrays have no `.iloc`; the fold indices from KFold.split are
# positional integer arrays and are applied with plain [...] indexing.
X_train, X_test, y_train, y_test = train_test_split(X, y)

for train_index, test_index in kfold.split(X_train):
    model = LogisticRegression(solver="lbfgs")
    model.fit(X_train[train_index], y_train[train_index])
    y_predict = model.predict(X_train[test_index])
    y_true = y_train[test_index]
    accuracies.append(accuracy_score(y_true, y_predict))
    precisions.append(precision_score(y_true, y_predict))
    recalls.append(recall_score(y_true, y_predict))

print("Accuracy:", np.average(accuracies))
print("Precision:", np.average(precisions))
print("Recall:", np.average(recalls))

4. The `rank` column is numerical, but as it has 4 buckets, we could also consider it to be categorical. Use pandas' [get_dummies](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html) to binarize the column.

In [None]:
# One indicator column per rank value. dtype=float keeps the indicators
# numeric on every pandas version: from pandas 2.0 the default dummy dtype is
# bool, and joining bool columns with the numeric gre/gpa columns would make
# `.values` an object-dtype array instead of a float matrix.
dummies = pd.get_dummies(df['rank'], prefix='rank', dtype=float)
# Keep the columns from 'rank_2' onward; the first rank column is left out as
# the baseline category.
X2 = df[['gre','gpa']].join(dummies.loc[:,'rank_2':]).values

5. Compute the same metrics as above. Does it do better or worse with the rank column binarized?

In [None]:
# Same cross-validated metrics as before, now on the feature matrix with the
# rank column replaced by indicator columns (X2). Reuses the `kfold` splitter
# created in the earlier cross-validation cell.
accuracies = []
precisions = []
recalls = []

# X2 and y were built with `.values`, so the split pieces are NumPy arrays.
# NumPy arrays have no `.iloc`; index them positionally with [...] instead.
X2_train, X2_test, y_train, y_test = train_test_split(X2, y)

for train_index, test_index in kfold.split(X2_train):
    model = LogisticRegression(solver="lbfgs", max_iter=500)
    model.fit(X2_train[train_index], y_train[train_index])
    y_predict = model.predict(X2_train[test_index])
    y_true = y_train[test_index]
    accuracies.append(accuracy_score(y_true, y_predict))
    precisions.append(precision_score(y_true, y_predict))
    recalls.append(recall_score(y_true, y_predict))

print("Accuracy:", np.average(accuracies))
print("Precision:", np.average(precisions))
print("Recall:", np.average(recalls))

    *It seems to perform worse.*

6. Make a plot of the ROC curve (using your function defined in Part 1).

In [None]:
# plotting helper function
def simple_plot(ax, x, y, x_label, y_label, title):
    """Draw y against x on the given axes and label the result.

    Parameters
    ----------
    ax : axes-like object providing plot, set_xlabel, set_ylabel and set_title.
    x, y : data passed straight through to ``ax.plot``.
    x_label, y_label : axis label texts.
    title : title text for the axes.
    """
    ax.plot(x, y)
    labelling = (
        (ax.set_xlabel, x_label),
        (ax.set_ylabel, y_label),
        (ax.set_title, title),
    )
    for apply_text, text in labelling:
        apply_text(text)

In [None]:
# Fresh hold-out split: fit on the training part, score the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
# Second predict_proba column = probability of the second entry of model.classes_.
probabilities = model.predict_proba(X_test)[:, 1]
# roc_curve is not imported from sklearn here; presumably it is the function
# from the local roc_curve module (star import) — verify its return order there.
tpr, fpr, thresholds = roc_curve(probabilities, y_test)

fig, ax = plt.subplots(figsize=(12, 6))

simple_plot(
    ax,
    x=fpr,
    y=tpr,
    x_label="False Positive Rate (1 - Specificity)",
    y_label="True Positive Rate (Sensitivity, Recall)",
    title="ROC Plot of Admissions Data",
)

7. Is it possible to pick a threshold where TPR > 60% and FPR < 40%? What is the threshold?

    *Yes. We can get a TPR of 62.5% and FPR of 33.8% with a threshold of 0.3617.*

    *Answers may vary!*

8. Say we are using this as a first step in the application process. We want to weed out clearly unqualified candidates, but not reject too many candidates. What might be a good choice of threshold?

    *We want to maximize the TPR and don't care as much about the FPR. With a threshold of 0.222, we can get a TPR of 96.9%. The FPR will be 73.5%, but we are okay with this sacrifice in order to avoid false negatives.*

## Part 4: Interpreting the beta coefficients with the Odds Ratio

1. Fit a Logistic Regression model on all the data. What are the beta coefficients? You should have 3 values.

In [None]:
# Fit on the full data set (no hold-out) to read off the coefficients.
model = LogisticRegression(solver='lbfgs')
model.fit(X, y)

# Label each coefficient with the column list, in the same order, that X was
# built from, rather than slicing df.columns positionally (which silently
# depends on the column order of the CSV).
feature_names = ['gre', 'gpa', 'rank']
for name, coef in zip(feature_names, model.coef_[0]):
    print("{0}: {1:0.4f}".format(name, coef))

2. Compute the change in odds ratio from a one unit change in each feature.

In [None]:
# exp(beta) is the factor by which the odds are multiplied per one-unit
# increase of the corresponding feature.
for position, beta in enumerate(model.coef_[0], start=1):
    odds_ratio = np.exp(beta)
    print("beta{0}: {1:0.5f}".format(position, odds_ratio))

3. Write a sentence for each of the three features.

    *Increasing the GRE score by 1 point multiplies the odds of getting in by a factor of 1.00189.*

    *Increasing the GPA score by 1 point multiplies the odds of getting in by a factor of 1.37614.*

    *Improving the school's rank by 1 point (which means decreasing the number) multiplies the odds of getting in by a factor of 1/0.54587=1.8319.*

4. What change is required to double my chances of admission?

In [None]:
# Solving exp(beta * delta) = 2 for delta gives delta = ln(2) / beta: the
# change in each feature that doubles the odds.
log_of_two = np.log(2)
for position, beta in enumerate(model.coef_[0], start=1):
    print("beta{0}: {1:0.5f}".format(position, log_of_two / beta))

   *Increasing the GRE score by 367 points doubles the odds of getting in.*

   *Increasing the GPA score by 2.17 points doubles the odds of getting in.*

   *Decreasing (improving) the school rank by 1.14 doubles the odds of getting in.*

## Part 5: Predicted Probabilities

Now let's actually play with our data to verify what we calculated above with the Odds Ratio.

1. Create a new feature matrix which has four rows. It should have each of the four possible values for the rank and keep the GRE and GPA values fixed. Use the mean value as a reasonable value to fix them at.

In [None]:
ranks = [1, 2, 3, 4]
gre = df['gre'].mean()
gpa = df['gpa'].mean()
# One row per rank value, with GRE and GPA pinned at their sample means.
feature_matrix = [[gre, gpa, rank] for rank in ranks]
X_rank = np.array(feature_matrix)

2. Fit the Logistic Regression model on all the data and then use the model's `predict_proba` method to compute the predicted probabilities of this fake feature matrix. Also include the odds (`p/(1-p)`).

In [None]:
# Second predict_proba column, one value per row of X_rank.
probabilities_rank = model.predict_proba(X_rank)[:, 1]
for rank, prob in zip(ranks, probabilities_rank):
    odds = prob / (1 - prob)
    print("rank: {0}, probability: {1:0.5f}, odds: {2:0.5f}".format(rank, prob, odds))

3. Make a plot of the rank vs the probability.

In [None]:
# Predicted probability at each rank value.
fig, ax = plt.subplots(figsize=(12, 6))

simple_plot(
    ax,
    x=ranks,
    y=probabilities_rank,
    x_label="Rank",
    y_label="Probability",
    title="Affect of Modifying the Rank on Probability of Acceptance",
)

In [None]:
# Another function to make things look nicer
def double_odds_plot(x, y, x_label):
    """Plot y against x in two side-by-side panels: as given, and log-transformed.

    Parameters
    ----------
    x : values for the horizontal axis of both panels.
    y : odds values; drawn directly on the left panel and as ``np.log(y)`` on the right.
    x_label : name of the varied feature, used in both x-axis labels and both titles.
    """
    fig, (odds_ax, log_odds_ax) = plt.subplots(1, 2, figsize=(16, 6))

    simple_plot(
        odds_ax, x, y, x_label, "Odds",
        "Affect of Modifying the {0} on Odds of Acceptance".format(x_label),
    )
    simple_plot(
        log_odds_ax, x, np.log(y), x_label, "Log Odds",
        "Affect of Modifying the {0} on Log Odds of Acceptance".format(x_label),
    )

4. Make a plot of the rank vs the odds.

5. Since we really care about how a linear change in rank changes the probability by a multiplicative factor, we should do a graph of the rank vs the log of the odds.

In [None]:
# Odds = p / (1 - p), element-wise over the predicted probabilities.
odds_rank = probabilities_rank / (1 - probabilities_rank)

double_odds_plot(x=ranks, y=odds_rank, x_label="Rank")

   Note that the slope of this line is approximately `(-1.7 - 0) / (4 - 1) = -0.567`, which is approximately the beta coefficient.

6. Do the same analysis (#1-5) with the GRE and GPA scores. Each time, create a feature matrix with the other two columns fixed at the mean and every possible value of the column in question.

    ***Measuring the effect of modifying the GRE score on the probability of acceptance***

In [None]:
# Hold GPA and rank at their sample means and sweep GRE over every integer
# between the observed minimum and maximum (inclusive).
rank = df['rank'].mean()
gpa = df['gpa'].mean()
gres = range(df['gre'].min(), df['gre'].max() + 1)
feature_matrix = [[gre, gpa, rank] for gre in gres]
X_gre = np.array(feature_matrix)

probabilities_gre = model.predict_proba(X_gre)[:, 1]
for gre, prob in zip(gres, probabilities_gre):
    odds = prob / (1 - prob)
    print("gre: {0}, probability: {1:0.5f}, odds: {2:0.5f}".format(gre, prob, odds))

In [None]:
# Predicted probability across the swept GRE values.
fig, ax = plt.subplots(figsize=(12, 6))

simple_plot(
    ax,
    x=gres,
    y=probabilities_gre,
    x_label="GRE",
    y_label="Probability",
    title="Affect of Modifying the GRE on Probability of Acceptance",
)

In [None]:
# Odds = p / (1 - p), element-wise over the predicted probabilities.
odds_gre = probabilities_gre / (1 - probabilities_gre)

double_odds_plot(x=gres, y=odds_gre, x_label="GRE")

   Note that the slope of this line is approximately `(-.4 + 1.55) / (800 - 220) = 0.00198`, which is approximately the beta coefficient.

    ***Measuring the effect of modifying the GPA score on the probability of acceptance***

In [None]:
# Hold GRE and rank at their sample means and sweep GPA over whole numbers,
# from the floor of the observed minimum up to (not including) the ceiling of
# the observed maximum plus one.
rank = df['rank'].mean()
gre = df['gre'].mean()
gpa_start = int(np.floor(df['gpa'].min()))
gpa_stop = int(np.ceil(df['gpa'].max() + 1))
gpas = range(gpa_start, gpa_stop)
feature_matrix = [[gre, gpa, rank] for gpa in gpas]
X_gpa = np.array(feature_matrix)

probabilities_gpa = model.predict_proba(X_gpa)[:, 1]
for gpa, prob in zip(gpas, probabilities_gpa):
    odds = prob / (1 - prob)
    print("gpa: {0}, probability: {1:0.5f}, odds: {2:0.5f}".format(gpa, prob, odds))

In [None]:
# Predicted probability across the swept GPA values.
fig, ax = plt.subplots(figsize=(12, 6))

simple_plot(
    ax,
    x=gpas,
    y=probabilities_gpa,
    x_label="GPA",
    y_label="Probability",
    title="Affect of Modifying the GPA on Probability of Acceptance",
)

In [None]:
# Odds = p / (1 - p), element-wise over the predicted probabilities.
odds_gpa = probabilities_gpa / (1 - probabilities_gpa)

double_odds_plot(x=gpas, y=odds_gpa, x_label="GPA")