In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import pylab as pl
import numpy as np

In [2]:
# Load the admissions data and drop every row that has a missing value,
# keeping the untouched frame in df_raw.
df_raw = pd.read_csv('DAT-NYC-4.18.17/projects/unit-projects/project-3/assets/admissions.csv')
df = df_raw.dropna()
# Call-style print gives the same text under Python 2 and Python 3.
print(df.head())

   admit    gre   gpa  prestige
0      0  380.0  3.61       3.0
1      1  660.0  3.67       3.0
2      1  800.0  4.00       1.0
3      1  640.0  3.19       4.0
4      0  520.0  2.93       4.0


#### 1.1 Create a frequency table for prestige and whether or not someone was admitted

In [3]:
# Frequency table: admit outcome down the rows, prestige level across the columns.
admitted_prestige = pd.crosstab(df["admit"], df["prestige"])

admitted_prestige

prestige,1.0,2.0,3.0,4.0
admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,28,95,93,55
1,33,53,28,12


#### 2.1 Create dummy variables for prestige

In [4]:
# One indicator column per prestige level, named prestige_<level>.
# astype(int) keeps the indicators as 0/1 integers even on pandas versions
# whose get_dummies returns booleans by default.
dummies_prestige = pd.get_dummies(df['prestige'], prefix='prestige').astype(int)

# Place the indicator columns next to the original columns (column-wise concat).
df_complete = pd.concat([df, dummies_prestige], axis=1)

# Only the last expression of a cell is displayed, so preview the combined frame here.
df_complete.head()

Unnamed: 0,admit,gre,gpa,prestige,prestige_1.0,prestige_2.0,prestige_3.0,prestige_4.0
0,0,380.0,3.61,3.0,0,0,1,0
1,1,660.0,3.67,3.0,0,0,1,0
2,1,800.0,4.0,1.0,1,0,0,0
3,1,640.0,3.19,4.0,0,0,0,1
4,0,520.0,2.93,4.0,0,0,0,1


#### 2.2 When modeling our class variables, how many do we need?

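We only need three (k − 1 indicators for k categories), since the fourth can be inferred from the other three: if prestige is not 1, 2, or 3, it must be 4. Including all four together with an intercept would make the columns perfectly collinear.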

#### 3.1 Hand Calculating Odds Ratios: Using the crosstab from above, calculate the odds of being admitted to grad school if you attended a #1 ranked college

From above, if you attended a #1 ranked college, the odds are 33 to 28 that you will be admitted.

#### 3.2 Now calculate the odds of admission if you did not attend a #1 ranked college

In [5]:
# Counts for applicants from schools ranked 2, 3 and 4, copied from the
# prestige/admit frequency table.
admitted_not_top = 53 + 28 + 12   # admit == 1
rejected_not_top = 95 + 93 + 55   # admit == 0
# Call-style print works under both Python 2 and Python 3.
print(admitted_not_top)
print(rejected_not_top)


93
243


If you did not attend a #1 ranked college, the odds of admission are 93 to 243.

#### 3.3: Calculate the odds ratio

In [6]:
# Odds ratio = (odds for prestige 1) / (odds for prestige 2-4)
#            = (33 / 28) / (93 / 243), rearranged as 33 * 243 / 28 / 93.
# float() forces true division under Python 2.
float(33 * 243) / float(28) / float(93)


3.079493087557604

#### 3.4 Write this finding in a sentence

The odds of admission are approximately three times as high (odds ratio ≈ 3.08) if you attended a #1 ranked college as they are if you did not.

#### 3.5 Print the cross tab for prestige_4

In [7]:
# Frequency table: prestige_4.0 indicator down the rows, admit outcome across the columns.
prestige4 = pd.crosstab(index=df_complete['prestige_4.0'],
                        columns=df_complete['admit'])

prestige4

admit,0,1
prestige_4.0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,216,114
1,55,12


#### 3.6 Calculate the OR

In [8]:
# Odds ratio = (odds for prestige 4) / (odds for all other schools)
#            = (12 / 55) / (114 / 216), rearranged as 12 / 55 * 216 / 114.
# float() forces true division under Python 2.
float(12) / float(55) * float(216) / float(114)

0.4133971291866029

#### 3.7 Write this finding in a sentence

The odds of admission are about 41% as high if your school was prestige_4 (the lowest tier) as they are if it was not.

## Part 4: Analysis

In [9]:
# Create a clean data frame for the regression
cols_to_keep = ['admit', 'gre', 'gpa']
# Join only prestige_2.0 onwards: prestige_1.0 is left out and becomes the baseline.
# .loc label slicing replaces .ix, which is deprecated and was removed in pandas 1.0.
data = df[cols_to_keep].join(dummies_prestige.loc[:, 'prestige_2.0':])
# Call-style print works under both Python 2 and Python 3.
print(data.head())

   admit    gre   gpa  prestige_2.0  prestige_3.0  prestige_4.0
0      0  380.0  3.61             0             1             0
1      1  660.0  3.67             0             1             0
2      1  800.0  4.00             0             0             0
3      1  640.0  3.19             0             0             1
4      0  520.0  2.93             0             0             1


In [10]:
# Constant column of ones: sm.Logit does not add an intercept term by itself.
data.loc[:, 'intercept'] = 1.0

#### 4.1 Set the covariates to a variable called train_cols

In [11]:
# Every column except the first one ('admit', the response) is a covariate.
train_cols = data.columns[1:]

# Call-style print works under both Python 2 and Python 3.
print(train_cols)

Index([u'gre', u'gpa', u'prestige_2.0', u'prestige_3.0', u'prestige_4.0',
       u'intercept'],
      dtype='object')


#### 4.2 Fit the model

In [12]:
# Logistic regression of the admit outcome on the covariates listed in train_cols.
logit = sm.Logit(endog=data['admit'], exog=data[train_cols])

# Fit the model; statsmodels prints the optimizer's termination report.
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.573854
         Iterations 6


#### 4.3 Print the summary results

In [13]:
# Call-style print works under both Python 2 and Python 3.
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  397
Model:                          Logit   Df Residuals:                      391
Method:                           MLE   Df Model:                            5
Date:                Thu, 25 May 2017   Pseudo R-squ.:                 0.08166
Time:                        18:24:04   Log-Likelihood:                -227.82
converged:                       True   LL-Null:                       -248.08
                                        LLR p-value:                 1.176e-07
                   coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
gre              0.0022      0.001      2.028      0.043      7.44e-05     0.004
gpa              0.7793      0.333      2.344      0.019         0.128     1.431
prestige_2.0    -0.6801      0.317     -2.14

#### 4.4 Calculate the odds ratios of the coefficients and their 95% CI intervals

In [14]:
# Exponentiating the fitted coefficients (log-odds) gives odds ratios.
# Call-style print works under both Python 2 and Python 3.
print(np.exp(result.params))

gre             1.002221
gpa             2.180027
prestige_2.0    0.506548
prestige_3.0    0.262192
prestige_4.0    0.211525
intercept       0.020716
dtype: float64


In [15]:
# NOTE(review): these bounds are for the raw coefficients (log-odds scale), not
# for the odds ratios; wrap in np.exp(...) to get odds-ratio intervals.
# Call-style print works under both Python 2 and Python 3.
print(result.conf_int())

                     0         1
gre           0.000074  0.004362
gpa           0.127619  1.431056
prestige_2.0 -1.301337 -0.058936
prestige_3.0 -2.014579 -0.662776
prestige_4.0 -2.371624 -0.735197
intercept    -6.116077 -1.637631


#### 4.5 Interpret the OR of Prestige_2

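Holding GRE and GPA constant, the odds of admission for an applicant from a prestige-2 school are about half (odds ratio ≈ 0.51, i.e. roughly 49% lower) those of an applicant from a prestige-1 school, which is the omitted baseline category.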

#### 4.6 Interpret the OR of GPA

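Holding the other covariates constant, each one-point increase in GPA (for example from 3.0 to 4.0) multiplies the odds of admission by about 2.18, i.e. an increase of roughly 118%.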

## Part 5: Predicted Probabilities

As a way of evaluating our classifier, we're going to recreate the dataset with every logical combination of input values. This will allow us to see how the predicted probability of admission increases or decreases across different variables. First we're going to generate the combinations using a helper function called cartesian (defined in the next code cell).

We're going to use np.linspace to create a range of values for "gre" and "gpa". This creates a range of linearly spaced values from a specified min and maximum value--in our case just the min/max observed values.

In [16]:
def cartesian(arrays, out=None):
    """
    Generate a cartesian product of input arrays.

    Works under both Python 2 and Python 3.

    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray, optional
        Array to place the cartesian product in. When omitted, a new array
        is allocated whose dtype can hold the values of every input array.

    Returns
    -------
    out : ndarray
        2-D array of shape (M, len(arrays)) containing cartesian products
        formed of input arrays, where M is the product of the input sizes.
        If any input array is empty, M is 0.

    Examples
    --------
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])
    """

    arrays = [np.asarray(x) for x in arrays]

    # Total number of rows; a plain int so it can be used in slices below.
    n = int(np.prod([x.size for x in arrays]))
    if out is None:
        # Use a dtype wide enough for every input, so that e.g. float arrays
        # are not truncated when the first array happens to be integer.
        dtype = np.result_type(*arrays)
        out = np.zeros([n, len(arrays)], dtype=dtype)

    # An empty input array means an empty product (and avoids dividing by zero).
    if n == 0:
        return out

    # Each value of the first array fills a block of m consecutive rows.
    # Floor division keeps m an integer under Python 3.
    m = n // arrays[0].size
    out[:, 0] = np.repeat(arrays[0], m)
    if arrays[1:]:
        # Fill the remaining columns of the first block recursively (out[0:m, 1:]
        # is a view, so the recursive call writes straight into `out`) ...
        cartesian(arrays[1:], out=out[0:m, 1:])
        # ... then copy that block into every other block.
        for j in range(1, arrays[0].size):
            out[j*m:(j+1)*m, 1:] = out[0:m, 1:]
    return out

In [17]:
# instead of generating all possible values of GRE and GPA, we're going
# to use an evenly spaced range of 10 values from the min to the max
gres = np.linspace(data['gre'].min(), data['gre'].max(), 10)
# Call-style print works under both Python 2 and Python 3.
print(gres)
# array([ 220.        ,  284.44444444,  348.88888889,  413.33333333,
#         477.77777778,  542.22222222,  606.66666667,  671.11111111,
#         735.55555556,  800.        ])
gpas = np.linspace(data['gpa'].min(), data['gpa'].max(), 10)
print(gpas)
# array([ 2.26      ,  2.45333333,  2.64666667,  2.84      ,  3.03333333,
#         3.22666667,  3.42      ,  3.61333333,  3.80666667,  4.        ])


# enumerate all possibilities:
# every gre x gpa x prestige level combination, plus a constant 1.0 column
combos = pd.DataFrame(cartesian([gres, gpas, [1, 2, 3, 4], [1.]]))

[ 220.          284.44444444  348.88888889  413.33333333  477.77777778
  542.22222222  606.66666667  671.11111111  735.55555556  800.        ]
[ 2.26        2.45333333  2.64666667  2.84        3.03333333  3.22666667
  3.42        3.61333333  3.80666667  4.        ]


#### 5.1 Recreate the dummy variables
Keep only what we need for making predictions

In [18]:
# Name the four columns in the order the grid was built.
combos.columns = ['gre', 'gpa', 'prestige', 'intercept']

# astype(int) keeps the indicators as 0/1 integers even on pandas versions
# whose get_dummies returns booleans by default.
dummy_ranks = pd.get_dummies(combos['prestige'], prefix='prestige').astype(int)
# Rename so the columns match the names the model was trained on.
dummy_ranks.columns = ['prestige_1.0', 'prestige_2.0', 'prestige_3.0', 'prestige_4.0']

#Keep only what we need:
cols_crucial = ['gre', 'gpa', 'prestige', 'intercept']
# Join only prestige_2.0 onwards, leaving prestige_1.0 out as the baseline.
# .loc label slicing replaces .ix, which is deprecated and was removed in pandas 1.0.
combos = combos[cols_crucial].join(dummy_ranks.loc[:, 'prestige_2.0':])

#### 5.2 Make predictions on the enumerated dataset

In [22]:
# Predict for every enumerated row; selecting train_cols passes the covariates
# in the same column order that was used to fit the model.
predicted_admit = result.predict(combos[train_cols])
combos['admit_predictor'] = predicted_admit

combos.head()

Unnamed: 0,gre,gpa,prestige,intercept,prestige_2.0,prestige_3.0,prestige_4.0,admit_predictor
0,220.0,2.26,1.0,1.0,0,0,0,0.164173
1,220.0,2.26,2.0,1.0,1,0,0,0.090492
2,220.0,2.26,3.0,1.0,0,1,0,0.048977
3,220.0,2.26,4.0,1.0,0,0,1,0.03989
4,220.0,2.453333,1.0,1.0,0,0,0,0.185907


#### 5.3 Interpret findings for the last 4 observations

In [21]:

# Last four rows of the enumerated grid.
combos.iloc[-4:]

Unnamed: 0,gre,gpa,prestige,intercept,prestige_2.0,prestige_3.0,prestige_4.0,admit_predictor
396,800.0,4.0,1.0,1.0,0,0,0,0.73404
397,800.0,4.0,2.0,1.0,1,0,0,0.582995
398,800.0,4.0,3.0,1.0,0,1,0,0.419833
399,800.0,4.0,4.0,1.0,0,0,1,0.368608


Interpretation: if an applicant has a perfect GRE and GPA, then if they went to:

Tier 1 School: 73% chance of admittance

Tier 2 School: 58% chance of admittance

Tier 3 School: 42% chance of admittance

Tier 4 School: 37% chance of admittance.

## Bonus
Plot the probability of being admitted into graduate school, stratified by GPA and GRE score.