# Load Data and Final Clean

In [53]:
# Read the cleaned data set. The first column is discarded (presumably a
# row-index column written on export -- TODO confirm).
df <- read.csv(file = "cleaned_data_0210.csv", header = TRUE, sep = ",")
df <- df[, -1]
# Show the remaining column names and the (rows, columns) dimensions.
colnames(df)
dim(df)

In [54]:
# Count the rows whose Valid.term equals 0.
sum(df$Valid.term==0)

Scale and center some variables:

In [55]:
# Standardize start age, net cost and start year: subtract the sample mean and
# divide by the sample standard deviation. The means and SDs are stored in
# their own variables; the standardized columns are added to df with an
# "s_" prefix.
mean_age <- mean(df$Start_age)
sd_age <- sd(df$Start_age)
mean_net <- mean(df$net_cost)
sd_net <- sd(df$net_cost)
mean_year <- mean(df$Start_year)
sd_year <- sd(df$Start_year)

df$s_start_age <- (df$Start_age - mean_age) / sd_age
df$s_net_cost <- (df$net_cost - mean_net) / sd_net
df$s_start_year <- (df$Start_year - mean_year) / sd_year

In [56]:
# Net cost per valid term. Rows whose Valid.term is not positive keep the raw
# net_cost, which avoids dividing by zero. The result is then standardized
# (mean 0, SD 1) and stored as df$avs_net_cost.
av_net_cost <- ifelse(
  df$Valid.term > 0,
  df$net_cost / df$Valid.term,
  df$net_cost
)
av_mean_net <- mean(av_net_cost)
av_sd_net <- sd(av_net_cost)
df$avs_net_cost <- (av_net_cost - av_mean_net) / av_sd_net

## Separate by School

Different Schools

In [57]:
# Distinct College values, obtained as the levels of the column as a factor.
schools <- levels(factor(df$College))

In [58]:
# Show the column names of the Harpur subset.
# NOTE(review): within this file Harpur_df is first assigned in the cell that
# follows this one, so a fresh top-to-bottom run would stop here with
# "object 'Harpur_df' not found" unless it was created earlier -- consider
# moving this call below that assignment.
colnames(Harpur_df)

In [59]:
# Split the data into one data frame per school, based on College.
# NOTE(review): Harpur is selected by position (the first four entries of
# `schools`); presumably these are the Harpur colleges -- verify against the
# contents of `schools`, since a new or renamed College value would shift them.
Harpur_df <- df[df$College %in% schools[1:4], ]
Management_df <- df[df$College == "GD Management", ]
Nursing_df <- df[df$College == "GD Nursing", ]
Watson_df <- df[df$College == "GD Watson", ]

In [60]:
# List the distinct Major values present in the Harpur subset.
unique(Harpur_df$Major)

In [61]:
# Major names assigned to the Harpur "sciences" group. Spellings (including
# truncated ones) must match the Major column exactly.
# Note: these lower-case vectors differ from the Harpur_sciences / Harpur_arts
# data frames only by the case of the first letter.
harpur_sciences <- c(
  "Chemistry", "Behavioral Neuroscience", "Cognitive Psychology",
  "Physics", "Mathematics", "Biological Sciences",
  "Materials Science and Engineer", "Material Sci and Eng-Engineerg",
  "Clinical Psychology", "Economics", "Geology", "Political Science",
  "Materials Science and Engr", "Biological Sciences EEB", "X Geology",
  "Mathematics/Economics"
)
# Major names assigned to the Harpur "arts" group.
harpur_arts <- c(
  "Comparative Literature", "Translation Studies",
  "Educational Theory and Practic", "Philosophy Interpretation and",
  "English", "History", "Anthropology", "Sociology", "Philosophy SPEL",
  "Community and Public Affairs", "Art History",
  "Art History Combined MA PhD"
)

In [62]:
# Split the Harpur rows into sciences and arts according to the major lists.
# Rows whose Major is in neither list end up in neither subset.
Harpur_sciences <- Harpur_df[Harpur_df$Major %in% harpur_sciences, ]
Harpur_arts <- Harpur_df[Harpur_df$Major %in% harpur_arts, ]

In [63]:
# Make "White" the reference (baseline) level of Citizenship in every school
# subset, so model coefficients for the other levels are contrasts against it.
# as.factor() is applied first because relevel() only accepts factors, and
# read.csv() returns character columns by default from R 4.0 on. For a column
# that is already a factor as.factor() returns it unchanged, so existing
# levels (including unused ones) are preserved.
Harpur_sciences$Citizenship <- relevel(as.factor(Harpur_sciences$Citizenship),
                                       ref = "White")
Harpur_arts$Citizenship <- relevel(as.factor(Harpur_arts$Citizenship),
                                   ref = "White")
Management_df$Citizenship <- relevel(as.factor(Management_df$Citizenship),
                                     ref = "White")
Watson_df$Citizenship <- relevel(as.factor(Watson_df$Citizenship),
                                 ref = "White")
Nursing_df$Citizenship <- relevel(as.factor(Nursing_df$Citizenship),
                                  ref = "White")

In [64]:
# Row counts of the two Harpur groups and of the full Harpur subset.
nrow(Harpur_sciences)
nrow(Harpur_arts)
nrow(Harpur_df)

In [65]:
# Per-school counts. Prefix: a = Harpur sciences, b = Harpur arts,
# c = Management, d = Nursing, e = Watson.
# *_n: number of rows in the subset.
a_n <- nrow(Harpur_sciences)
b_n <- nrow(Harpur_arts)
c_n <- nrow(Management_df)
d_n <- nrow(Nursing_df)
e_n <- nrow(Watson_df)
# *_1: sum of Y (reported below as successful graduates).
a_1 <- sum(Harpur_sciences$Y)
b_1 <- sum(Harpur_arts$Y)
c_1 <- sum(Management_df$Y)
d_1 <- sum(Nursing_df$Y)
e_1 <- sum(Watson_df$Y)
# *_0: the remainder (reported below as unsuccessful graduates).
a_0 <- a_n - a_1
b_0 <- b_n - b_1
c_0 <- c_n - c_1
d_0 <- d_n - d_1
e_0 <- e_n - e_1

School <- c("Harpur sciences", "Harpur_arts", "SOM", "Nursing", "Watson")
sch_total <- c(a_n, b_n, c_n, d_n, e_n)
succ_total <- c(a_1, b_1, c_1, d_1, e_1)
unsucc_total <- c(a_0, b_0, c_0, d_0, e_0)
# data.frame() turns the names containing spaces into syntactic names
# (spaces become dots) in the displayed table.
data.frame(
  "School" = School,
  "Total" = sch_total,
  "Successful Graduates" = succ_total,
  "Unsuccessful Graduates" = unsucc_total
)

School,Total,Successful.Graduates,Unsuccessful.Graduates
Harpur sciences,861,542,319
Harpur_arts,918,534,384
SOM,58,38,20
Nursing,117,72,45
Watson,515,343,172


# Harpur Sciences Analysis

The Degree column is not used because it does not accurately reflect the student's major.

<b> Logistic regression model:

In [66]:
# Full logistic regression (binomial GLM) of Y on all candidate predictors,
# fitted to the Harpur sciences subset, followed by its summary.
log_mod_h_s <- glm(
  Y ~ s_start_age + factor(Gender) + factor(funding_indicator) +
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 +
    factor(gap_indicator) + GPA_trend + s_start_year,
  family = "binomial",
  data = Harpur_sciences
)
summary(log_mod_h_s)


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year, family = "binomial", data = Harpur_sciences)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.2237  -0.9486   0.5570   0.8600   1.8610  

Coefficients:
                                                      Estimate Std. Error
(Intercept)                                           -0.46872    0.26030
s_start_age                                            0.14235    0.14822
factor(Gender)Male                                     0.01721    0.16480
factor(funding_indicator)1                             1.29071    0.25627
factor(Citizenship)Africa                              0.97649    0.47506
factor(Citizenship)American Indian or Alaskan Native -14.07117  477.76347
factor(Citizenship)Asia                                0.29551    0.35407
factor(Citizenship)Asian           

<b> Significant Variables of logistic regression model: 

In [17]:
# Coefficients of the full Harpur sciences model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_h_s))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,1.29071,0.0
factor(Citizenship)Asian,-1.31525,0.03228
factor(Citizenship)Hispanic or Latino,-1.72355,0.00372
factor(Citizenship)White,-0.97649,0.03983
GPA_trend,1.15137,0.00717
s_start_year,-1.33429,0.0


Things we need
-  Run AIC/BIC stepwise model selection for Harpur and summarize results
-  Repeat for the other schools

<b>Stepwise AIC selection (both directions, starting from the full model):</b>

In [69]:
# MASS provides stepAIC().
library("MASS")
# Stepwise search by AIC starting from the full Harpur sciences model.
# direction = "both" lets previously dropped terms be re-added at later steps.
log_mod_h_s_aic <- stepAIC(log_mod_h_s, direction = "both")

Start:  AIC=993.97
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                            Df Deviance     AIC
- factor(Gender)             1   947.98  991.98
- PC1                        1   948.09  992.09
- PC3                        1   948.10  992.10
- PC2                        1   948.53  992.53
- s_start_age                1   948.90  992.90
- factor(Citizenship)       12   971.87  993.87
<none>                           947.97  993.97
- avs_net_cost               1   950.34  994.34
- factor(gap_indicator)      1   951.71  995.71
- GPA_trend                  1   955.97  999.97
- factor(funding_indicator)  1   973.97 1017.97
- s_start_year               1  1068.24 1112.24

Step:  AIC=991.98
Y ~ s_start_age + factor(funding_indicator) + factor(Citizenship) + 
    avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year


<b>Significant variables in the AIC-selected model:</b>

In [70]:
# Coefficients of the AIC-selected Harpur sciences model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_h_s_aic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
(Intercept),-0.46493,0.04864
factor(funding_indicator)1,1.24639,0.0
factor(Citizenship)Africa,1.02875,0.02585
factor(Citizenship)China,0.52352,0.01654
GPA_trend,1.10181,0.009
s_start_year,-1.32667,0.0


<b>Stepwise BIC selection (both directions, starting from the full model):</b>

In [71]:
# Stepwise search with a BIC penalty: k = log(n) instead of the AIC value 2.
# n comes from the fitted model (nobs), so the penalty matches the
# observations glm() actually used rather than a row count held in a global
# from another cell. stepAIC's trace still labels the criterion "AIC".
log_mod_h_s_bic <- stepAIC(log_mod_h_s, direction = "both",
                           k = log(nobs(log_mod_h_s)))

Start:  AIC=1103.4
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                            Df Deviance    AIC
- factor(Citizenship)       12   971.87 1046.2
- factor(Gender)             1   947.98 1096.7
- PC1                        1   948.09 1096.8
- PC3                        1   948.10 1096.8
- PC2                        1   948.53 1097.2
- s_start_age                1   948.90 1097.6
- avs_net_cost               1   950.34 1099.0
- factor(gap_indicator)      1   951.71 1100.4
<none>                           947.97 1103.4
- GPA_trend                  1   955.97 1104.6
- factor(funding_indicator)  1   973.97 1122.7
- s_start_year               1  1068.24 1216.9

Step:  AIC=1046.21
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                

<b>Significant variables in the BIC-selected model:</b>

In [72]:
# Coefficients of the BIC-selected Harpur sciences model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_h_s_bic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,1.18085,0
s_start_year,-1.38659,0


# Harpur Arts Analysis

<b> Logistic regression model:

In [73]:
# Full logistic regression (binomial GLM) of Y on all candidate predictors,
# fitted to the Harpur arts subset, followed by its summary.
log_mod_h_a <- glm(
  Y ~ s_start_age + factor(Gender) + factor(funding_indicator) +
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 +
    factor(gap_indicator) + GPA_trend + s_start_year,
  family = "binomial",
  data = Harpur_arts
)
summary(log_mod_h_a)


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year, family = "binomial", data = Harpur_arts)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1252  -1.1704   0.7618   1.0063   1.6164  

Coefficients:
                                                        Estimate Std. Error
(Intercept)                                            -0.340435   0.166798
s_start_age                                             0.105559   0.070484
factor(Gender)Male                                     -0.006913   0.144635
factor(funding_indicator)1                              0.695422   0.172367
factor(Citizenship)Africa                              -0.389059   0.530128
factor(Citizenship)American Indian or Alaskan Native   -0.177376   0.883738
factor(Citizenship)Asia                                 0.693675   0.236959
factor(Citizenship)Asia

<b> Significant Variables of logistic regression model: 

In [74]:
# Coefficients of the full Harpur arts model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_h_a))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
(Intercept),-0.34043,0.04125
factor(funding_indicator)1,0.69542,5e-05
factor(Citizenship)Asia,0.69367,0.00342
s_start_year,-0.52053,0.0


<b>Stepwise AIC selection (both directions, starting from the full model):</b>

In [75]:
# Stepwise search by AIC starting from the full Harpur arts model.
# direction = "both" lets previously dropped terms be re-added at later steps.
log_mod_h_a_aic <- stepAIC(log_mod_h_a, direction = "both")

Start:  AIC=1216.17
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                            Df Deviance    AIC
- factor(Citizenship)       13   1184.6 1206.6
- factor(Gender)             1   1168.2 1214.2
- PC2                        1   1168.2 1214.2
- avs_net_cost               1   1168.2 1214.2
- factor(gap_indicator)      1   1168.9 1214.9
- PC3                        1   1169.3 1215.3
- GPA_trend                  1   1169.8 1215.8
<none>                           1168.2 1216.2
- PC1                        1   1170.2 1216.2
- s_start_age                1   1170.4 1216.4
- factor(funding_indicator)  1   1184.9 1230.9
- s_start_year               1   1218.4 1264.4

Step:  AIC=1206.64
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

               

<b>Significant variables in the AIC-selected model:</b>

In [76]:
# Coefficients of the AIC-selected Harpur arts model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_h_a_aic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,0.6368,8e-05
s_start_year,-0.48729,0.0


<b>Stepwise BIC selection (both directions, starting from the full model):</b>

In [77]:
# Stepwise search with a BIC penalty: k = log(n) instead of the AIC value 2.
# n comes from the fitted model (nobs), so the penalty matches the
# observations glm() actually used rather than a row count held in a global
# from another cell. stepAIC's trace still labels the criterion "AIC".
log_mod_h_a_bic <- stepAIC(log_mod_h_a, direction = "both",
                           k = log(nobs(log_mod_h_a)))

Start:  AIC=1331.91
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                            Df Deviance    AIC
- factor(Citizenship)       13   1184.6 1259.7
- factor(Gender)             1   1168.2 1325.1
- PC2                        1   1168.2 1325.1
- avs_net_cost               1   1168.2 1325.1
- factor(gap_indicator)      1   1168.9 1325.8
- PC3                        1   1169.3 1326.2
- GPA_trend                  1   1169.8 1326.8
- PC1                        1   1170.2 1327.1
- s_start_age                1   1170.4 1327.3
<none>                           1168.2 1331.9
- factor(funding_indicator)  1   1184.9 1341.8
- s_start_year               1   1218.4 1375.3

Step:  AIC=1259.68
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

               

<b>Significant variables in the BIC-selected model:</b>

In [78]:
# Coefficients of the BIC-selected Harpur arts model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_h_a_bic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,0.61318,0.00011
s_start_year,-0.4961,0.0


# Management Analysis

<b> Logistic regression model:

In [79]:
# Full logistic regression (binomial GLM) of Y fitted to the Management
# subset, followed by its summary.
# NOTE(review): unlike the other school models in this file, this formula has
# no factor(funding_indicator) term -- confirm the omission is intentional.
log_mod_m <- glm(
  Y ~ s_start_age + factor(Gender) + factor(Citizenship) +
    avs_net_cost + PC1 + PC2 + PC3 +
    factor(gap_indicator) + GPA_trend + s_start_year,
  family = "binomial",
  data = Management_df
)
summary(log_mod_m)


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(Citizenship) + 
    avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year, family = "binomial", data = Management_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1948  -0.8325   0.4263   0.8166   1.5902  

Coefficients:
                                        Estimate Std. Error z value Pr(>|z|)
(Intercept)                              1.45588    1.20046   1.213    0.225
s_start_age                              0.38243    0.73882   0.518    0.605
factor(Gender)Male                      -0.85210    0.75490  -1.129    0.259
factor(Citizenship)Asia                  0.29362    1.12511   0.261    0.794
factor(Citizenship)Asian                 1.57017    1.71928   0.913    0.361
factor(Citizenship)China                 0.73942    1.45250   0.509    0.611
factor(Citizenship)Europe               17.04912 3956.18052   0.004    0.997
factor(Citizenship)Hispanic or Latino    

<b> Significant Variables of logistic regression model: 

In [80]:
# Coefficients of the full Management model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_m))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Estimate,Pr(>|z|)


<b>Stepwise AIC selection (both directions, starting from the full model):</b>

In [81]:
# Stepwise search by AIC starting from the full Management model.
# direction = "both" lets previously dropped terms be re-added at later steps.
log_mod_m_aic <- stepAIC(log_mod_m, direction = "both")

Start:  AIC=90.4
Y ~ s_start_age + factor(Gender) + factor(Citizenship) + avs_net_cost + 
    PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + s_start_year

                        Df Deviance    AIC
- factor(Citizenship)    7   58.210 78.210
- s_start_year           1   56.404 88.404
- PC3                    1   56.453 88.453
- s_start_age            1   56.672 88.672
- PC2                    1   56.753 88.753
- factor(Gender)         1   57.696 89.696
- PC1                    1   57.867 89.867
- GPA_trend              1   57.937 89.937
- avs_net_cost           1   58.312 90.312
<none>                       56.399 90.399
- factor(gap_indicator)  1   60.177 92.177

Step:  AIC=78.21
Y ~ s_start_age + factor(Gender) + avs_net_cost + PC1 + PC2 + 
    PC3 + factor(gap_indicator) + GPA_trend + s_start_year

                        Df Deviance    AIC
- s_start_age            1   58.227 76.227
- PC3                    1   58.418 76.418
- s_start_year           1   58.423 76.423
- PC2    

<b>Summary of the AIC-selected model:</b>

In [82]:
# Full coefficient table and fit statistics of the Management model chosen by
# the AIC stepwise search.
summary(log_mod_m_aic)


Call:
glm(formula = Y ~ avs_net_cost + PC1 + factor(gap_indicator) + 
    GPA_trend, family = "binomial", data = Management_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4385  -0.8551   0.6446   0.8944   1.7533  

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)  
(Intercept)               1.388      0.645   2.152   0.0314 *
avs_net_cost              1.703      1.296   1.315   0.1886  
PC1                       2.163      1.202   1.800   0.0719 .
factor(gap_indicator)1   16.918   1692.845   0.010   0.9920  
GPA_trend                 2.840      1.716   1.656   0.0978 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 74.726  on 57  degrees of freedom
Residual deviance: 60.169  on 53  degrees of freedom
AIC: 70.169

Number of Fisher Scoring iterations: 16


<b>Stepwise BIC selection (both directions, starting from the full model):</b>

In [83]:
# Stepwise search with a BIC penalty: k = log(n) instead of the AIC value 2.
# n comes from the fitted model (nobs), so the penalty matches the
# observations glm() actually used rather than a row count held in a global
# from another cell. stepAIC's trace still labels the criterion "AIC".
log_mod_m_bic <- stepAIC(log_mod_m, direction = "both",
                         k = log(nobs(log_mod_m)))

Start:  AIC=125.43
Y ~ s_start_age + factor(Gender) + factor(Citizenship) + avs_net_cost + 
    PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + s_start_year

                        Df Deviance     AIC
- factor(Citizenship)    7   58.210  98.814
- s_start_year           1   56.404 121.371
- PC3                    1   56.453 121.420
- s_start_age            1   56.672 121.640
- PC2                    1   56.753 121.720
- factor(Gender)         1   57.696 122.663
- PC1                    1   57.867 122.834
- GPA_trend              1   57.937 122.904
- avs_net_cost           1   58.312 123.279
- factor(gap_indicator)  1   60.177 125.144
<none>                       56.399 125.427

Step:  AIC=98.81
Y ~ s_start_age + factor(Gender) + avs_net_cost + PC1 + PC2 + 
    PC3 + factor(gap_indicator) + GPA_trend + s_start_year

                        Df Deviance     AIC
- s_start_age            1   58.227  94.771
- PC3                    1   58.418  94.962
- s_start_year           1   58.423

<b>Significant variables and summary of the BIC-selected model:</b>

In [84]:
# Coefficients of the BIC-selected Management model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_m_bic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})
# Full summary of the same model.
summary(log_mod_m_bic)

Unnamed: 0,Estimate,Pr(>|z|)
(Intercept),0.6351,0.03684
PC1,2.37914,0.0367



Call:
glm(formula = Y ~ PC1 + factor(gap_indicator), family = "binomial", 
    data = Management_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6918  -1.1190   0.7180   0.9222   1.9057  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)  
(Intercept)               0.6351     0.3042   2.088   0.0368 *
PC1                       2.3791     1.1389   2.089   0.0367 *
factor(gap_indicator)1   17.0024  1719.3142   0.010   0.9921  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 74.726  on 57  degrees of freedom
Residual deviance: 64.973  on 55  degrees of freedom
AIC: 70.973

Number of Fisher Scoring iterations: 16


# Nursing Analysis

<b> Logistic regression model:

In [85]:
# Full logistic regression (binomial GLM) of Y on all candidate predictors,
# fitted to the Nursing subset, followed by its summary.
log_mod_n <- glm(
  Y ~ s_start_age + factor(Gender) + factor(funding_indicator) +
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 +
    factor(gap_indicator) + GPA_trend + s_start_year,
  family = "binomial",
  data = Nursing_df
)
summary(log_mod_n)


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year, family = "binomial", data = Nursing_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2481  -0.9788   0.3349   0.9481   1.6152  

Coefficients:
                                                       Estimate Std. Error
(Intercept)                                             0.73247    0.50179
s_start_age                                            -0.21048    0.16624
factor(Gender)Male                                      0.26393    0.71830
factor(funding_indicator)1                              0.44967    0.50451
factor(Citizenship)Africa                              16.66184 2229.93005
factor(Citizenship)American Indian or Alaskan Native   17.11091 3956.18041
factor(Citizenship)Asia                                 0.17601    1.35169
factor(Citizenship)Asian        

<b> Significant Variables of logistic regression model: 

In [86]:
# Coefficients of the full Nursing model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_n))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
PC3,-1.37417,0.04906
s_start_year,-0.80303,0.01041


<b>Stepwise AIC selection (both directions, starting from the full model):</b>

In [87]:
# Stepwise search by AIC starting from the full Nursing model.
# direction = "both" lets previously dropped terms be re-added at later steps.
log_mod_n_aic <- stepAIC(log_mod_n, direction = "both")

Start:  AIC=160.73
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                            Df Deviance    AIC
- avs_net_cost               1   122.76 158.76
- GPA_trend                  1   122.81 158.81
- factor(Gender)             1   122.86 158.86
- factor(gap_indicator)      1   123.03 159.03
- factor(Citizenship)        8   137.48 159.48
- factor(funding_indicator)  1   123.53 159.53
- PC2                        1   124.28 160.28
- s_start_age                1   124.37 160.37
<none>                           122.72 160.72
- PC1                        1   126.04 162.04
- PC3                        1   126.66 162.66
- s_start_year               1   131.98 167.98

Step:  AIC=158.76
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

          

<b>Significant variables in the AIC-selected model:</b>

In [88]:
# Coefficients of the AIC-selected Nursing model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_n_aic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
(Intercept),1.03837,0.00821
PC3,-1.52811,0.01594
s_start_year,-0.79138,0.0093


<b>Stepwise BIC selection (both directions, starting from the full model):</b>

In [89]:
# Stepwise search with a BIC penalty: k = log(n) instead of the AIC value 2.
# n comes from the fitted model (nobs), so the penalty matches the
# observations glm() actually used rather than a row count held in a global
# from another cell. stepAIC's trace still labels the criterion "AIC".
log_mod_n_bic <- stepAIC(log_mod_n, direction = "both",
                         k = log(nobs(log_mod_n)))

Start:  AIC=213.21
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                            Df Deviance    AIC
- factor(Citizenship)        8   137.48 189.86
- avs_net_cost               1   122.76 208.48
- GPA_trend                  1   122.81 208.53
- factor(Gender)             1   122.86 208.58
- factor(gap_indicator)      1   123.03 208.75
- factor(funding_indicator)  1   123.53 209.25
- PC2                        1   124.28 210.00
- s_start_age                1   124.37 210.09
- PC1                        1   126.04 211.76
- PC3                        1   126.66 212.38
<none>                           122.72 213.21
- s_start_year               1   131.98 217.70

Step:  AIC=189.86
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                 

<b>Significant variables in the BIC-selected model:</b>

In [90]:
# Coefficients of the BIC-selected Nursing model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_n_bic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
(Intercept),0.73862,0.00187
s_start_year,-0.62592,0.013


# Watson Analysis 

<b> Logistic regression model:

In [91]:
# Full logistic regression (binomial GLM) of Y on all candidate predictors,
# fitted to the Watson subset, followed by its summary.
log_mod_w <- glm(
  Y ~ s_start_age + factor(Gender) + factor(funding_indicator) +
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 +
    factor(gap_indicator) + GPA_trend + s_start_year,
  family = "binomial",
  data = Watson_df
)
summary(log_mod_w)


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year, family = "binomial", data = Watson_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8514  -0.8201   0.5055   0.7883   1.8457  

Coefficients:
                                               Estimate Std. Error z value
(Intercept)                                    -1.02795    0.39812  -2.582
s_start_age                                    -0.06174    0.14322  -0.431
factor(Gender)Male                              0.33990    0.27624   1.230
factor(funding_indicator)1                      1.94403    0.30304   6.415
factor(Citizenship)Africa                      15.17541  534.23722   0.028
factor(Citizenship)Asia                         0.73593    0.37265   1.975
factor(Citizenship)Asian                       -0.06677    0.51754  -0.129
factor(Citizenship)Black or Afric

<b> Significant Variables of logistic regression model: 

In [92]:
# Coefficients of the full Watson model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_w))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
(Intercept),-1.02795,0.00982
factor(funding_indicator)1,1.94403,0.0
factor(Citizenship)Asia,0.73593,0.04828
avs_net_cost,0.36962,0.01367
PC3,-0.42057,0.01329
s_start_year,-0.99233,0.0


<b>Stepwise AIC selection (both directions, starting from the full model):</b>

In [93]:
# Stepwise search by AIC starting from the full Watson model.
# direction = "both" lets previously dropped terms be re-added at later steps.
log_mod_w_aic <- stepAIC(log_mod_w, direction = "both")

Start:  AIC=562.71
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                            Df Deviance    AIC
- factor(Citizenship)       11   535.84 557.84
- s_start_age                1   518.90 560.90
- factor(gap_indicator)      1   519.68 561.68
- PC2                        1   520.03 562.03
- factor(Gender)             1   520.21 562.21
<none>                           518.71 562.71
- GPA_trend                  1   521.03 563.03
- PC1                        1   521.49 563.49
- PC3                        1   525.95 567.95
- avs_net_cost               1   526.85 568.85
- s_start_year               1   561.04 603.04
- factor(funding_indicator)  1   564.43 606.43

Step:  AIC=557.84
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                 

<b>Significant variables in the AIC-selected model:</b>

In [94]:
# Coefficients of the AIC-selected Watson model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_w_aic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
(Intercept),-0.54971,0.0076
factor(funding_indicator)1,2.0137,0.0
avs_net_cost,0.3613,0.0175
PC1,-0.78742,0.01415
PC3,-0.45321,0.00415
s_start_year,-0.99717,0.0


<b>Stepwise BIC selection (both directions, starting from the full model):</b>

In [95]:
# Stepwise search with a BIC penalty: k = log(n) instead of the AIC value 2.
# n comes from the fitted model (nobs), so the penalty matches the
# observations glm() actually used rather than a row count held in a global
# from another cell. stepAIC's trace still labels the criterion "AIC".
log_mod_w_bic <- stepAIC(log_mod_w, direction = "both",
                         k = log(nobs(log_mod_w)))

Start:  AIC=656.09
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                            Df Deviance    AIC
- factor(Citizenship)       11   535.84 604.53
- s_start_age                1   518.90 650.03
- factor(gap_indicator)      1   519.68 650.81
- PC2                        1   520.03 651.15
- factor(Gender)             1   520.21 651.34
- GPA_trend                  1   521.03 652.15
- PC1                        1   521.49 652.61
<none>                           518.71 656.09
- PC3                        1   525.95 657.08
- avs_net_cost               1   526.85 657.97
- s_start_year               1   561.04 692.17
- factor(funding_indicator)  1   564.43 695.56

Step:  AIC=604.53
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    avs_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year

                 

<b>Summary and significant variables of the BIC-selected model:</b>

In [96]:
# Full coefficient table and fit statistics of the Watson model chosen by the
# BIC stepwise search.
summary(log_mod_w_bic)


Call:
glm(formula = Y ~ factor(funding_indicator) + avs_net_cost + 
    s_start_year, family = "binomial", data = Watson_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8893  -0.9101   0.5567   0.8484   1.7719  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -0.5034     0.2001  -2.516   0.0119 *  
factor(funding_indicator)1   2.0988     0.2644   7.939 2.04e-15 ***
avs_net_cost                 0.3515     0.1548   2.270   0.0232 *  
s_start_year                -1.0004     0.1437  -6.963 3.34e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 656.07  on 514  degrees of freedom
Residual deviance: 551.63  on 511  degrees of freedom
AIC: 559.63

Number of Fisher Scoring iterations: 4


In [97]:
# Coefficients of the BIC-selected Watson model with p-value <= 0.05:
# estimate (column 1) and p-value (column 4), rounded to 5 decimals.
# The summary is computed once; drop = FALSE keeps the result a matrix, so the
# term name is not lost when exactly one coefficient qualifies.
local({
  coef_table <- coef(summary(log_mod_w_bic))
  significant <- which(coef_table[, 4] <= 0.05)
  round(coef_table[significant, c(1, 4), drop = FALSE], 5)
})

Unnamed: 0,Estimate,Pr(>|z|)
(Intercept),-0.50338,0.01188
factor(funding_indicator)1,2.09885,0.0
avs_net_cost,0.35147,0.0232
s_start_year,-1.0004,0.0


Joint Analysis