# Load Data and Final Clean

In [3]:
df <- read.csv(file="data_020320_cleaned.csv",sep=",",header=TRUE)
df = df[,c(-1,-2)]
colnames(df)
dim(df)

Scale and center some variables:

In [4]:
mean_age = mean(df$Start_age)
sd_age = sd(df$Start_age)
df$s_start_age = (df$Start_age - mean_age)/sd_age
mean_net = mean(df$net_cost)
sd_net = sd(df$net_cost)
df$s_net_cost = (df$net_cost - mean_net)/sd_net
mean_year = mean(df$Start_year)
sd_year = sd(df$Start_year)
df$s_start_year = (df$Start_year - mean_year)/sd_year

## Seperate by School

Different Schools

In [5]:
schools = levels(factor(df$College))

In [6]:
Harpur_df = df[df$College %in% schools[1:4],]
Management_df = df[df$College=="GD Management",]
Nursing_df = df[df$College=="GD Nursing",]
Watson_df = df[df$College=="GD Watson",]

In [7]:
unique(Harpur_df$Major)

In [8]:
harpur_sciences <- c("Chemistry", "Behavioral Neuroscience","Cognitive Psychology","Physics","Mathematics",
                     "Biological Sciences","Materials Science and Engineer","Material Sci and Eng-Engineerg",
                     "Clinical Psychology","Economics","Geology","Political Science","Materials Science and Engr",
                     "Biological Sciences EEB", "X Geology","Mathematics/Economics")
harpur_arts <- c("Comparative Literature","Translation Studies","Educational Theory and Practic",
                 "Philosophy Interpretation and", "English","History","Anthropology","Sociology","Philosophy SPEL",
                 "Community and Public Affairs","Art History","Art History Combined MA PhD")

In [9]:
Harpur_sciences = Harpur_df[Harpur_df$Major %in% harpur_sciences,]
Harpur_arts = Harpur_df[Harpur_df$Major %in% harpur_arts,]

In [10]:
dim(Harpur_sciences)[1]
dim(Harpur_arts)[1]
dim(Harpur_df)[1]

In [11]:
a_n = dim(Harpur_sciences)[1]
a_1 = sum(Harpur_sciences$Y)
a_0 = a_n - a_1
b_n = dim(Harpur_arts)[1]
b_1 = sum(Harpur_arts$Y)
b_0 = b_n - b_1
c_n = dim(Management_df)[1]
c_1 = sum(Management_df$Y)
c_0 = c_n - c_1
d_n = dim(Nursing_df)[1]
d_1 = sum(Nursing_df$Y)
d_0 = d_n - d_1
e_n = dim(Watson_df)[1]
e_1 = sum(Watson_df$Y)
e_0 = e_n - e_1
School = c("Harpur sciences","Harpur_arts","SOM","Nursing","Watson")
sch_total = c(a_n,b_n,c_n,d_n,e_n)
succ_total = c(a_1,b_1,c_1,d_1,e_1)
unsucc_total = c(a_0,b_0,c_0,d_0,e_0)
data.frame("School"=School,"Total"=sch_total,"Successful Graduates"=succ_total,"Unsuccessful Graduates"=unsucc_total)

School,Total,Successful.Graduates,Unsuccessful.Graduates
Harpur sciences,861,542,319
Harpur_arts,918,534,384
SOM,58,38,20
Nursing,117,72,45
Watson,515,343,172


# Harpur Sciences Analysis

Won't use Degree column since it does not accurately show the student's major

<b> Logistic regression model:

In [10]:
log_mod_h_s = glm(Y~s_start_age+factor(Gender)+factor(funding_indicator)+factor(Citizenship)+
                s_net_cost+PC1+PC2+PC3+factor(gap_indicator)+GPA_trend+s_start_year-1,
                data=Harpur_sciences,family="binomial")
summary(log_mod_h_s)


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1, family = "binomial", data = Harpur_sciences)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.1582  -0.9314   0.5094   0.8598   1.9400  

Coefficients:
                                                      Estimate Std. Error
s_start_age                                            0.13758    0.14937
factor(Gender)Female                                   0.51166    0.52948
factor(Gender)Male                                     0.52161    0.51853
factor(funding_indicator)1                             1.29069    0.26219
factor(Citizenship)American Indian or Alaskan Native -14.79014  482.99092
factor(Citizenship)Asia                               -0.83427    0.55907
factor(Citizenship)Asian                              -1.36366    0.61937
factor(Citizenship)Black or Afric

<b> Significant Variables of logistic regression model: 

In [11]:
round(summary(log_mod_h_s)$coeff[which(summary(log_mod_h_s)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,1.29069,0.0
factor(Citizenship)Asian,-1.36366,0.02769
factor(Citizenship)Hispanic or Latino,-1.80981,0.00302
factor(Citizenship)White,-0.97666,0.04457
s_net_cost,0.29634,0.00333
factor(gap_indicator)1,1.22096,0.00477
GPA_trend,2.45795,0.00614
s_start_year,-1.17468,0.0


Things we need
-  Run AIC/BIC stepwise model selection for Harpur and summarize results
-  Repeat for the other schools

<b> Backward AIC:

In [12]:
library('MASS')
log_mod_h_s_aic = stepAIC(log_mod_h_s, direction = 'backward')

Start:  AIC=979.74
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1

                            Df Deviance     AIC
- factor(Gender)             2   934.80  976.80
- PC3                        1   933.74  977.74
- PC1                        1   934.06  978.06
- PC2                        1   934.46  978.46
- s_start_age                1   934.60  978.60
- factor(Citizenship)       12   957.00  979.00
<none>                           933.74  979.74
- GPA_trend                  1   941.56  985.56
- s_net_cost                 1   943.10  987.10
- factor(gap_indicator)      1   943.67  987.67
- factor(funding_indicator)  1   959.22 1003.22
- s_start_year               1   969.05 1013.05

Step:  AIC=977.74
Y ~ s_start_age + factor(funding_indicator) + factor(Citizenship) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year 

<b> Significant variables backward AIC selects:

In [13]:
round(summary(log_mod_h_s_aic)$coeff[which(summary(log_mod_h_s_aic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,0.91722,0.0
s_net_cost,0.28526,0.00397
factor(gap_indicator)1,1.1598,0.00694
GPA_trend,2.31273,0.00687
s_start_year,-1.0687,0.0


<b> Backward BIC

In [14]:
log_mod_h_s_bic = stepAIC(log_mod_h_s, direction = "backward", k = log(a_n))

Start:  AIC=1089.18
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1

                            Df Deviance    AIC
- factor(Citizenship)       12   957.00 1031.3
- factor(Gender)             2   934.80 1076.7
- PC3                        1   933.74 1082.4
- PC1                        1   934.06 1082.7
- PC2                        1   934.46 1083.1
- s_start_age                1   934.60 1083.3
<none>                           933.74 1089.2
- GPA_trend                  1   941.56 1090.2
- s_net_cost                 1   943.10 1091.8
- factor(gap_indicator)      1   943.67 1092.3
- factor(funding_indicator)  1   959.22 1107.9
- s_start_year               1   969.05 1117.7

Step:  AIC=1031.34
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1

           

<b> Significant variables backward BIC selects:

In [15]:
round(summary(log_mod_h_s_bic)$coeff[which(summary(log_mod_h_s_bic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,0.8862,0.0
s_net_cost,0.29061,0.00351
factor(gap_indicator)1,1.15369,0.00704
GPA_trend,2.21363,0.00879
s_start_year,-1.25942,0.0


# Harpur Arts Analysis

<b> Logistic regression model:

In [16]:
log_mod_h_a = glm(Y~s_start_age+factor(Gender)+factor(funding_indicator)+factor(Citizenship)+
                s_net_cost+PC1+PC2+PC3+factor(gap_indicator)+GPA_trend+s_start_year-1,
                data=Harpur_arts,family="binomial")
summary(log_mod_h_a)


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1, family = "binomial", data = Harpur_arts)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9287  -1.1447   0.7362   0.9895   2.0591  

Coefficients:
                                                       Estimate Std. Error
s_start_age                                             0.03273    0.07210
factor(Gender)Female                                   -0.94238    0.55155
factor(Gender)Male                                     -0.91838    0.54670
factor(funding_indicator)1                              0.76044    0.17817
factor(Citizenship)American Indian or Alaskan Native    0.40604    1.02417
factor(Citizenship)Asia                                 1.25696    0.56498
factor(Citizenship)Asian                                0.13206    0.64749
factor(Citizenship)Black or A

<b> Significant Variables of logistic regression model: 

In [17]:
round(summary(log_mod_h_a)$coeff[which(summary(log_mod_h_a)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,0.76044,2e-05
factor(Citizenship)Asia,1.25696,0.0261
PC1,-0.08566,0.04396
PC2,-0.29867,0.0
s_start_year,-0.24984,0.00849


<b> Backward AIC:

In [18]:
log_mod_h_a_aic = stepAIC(log_mod_h_a, direction = 'backward')

Start:  AIC=1188.13
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1

                            Df Deviance    AIC
- factor(Citizenship)       13   1157.0 1179.0
- s_start_age                1   1140.3 1186.3
- factor(Gender)             2   1143.0 1187.0
- s_net_cost                 1   1141.2 1187.2
<none>                           1140.1 1188.1
- factor(gap_indicator)      1   1142.6 1188.6
- GPA_trend                  1   1143.0 1189.0
- PC3                        1   1143.2 1189.2
- PC1                        1   1144.3 1190.3
- s_start_year               1   1147.4 1193.4
- factor(funding_indicator)  1   1159.0 1205.0
- PC2                        1   1163.8 1209.8

Step:  AIC=1178.96
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1

           

<b> Significant variables backward AIC selects:

In [19]:
round(summary(log_mod_h_a_aic)$coeff[which(summary(log_mod_h_a_aic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(Gender)Female,-0.28303,0.04563
factor(funding_indicator)1,0.75368,1e-05
PC2,-0.30383,0.0
s_start_year,-0.17974,0.03772


<b> Backward BIC

In [20]:
log_mod_h_a_bic = stepAIC(log_mod_h_a, direction = "backward", k = log(b_n))

Start:  AIC=1303.86
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1

                            Df Deviance    AIC
- factor(Citizenship)       13   1157.0 1232.0
- factor(Gender)             2   1143.0 1293.1
- s_start_age                1   1140.3 1297.2
- s_net_cost                 1   1141.2 1298.1
- factor(gap_indicator)      1   1142.6 1299.5
- GPA_trend                  1   1143.0 1299.9
- PC3                        1   1143.2 1300.1
- PC1                        1   1144.3 1301.2
<none>                           1140.1 1303.9
- s_start_year               1   1147.4 1304.3
- factor(funding_indicator)  1   1159.0 1315.9
- PC2                        1   1163.8 1320.8

Step:  AIC=1232
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1

              

<b> Significant variables backward BIC selects:

In [21]:
round(summary(log_mod_h_a_bic)$coeff[which(summary(log_mod_h_a_bic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,0.53811,0.0
PC2,-0.27725,0.0
s_start_year,-0.25793,0.00175


# Management Analysis

<b> Logistic regression model:

In [22]:
log_mod_m = glm(Y~s_start_age+factor(Gender)+factor(funding_indicator)+factor(Citizenship)+
                s_net_cost+PC1+PC2+PC3+factor(gap_indicator)+GPA_trend+s_start_year-1,
                data=Management_df,family="binomial")
summary(log_mod_m)

"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1, family = "binomial", data = Management_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.97258  -0.13975   0.00016   0.21655   1.73673  

Coefficients:
                                        Estimate Std. Error z value Pr(>|z|)  
s_start_age                           -2.200e+00  1.290e+00  -1.705   0.0881 .
factor(Gender)Female                   2.892e+01  2.888e+03   0.010   0.9920  
factor(Gender)Male                     2.633e+01  2.888e+03   0.009   0.9927  
factor(funding_indicator)1            -2.848e+01  2.888e+03  -0.010   0.9921  
factor(Citizenship)Asian               5.399e+00  3.326e+00   1.623   0.1046  
factor(Citizenship)China               2.143e+00  2.804e+00   0.764   0.4447  
factor(Citizenship)Europe              1.073e+01  1.075e+04   0.

<b> Significant Variables of logistic regression model: 

In [23]:
round(summary(log_mod_m)$coeff[which(summary(log_mod_m)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
s_net_cost,8.76286,0.01023
GPA_trend,50.26204,0.04094


<b> Backward AIC:

In [24]:
log_mod_m_aic= stepAIC(log_mod_m, direction = 'backward')

Start:  AIC=65.07
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
- factor(Citizenship)        7   35.391 57.391
- s_start_year               1   29.081 63.081
- PC2                        1   30.623 64.623
- factor(gap_indicator)      1   30.734 64.734
<none>                           29.074 65.074
- PC1                        1   31.713 65.713
- s_start_age                1   32.846 66.846
- PC3                        1   33.259 67.259
- factor(Gender)             2   38.330 70.330
- GPA_trend                  1   40.427 74.427
- factor(funding_indicator)  1   40.837 74.837
- s_net_cost                 1   46.369 80.369


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Step:  AIC=57.39
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
- s_start_year               1   35.397 55.397
- factor(gap_indicator)      1   36.395 56.395
- PC2                        1   37.080 57.080
- PC3                        1   37.109 57.109
<none>                           35.391 57.391
- PC1                        1   37.993 57.993
- s_start_age                1   42.009 62.009
- factor(funding_indicator)  1   46.097 66.097
- factor(Gender)             2   48.703 66.703
- GPA_trend                  1   49.147 69.147
- s_net_cost                 1   52.687 72.687


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Step:  AIC=55.4
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend - 
    1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
- factor(gap_indicator)      1   36.433 54.433
<none>                           35.397 55.397
- PC2                        1   37.556 55.556
- PC3                        1   38.115 56.115
- PC1                        1   38.140 56.140
- s_start_age                1   42.118 60.118
- factor(funding_indicator)  1   46.121 64.121
- factor(Gender)             2   48.722 64.722
- GPA_trend                  1   49.330 67.330
- s_net_cost                 1   52.864 70.864


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Step:  AIC=54.43
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + GPA_trend - 1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
<none>                           36.433 54.433
- PC3                        1   38.833 54.833
- PC2                        1   39.013 55.013
- PC1                        1   40.989 56.989
- s_start_age                1   42.959 58.959
- factor(funding_indicator)  1   47.193 63.193
- factor(Gender)             2   49.426 63.426
- GPA_trend                  1   50.488 66.488
- s_net_cost                 1   55.147 71.147


<b> Significant variables backward AIC selects:

In [25]:
round(summary(log_mod_m_aic)$coeff[which(summary(log_mod_m_aic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
s_start_age,-2.37977,0.02596
s_net_cost,6.56536,0.00288
GPA_trend,50.00312,0.01562


<b> Backward BIC:

In [26]:
log_mod_m_bic = stepAIC(log_mod_m, direction = "backward", k = log(c_n))

Start:  AIC=102.16
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance     AIC
- factor(Citizenship)        7   35.391  80.056
- s_start_year               1   29.081  98.109
- PC2                        1   30.623  99.650
- factor(gap_indicator)      1   30.734  99.761
- PC1                        1   31.713 100.741
- s_start_age                1   32.846 101.874
<none>                           29.074 102.162
- PC3                        1   33.259 102.287
- factor(Gender)             2   38.330 103.297
- GPA_trend                  1   40.427 109.454
- factor(funding_indicator)  1   40.837 109.864
- s_net_cost                 1   46.369 115.397


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Step:  AIC=80.06
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
- s_start_year               1   35.397 76.001
- factor(gap_indicator)      1   36.395 76.999
- PC2                        1   37.080 77.685
- PC3                        1   37.109 77.713
- PC1                        1   37.993 78.597
<none>                           35.391 80.056
- s_start_age                1   42.009 82.613
- factor(Gender)             2   48.703 85.247
- factor(funding_indicator)  1   46.097 86.701
- GPA_trend                  1   49.147 89.751
- s_net_cost                 1   52.687 93.291


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Step:  AIC=76
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend - 
    1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
- factor(gap_indicator)      1   36.433 72.977
- PC2                        1   37.556 74.100
- PC3                        1   38.115 74.659
- PC1                        1   38.140 74.684
<none>                           35.397 76.001
- s_start_age                1   42.118 78.662
- factor(Gender)             2   48.722 81.206
- factor(funding_indicator)  1   46.121 82.665
- GPA_trend                  1   49.330 85.874
- s_net_cost                 1   52.864 89.408


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Step:  AIC=72.98
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + GPA_trend - 1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
- PC3                        1   38.833 71.316
- PC2                        1   39.013 71.496
<none>                           36.433 72.977
- PC1                        1   40.989 73.472
- s_start_age                1   42.959 75.442
- factor(Gender)             2   49.426 77.849
- factor(funding_indicator)  1   47.193 79.677
- GPA_trend                  1   50.488 82.972
- s_net_cost                 1   55.147 87.631


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Step:  AIC=71.32
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + GPA_trend - 1

                            Df Deviance    AIC
<none>                           38.833 71.316
- s_start_age                1   43.417 71.840
- PC1                        1   43.636 72.059
- PC2                        1   45.226 73.649
- factor(Gender)             2   49.606 73.968
- factor(funding_indicator)  1   47.351 75.774
- GPA_trend                  1   51.659 80.082
- s_net_cost                 1   55.159 83.582


<b> Significant variables backward BIC selects:

In [27]:
round(summary(log_mod_m_bic)$coeff[which(summary(log_mod_m_bic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
s_start_age,-1.77264,0.0496
s_net_cost,5.71519,0.00407
GPA_trend,48.683,0.0228


# Nursing Analysis

<b> Logistic regression model:

In [28]:
log_mod_n = glm(Y~s_start_age+factor(Gender)+factor(funding_indicator)+factor(Citizenship)+
                s_net_cost+PC1+PC2+PC3+factor(gap_indicator)+GPA_trend+s_start_year-1,
                data=Nursing_df,family="binomial")
summary(log_mod_n)

"glm.fit: fitted probabilities numerically 0 or 1 occurred"


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1, family = "binomial", data = Nursing_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-3.04780  -0.85622   0.00024   0.83374   1.92061  

Coefficients:
                                                       Estimate Std. Error
s_start_age                                            -0.09046    0.16905
factor(Gender)Female                                   26.47385 3009.32343
factor(Gender)Male                                     26.17706 3009.32353
factor(funding_indicator)1                              0.77107    0.53549
factor(Citizenship)American Indian or Alaskan Native    2.19689 7147.58757
factor(Citizenship)Asia                               -18.59713 2922.87456
factor(Citizenship)Asian                              -18.29873 2922.87423
factor(Citizenship)B

<b> Significant Variables of logistic regression model: 

In [29]:
round(summary(log_mod_n)$coeff[which(summary(log_mod_n)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
s_net_cost,1.14042,0.00497
s_start_year,-0.90674,0.03001


<b> Backward AIC:

In [30]:
log_mod_n_aic= stepAIC(log_mod_n, direction = 'backward')

Start:  AIC=145.9
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
- factor(Citizenship)        8   120.73 142.73
- PC1                        1   107.90 143.90
- s_start_age                1   108.19 144.19
<none>                           107.90 145.90
- factor(funding_indicator)  1   110.04 146.04
- PC3                        1   110.59 146.59
- GPA_trend                  1   110.62 146.62
- factor(gap_indicator)      1   110.99 146.99
- s_start_year               1   113.65 149.65
- factor(Gender)             2   117.12 151.12
- PC2                        1   117.22 153.22
- s_net_cost                 1   118.39 154.39

Step:  AIC=142.73
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1

                            Df Deviance    AIC
- PC1                        1   120.73 140.73
- s_start_age                1   120.74 140.74
- GPA_trend                  1   121.80 141.80
- factor(gap_indicator) 

<b> Significant variables backward AIC selects:

In [31]:
round(summary(log_mod_m_aic)$coeff[which(summary(log_mod_m_aic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
s_start_age,-2.37977,0.02596
s_net_cost,6.56536,0.00288
GPA_trend,50.00312,0.01562


<b> Backward BIC:

In [32]:
log_mod_n_bic = stepAIC(log_mod_n, direction = "backward", k = log(d_n))

Start:  AIC=198.38
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1



"glm.fit: fitted probabilities numerically 0 or 1 occurred"

                            Df Deviance    AIC
- factor(Citizenship)        8   120.73 173.12
- PC1                        1   107.90 193.62
- s_start_age                1   108.19 193.91
- factor(funding_indicator)  1   110.04 195.76
- PC3                        1   110.59 196.31
- GPA_trend                  1   110.62 196.34
- factor(gap_indicator)      1   110.99 196.71
- factor(Gender)             2   117.12 198.08
<none>                           107.90 198.38
- s_start_year               1   113.65 199.37
- PC2                        1   117.22 202.94
- s_net_cost                 1   118.39 204.11

Step:  AIC=173.12
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1

                            Df Deviance    AIC
- PC1                        1   120.73 168.35
- s_start_age                1   120.74 168.36
- GPA_trend                  1   121.80 169.42
- factor(gap_indicator) 

<b> Significant variables backward BIC selects:

In [33]:
round(summary(log_mod_m_bic)$coeff[which(summary(log_mod_m_bic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
s_start_age,-1.77264,0.0496
s_net_cost,5.71519,0.00407
GPA_trend,48.683,0.0228


# Watson Analysis 

<b> Logistic regression model:

In [34]:
log_mod_w = glm(Y~s_start_age+factor(Gender)+factor(funding_indicator)+factor(Citizenship)+
                s_net_cost+PC1+PC2+PC3+factor(gap_indicator)+GPA_trend+s_start_year-1,
                data=Watson_df,family="binomial")
summary(log_mod_w)


Call:
glm(formula = Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1, family = "binomial", data = Watson_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7230  -0.7681   0.4800   0.7813   1.8460  

Coefficients:
                                               Estimate Std. Error z value
s_start_age                                    -0.02984    0.14456  -0.206
factor(Gender)Female                           14.36937  534.16922   0.027
factor(Gender)Male                             14.66739  534.16915   0.027
factor(funding_indicator)1                      1.98035    0.30776   6.435
factor(Citizenship)Asia                       -14.37935  534.16919  -0.027
factor(Citizenship)Asian                      -15.20281  534.16937  -0.028
factor(Citizenship)Black or African American  -15.42895  534.16957  -0.029
factor(Citizenship)China       

<b> Significant Variables of logistic regression model: 

In [35]:
round(summary(log_mod_w)$coeff[which(summary(log_mod_w)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,1.98035,0.0
s_net_cost,0.89759,9e-05
PC1,-0.15045,0.02514
factor(gap_indicator)1,0.81141,0.03466
s_start_year,-0.81771,7e-05


<b> Backward AIC:

In [36]:
log_mod_w_aic= stepAIC(log_mod_w, direction = 'backward')

Start:  AIC=554.22
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1

                            Df Deviance    AIC
- factor(Citizenship)       11   529.29 551.29
- PC2                        1   510.23 552.23
- s_start_age                1   510.26 552.26
- PC3                        1   510.78 552.78
- GPA_trend                  1   511.57 553.57
<none>                           510.22 554.22
- factor(Gender)             2   515.08 555.08
- factor(gap_indicator)      1   515.05 557.05
- PC1                        1   515.33 557.33
- s_net_cost                 1   528.20 570.20
- s_start_year               1   528.53 570.53
- factor(funding_indicator)  1   557.05 599.05

Step:  AIC=551.29
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1

             

<b> Significant variables backward AIC selects:

In [37]:
round(summary(log_mod_w_aic)$coeff[which(summary(log_mod_w_aic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,1.74553,0.0
s_net_cost,0.94484,3e-05
PC1,-0.12519,0.04771
s_start_year,-0.76643,0.0


<b> Backward BIC:

In [38]:
log_mod_w_bic = stepAIC(log_mod_w, direction = "backward", k = log(e_n))

Start:  AIC=647.59
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    factor(Citizenship) + s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + 
    GPA_trend + s_start_year - 1

                            Df Deviance    AIC
- factor(Citizenship)       11   529.29 597.97
- factor(Gender)             2   515.08 639.96
- PC2                        1   510.23 641.36
- s_start_age                1   510.26 641.39
- PC3                        1   510.78 641.90
- GPA_trend                  1   511.57 642.70
- factor(gap_indicator)      1   515.05 646.18
- PC1                        1   515.33 646.45
<none>                           510.22 647.59
- s_net_cost                 1   528.20 659.32
- s_start_year               1   528.53 659.66
- factor(funding_indicator)  1   557.05 688.18

Step:  AIC=597.97
Y ~ s_start_age + factor(Gender) + factor(funding_indicator) + 
    s_net_cost + PC1 + PC2 + PC3 + factor(gap_indicator) + GPA_trend + 
    s_start_year - 1

             

<b> Significant variables backward BIC selects:

In [39]:
round(summary(log_mod_w_bic)$coeff[which(summary(log_mod_w_bic)$coef[,4] <= .05),c(1,4)],5)

Unnamed: 0,Estimate,Pr(>|z|)
factor(funding_indicator)1,1.83449,0.0
s_net_cost,0.9213,5e-05
s_start_year,-0.93994,0.0
