In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
# CSV inputs produced by earlier steps; all of them live in the same folder.
output_dir = './output_files'
naba_filename = f'{output_dir}/naba_data_removed_duplicates.csv'
reg_filename = f'{output_dir}/reg_inputs_df.csv'
reg_sig_coef_filename = f'{output_dir}/significant_var_coeffs.csv'
topics_filename = f'{output_dir}/norm_topic_count_df.csv'

# Significance threshold: coefficients with a p-value below this are reported.
p_alpha = 0.1

## Read in the inputs file from the regression analysis
We only use the columns that showed statistical significance in our original logit regression analysis.

In [3]:
# Full table of regression inputs; it is narrowed to the significant columns below.
reg_df = pd.read_csv(filepath_or_buffer=reg_filename)
reg_df

Unnamed: 0,is_male,clasf_freshman,clasf_graduate student,clasf_junior,clasf_other,clasf_senior,clasf_sophomore,is_accounting_major,is_business_major,is_finance_major,...,is_internship_interested,is_internship_applied,is_job_accept,is_internship_accept,did_cpa_review,major_gpa,overall_gpa,len_of_extra_curr_entry,len_of_honors_entry,len_of_para_entry
0,1,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,-0.171818,-0.286653,-0.712752,-0.494198,0.835078
1,0,1,0,0,0,0,0,0,1,0,...,1,1,0,1,0,0.764447,-0.762381,1.883098,0.963549,-1.919349
2,0,1,0,0,0,0,0,1,0,0,...,1,1,0,0,0,-0.055400,-0.139157,-0.809452,-0.457995,-1.323172
3,0,0,0,0,0,0,1,1,0,0,...,0,1,0,1,0,-0.383338,0.068585,-0.727794,-0.433861,0.462275
4,1,0,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0.764447,0.629487,0.191927,-0.120107,0.976805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,0,1,0,0,0,0,0,1,0,...,0,1,0,1,1,0.108570,-0.049828,-0.207765,0.241916,0.941373
196,0,0,0,1,0,0,0,0,1,0,...,1,1,0,1,0,-0.039003,-0.243028,-0.278678,0.932173,0.184984
197,0,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0.124967,0.297101,-0.581670,-0.629353,0.514653
198,0,0,0,1,0,0,0,0,0,1,...,1,1,0,1,0,-0.334147,0.006262,2.486934,3.869388,0.679487


In [4]:
# The 'index' column of the saved coefficient table lists the names of the
# variables that were significant in the earlier regression.
sig_coef_table = pd.read_csv(reg_sig_coef_filename)
reg_sig_coef = sig_coef_table['index']
reg_sig_coef

0    is_accounting_major
1       is_finance_major
2    is_business_major_2
3            is_cur_lead
4            overall_gpa
Name: index, dtype: object

In [5]:
# Keep only the columns named in reg_sig_coef (all rows, same column order).
reg_df = reg_df.loc[:, list(reg_sig_coef)]
reg_df

Unnamed: 0,is_accounting_major,is_finance_major,is_business_major_2,is_cur_lead,overall_gpa
0,1,0,1,1,-0.286653
1,0,0,0,1,-0.762381
2,1,0,0,0,-0.139157
3,1,0,0,0,0.068585
4,1,0,0,0,0.629487
...,...,...,...,...,...
195,0,0,0,1,-0.049828
196,0,0,0,1,-0.243028
197,1,0,1,1,0.297101
198,0,1,0,0,0.006262


## Read in the topics table
Turn each topic column into a binary variable (1 if the topic was included; 0 if it was not).

In [6]:
# One column per topic; the values are binarised in the next cell.
topics_df = pd.read_csv(filepath_or_buffer=topics_filename)
topics_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_64,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73
0,0.263158,0.052632,0.000000,0.052632,0.052632,0.000000,0.052632,0.000000,0.052632,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.052632
1,0.416667,0.000000,0.000000,0.083333,0.083333,0.083333,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
2,0.363636,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
3,0.055556,0.000000,0.000000,0.000000,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.055556,0.000000,0.0,0.055556,0.000000
4,0.333333,0.166667,0.000000,0.066667,0.000000,0.033333,0.000000,0.033333,0.033333,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.033333,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.043478,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.043478,0.0,0.0,0.000000,0.0,0.000000,0.086957,0.0,0.000000,0.000000
196,0.181818,0.045455,0.000000,0.000000,0.000000,0.000000,0.045455,0.000000,0.045455,0.045455,...,0.000000,0.0,0.0,0.045455,0.0,0.000000,0.000000,0.0,0.000000,0.000000
197,0.173913,0.043478,0.000000,0.043478,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.043478,0.000000,0.0,0.000000,0.000000
198,0.321429,0.035714,0.071429,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000


In [7]:
# 1 where a topic has any positive weight for the row, 0 otherwise.
has_topics_df = topics_df.gt(0).astype(int)
has_topics_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_64,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73
0,1,1,0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,1,1,0,1,0,1,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
196,1,1,0,0,0,0,1,0,1,1,...,0,0,0,1,0,0,0,0,0,0
197,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
198,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Load target column

In [8]:
# Name of the column in the NABA data used as the model's dependent variable.
target_col = 'Recipient'

In [9]:
# Load the NABA data and keep just the target column, as a one-column DataFrame.
target_df = pd.read_csv(naba_filename).loc[:, [target_col]]

In [10]:
# Encode the outcome as 0/1: 'Yes' -> 1, 'No' -> 0, and any non-string entry
# (e.g. a missing value) is treated as 'No'.
#
# The integer-dtype guard keeps this cell idempotent. Once the column holds
# ints, no value is a str any more, so re-running the unguarded logic would
# silently relabel every row as 'No' (all zeros).
if not pd.api.types.is_integer_dtype(target_df[target_col]):
    labels = pd.Series(
        [x if isinstance(x, str) else 'No' for x in target_df[target_col]],
        index=target_df.index,
    )
    # regex=False: these are literal substitutions, independent of the pandas
    # version's default for the `regex` argument.
    target_df[target_col] = (
        labels
        .str.replace('No', '0', regex=False)
        .str.replace('Yes', '1', regex=False)
    )
target_df = target_df.astype(int)
target_df.value_counts()

Recipient
1            110
0             90
dtype: int64

In [11]:
# Display the encoded (0/1) target column.
target_df

Unnamed: 0,Recipient
0,0
1,0
2,0
3,0
4,1
...,...
195,1
196,0
197,0
198,0


## Build the model inputs DataFrame

In [12]:
# Model inputs: the significant regression variables side by side with the
# binary has-topic indicators. The normalised topic weights in topics_df are
# deliberately not included here.
input_frames = [reg_df, has_topics_df]
inputs_df = pd.concat(input_frames, axis='columns')
inputs_df

Unnamed: 0,is_accounting_major,is_finance_major,is_business_major_2,is_cur_lead,overall_gpa,topic_0,topic_1,topic_2,topic_3,topic_4,...,topic_64,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73
0,1,0,1,1,-0.286653,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,-0.762381,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,-0.139157,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0.068585,1,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
4,1,0,0,0,0.629487,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,1,-0.049828,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
196,0,0,0,1,-0.243028,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
197,1,0,1,1,0.297101,1,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
198,0,1,0,0,0.006262,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Logit model 

In [13]:
# Design matrix: all inputs as floats plus an intercept column ('const').
X = sm.add_constant(inputs_df.astype(float))

# Logit of the 0/1 target on the design matrix, fitted via fit_regularized.
# NOTE(review): no `alpha` is passed, so the penalty weight is whatever
# fit_regularized defaults to -- confirm that regularisation is actually applied.
logit_model = sm.Logit(endog=target_df.astype(float), exog=X)
logit_reg = logit_model.fit_regularized(maxiter=1000)
logit_reg.summary()

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3870105137320152
            Iterations: 208
            Function evaluations: 208
            Gradient evaluations: 208


0,1,2,3
Dep. Variable:,Recipient,No. Observations:,200.0
Model:,Logit,Df Residuals:,120.0
Method:,MLE,Df Model:,79.0
Date:,"Mon, 13 Feb 2023",Pseudo R-squ.:,0.4376
Time:,01:04:31,Log-Likelihood:,-77.402
converged:,True,LL-Null:,-137.63
Covariance Type:,nonrobust,LLR p-value:,0.001852

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.5045,1.537,-2.932,0.003,-7.516,-1.493
is_accounting_major,3.5418,0.859,4.123,0.000,1.858,5.226
is_finance_major,3.9688,1.147,3.459,0.001,1.720,6.218
is_business_major_2,-2.0916,1.040,-2.011,0.044,-4.130,-0.053
is_cur_lead,3.3419,0.881,3.794,0.000,1.615,5.069
overall_gpa,0.3763,0.319,1.180,0.238,-0.249,1.001
topic_0,1.4038,1.428,0.983,0.326,-1.395,4.203
topic_1,-0.1944,0.619,-0.314,0.754,-1.408,1.020
topic_2,0.2351,0.657,0.358,0.720,-1.053,1.523


In [14]:
print(f"variables with statistical significance with alpha of {p_alpha} (p-values less than this):")
# Boolean mask over the fitted parameters: True where the p-value is below p_alpha.
is_significant = logit_reg.pvalues < p_alpha
significant_var_coeffs = logit_reg.params.loc[is_significant]
significant_var_coeffs

variables with statistical significance with alpha of 0.1 (p-values less than this):


const                 -4.504468
is_accounting_major    3.541789
is_finance_major       3.968811
is_business_major_2   -2.091610
is_cur_lead            3.341944
topic_13               2.817758
topic_15              -1.447707
topic_25               2.264021
topic_42               2.062340
topic_45               2.033636
topic_57              -2.739685
topic_58              -2.670800
dtype: float64

In [15]:
print(f"ODDS RATIOS of variables with statistical significance with alpha of {p_alpha} (p-values less than this):")
# Exponentiate the logit coefficients to get odds ratios, ordered smallest to largest.
odds_ratios = np.exp(significant_var_coeffs)
significant_var_odds_ratios = odds_ratios.sort_values(ascending=True)
significant_var_odds_ratios

ODDS RATIOS of variables with statistical significance with alpha of 0.1 (p-values less than this):


const                   0.011059
topic_57                0.064591
topic_58                0.069197
is_business_major_2     0.123488
topic_15                0.235109
topic_45                7.641824
topic_42                7.864351
topic_25                9.621697
topic_13               16.739279
is_cur_lead            28.274026
is_accounting_major    34.528647
is_finance_major       52.921589
dtype: float64

In [16]:
print("variables with negative effects and their odds ratios:")
# Odds ratio below 1 means a negative coefficient. `const` is the intercept
# added by sm.add_constant, so it is not an input variable.
lowers_odds = significant_var_odds_ratios.lt(1)
significant_var_odds_ratios.loc[lowers_odds]

variables with negative effects and their odds ratios:


const                  0.011059
topic_57               0.064591
topic_58               0.069197
is_business_major_2    0.123488
topic_15               0.235109
dtype: float64

In [17]:
print("variables with positive effects and their odds ratios:")
# Odds ratio above 1 means a positive coefficient.
raises_odds = significant_var_odds_ratios.gt(1)
significant_var_odds_ratios.loc[raises_odds]

variables with positive effects and their odds ratios:


topic_45                7.641824
topic_42                7.864351
topic_25                9.621697
topic_13               16.739279
is_cur_lead            28.274026
is_accounting_major    34.528647
is_finance_major       52.921589
dtype: float64