In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# load the dataset; later cells read its ratings, decisions, year and genders columns
data = pd.read_csv(filepath_or_buffer='../dataset_final.csv')

In [3]:
# map every raw decision label onto a binary class: 1 = accept, 0 = reject
decision_to_binary = {
    # accepted, regardless of presentation format
    'Accept (Poster)': 1,
    'Accept (Oral)': 1,
    'Accept (Talk)': 1,
    'Accept (Spotlight)': 1,
    # everything else counts as a rejection
    'Invite to Workshop Track': 0,
    'Withdrawn': 0,
    'Reject': 0,
}

In [4]:
# fit logistic regression model with either first or last author genders
def logistic_regression(author):
    """Fit accept/reject logistic regressions on the 2020 rows of ``data``.

    Reads the module-level ``data`` frame (columns ``ratings``, ``decisions``,
    ``year`` and ``genders``) and the module-level ``decision_to_binary``
    mapping.

    A row is kept only when ``year == 2020`` and the selected author's gender
    label is neither ``'-1'`` nor ``'u'``. Every kept row contributes three
    regressors: the mean of its ``;``-separated integer ratings, a gender
    indicator (1 when the label is ``'m'``, 0 for any other label) and a
    constant 1.

    Parameters
    ----------
    author : bool
        Truthy selects the first entry of the ``;``-separated gender list,
        falsy selects the last entry.

    Returns
    -------
    tuple
        ``(summary, accuracy)`` where ``summary`` is the statsmodels summary
        of a ``Logit`` fit on all kept rows, and ``accuracy`` is the score of
        a scikit-learn ``LogisticRegression`` trained on a 70% split and
        evaluated on the held-out 30% (``random_state=0``).

    Raises
    ------
    ValueError
        If a kept row carries a decision label that is missing from
        ``decision_to_binary``.
    """
    index = 0 if author else -1

    scores = []
    gender_indicator = []
    y = []
    for ratings, decision, year, gender in zip(data['ratings'], data['decisions'], data['year'], data['genders']):
        if year != 2020:
            continue

        # get genders, omit unlabelled authors
        label = gender.split(';')[index]
        if label in ('-1', 'u'):
            continue

        # get decision; a plain .get() would silently put None into y and
        # only fail later, inside the model fit, with an unrelated message
        if decision not in decision_to_binary:
            raise ValueError('unmapped decision label: {!r}'.format(decision))

        # compute gender indicator
        gender_indicator.append(1 if label == 'm' else 0)

        # get mean reviewer score
        rates = [int(x) for x in ratings.split(';')]
        scores.append(np.average(rates))

        y.append(decision_to_binary[decision])

    X = pd.DataFrame()
    X['mean reviewer score'] = scores
    X['gender indicator'] = gender_indicator
    X['constant'] = [1] * len(scores)
    y = np.array(y)

    # fit logistic regression model on every kept row (statsmodels does not
    # add an intercept on its own, hence the explicit constant column)
    summary = sm.Logit(y, X).fit().summary()

    # compute test accuracy on a held-out 30% split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    # X already carries the constant column, so a second fitted intercept
    # would be perfectly collinear with it; let the constant act as intercept.
    # NOTE(review): newer scikit-learn releases spell the unpenalised option
    # penalty=None rather than 'none' -- adjust to the installed version.
    classifier = LogisticRegression(solver='newton-cg', fit_intercept=False, penalty='none')
    classifier.fit(X_train, y_train)

    # return summary and accuracy
    return summary, classifier.score(X_test, y_test)

In [5]:
# first-author model: regression summary followed by held-out accuracy
print('First author\n')
summary_first, score_first = logistic_regression(author=True)
print(summary_first)
print('Accuracy: ', score_first)

# last-author model: same report for the last listed author
print('Last author\n')
summary_last, score_last = logistic_regression(author=False)
print(summary_last)
print('Accuracy: ', score_last)

First author

Optimization terminated successfully.
         Current function value: 0.207938
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 2419
Model:                          Logit   Df Residuals:                     2416
Method:                           MLE   Df Model:                            2
Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6412
Time:                        02:13:51   Log-Likelihood:                -503.00
converged:                       True   LL-Null:                       -1401.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
mean reviewer score     2.3956      0.109     21.887      0.000       2.181 