In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
# read the arXiv submission table; the path is relative and points one directory above the working directory
data = pd.read_csv(filepath_or_buffer='../dataset_arxiv.csv')

In [3]:
# map every decision label found in the dataset onto a binary outcome:
# 1 = accepted (any presentation format), 0 = not accepted
decision_to_binary = {
    'Accept (Poster)': 1,
    'Accept (Oral)': 1,
    'Accept (Talk)': 1,
    'Accept (Spotlight)': 1,
    'Invite to Workshop Track': 0,
    'Withdrawn': 0,
    'Reject': 0,
    'accept': 1,
    'reject': 0,
}

In [4]:
# accumulators: one entry is appended per (paper, selected rank) pair
scores, arXiv = [], []

# cnt: arXiv-posted observations flagged as visible; tot: all arXiv-posted observations
cnt, tot = 0, 0

y = []
for ratings, rankings, decision, year, days, versions in zip(
        data['ratings'], data['csranking'], data['decisions'],
        data['year'], data['days'], data['versions']):
    # only 2020 submissions that carry ranking information take part
    if year != 2020 or pd.isnull(rankings):
        continue

    # binary accept/reject label (None when the decision label is not in the mapping)
    binary_decision = decision_to_binary.get(decision)

    # distinct ranks attached to the paper; -1 entries are dropped
    ranks_set = {int(token) for token in rankings.split(';')}
    ranks_set.discard(-1)

    # mean of the ';'-separated integer reviewer ratings
    rating_avg = np.average([int(token) for token in ratings.split(';')])

    for rank in ranks_set:
        # Rank subset under study -- keep exactly one filter active.
        # Alternatives:
        #   not top 10:                        if 1 <= rank <= 10: continue
        #   top 10:                            if rank > 10: continue
        #   top 10 excluding CMU MIT Cornell:  if rank > 10 or rank == 1 or rank == 2 or rank == 7: continue
        # Active filter: CMU MIT Cornell
        if rank not in (1, 2, 7):
            continue

        # a paper counts as "on arXiv" when it has a versions entry; it is flagged
        # as visible during the review process only when, in addition, days >= -7
        on_arxiv = pd.notnull(versions)
        seen_in_review = bool(on_arxiv and days >= -7)

        arXiv.append(int(seen_in_review))
        cnt += int(seen_in_review)
        tot += int(on_arxiv)

        scores.append(rating_avg)
        y.append(binary_decision)

In [5]:
# design matrix: mean reviewer score, arXiv-visibility indicator and an explicit intercept column
X = pd.DataFrame({
    'mean reviewer score': scores,
    'arXiv': arXiv,
    'constant': [1] * len(scores),
})
# accept/reject labels as a NumPy array
y = np.array(y)

In [6]:
# cnt: arXiv observations flagged visible during review; tot: all arXiv observations;
# then their ratio and the total number of observations.
# The ratio is guarded so that a subset without any arXiv paper prints nan
# instead of raising ZeroDivisionError.
print(cnt, tot, cnt / tot if tot else float('nan'), len(scores))

82 161 0.5093167701863354 259


In [7]:
# maximum-likelihood logit of the accept/reject label on the design matrix
logreg = sm.Logit(endog=y, exog=X)
result = logreg.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.203377
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  259
Model:                          Logit   Df Residuals:                      256
Method:                           MLE   Df Model:                            2
Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6934
Time:                        02:14:03   Log-Likelihood:                -52.675
converged:                       True   LL-Null:                       -171.79
Covariance Type:            nonrobust   LLR p-value:                 1.865e-52
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
mean reviewer score     2.7144      0.372      7.295      0.000       1.985       3.444
ar

In [8]:
# hold out 30% of the observations (fixed seed) and score the classifier on them
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# unpenalised logistic regression fitted on the training split only
# NOTE(review): recent scikit-learn releases appear to expect penalty=None rather than
# the string 'none' -- confirm against the installed version before changing it.
# NOTE(review): X already has a 'constant' column, so fit_intercept=True adds a second
# intercept term -- confirm this is intended.
logreg = LogisticRegression(penalty='none', solver='newton-cg', fit_intercept=True)
result = logreg.fit(X_train, y_train)

y_pred = result.predict(X_test)

test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.94


In [9]:
# accumulators: one entry is appended per (paper, selected institution) pair
scores = []
visible = []  # NOTE(review): never filled in the cells shown
arXiv = []

# cnt: arXiv-posted observations flagged as visible; tot: all arXiv-posted observations
cnt = 0
tot = 0

y = []
for ratings, institution, decision, year, days, versions in zip(data['ratings'], data['institution'], data['decisions'], data['year'], data['days'], data['versions']):
    # Only the year and the institution field are filtered on. There is deliberately
    # no test on `rankings`: that name is not one of the columns iterated by this
    # loop, so referring to it would read whatever value an earlier cell left behind
    # (or raise NameError on a fresh kernel).
    if year == 2020 and pd.notnull(institution):
        # get decision (None when the decision label is not in the mapping)
        binary_decision = decision_to_binary.get(decision)

        # get distinct institutions listed on the paper
        institution_set = set(institution.split(';'))

        # get mean reviewer score
        rates = [int(x) for x in ratings.split(';')]
        rating_avg = np.average(rates)

        # for each distinct institution
        for inst in institution_set:
            # uncomment which subset to study

#             if inst != 'Facebook':
#                 continue
#             if inst != 'Microsoft':
#                 continue
            if inst != 'Google':
                continue

            # not on arXiv
            if pd.isnull(versions):
                arXiv.append(0)

            # if on arxiv, check if visible during the review process
            else:
                # count total on arxiv and how many were visible
                if days >= -7:
                    arXiv.append(1)
                    cnt += 1
                else:
                    arXiv.append(0)
                tot += 1

            scores.append(rating_avg)
            y.append(binary_decision)


In [10]:
# design matrix: mean reviewer score, arXiv-visibility indicator and an explicit intercept column
X = pd.DataFrame({
    'mean reviewer score': scores,
    'arXiv': arXiv,
    'constant': [1] * len(scores),
})
# accept/reject labels as a NumPy array
y = np.array(y)

In [11]:
# cnt: arXiv observations flagged visible during review; tot: all arXiv observations;
# then their ratio.
# The ratio is guarded so that a subset without any arXiv paper prints nan
# instead of raising ZeroDivisionError.
print(cnt, tot, cnt / tot if tot else float('nan'))

105 217 0.4838709677419355


In [12]:
# maximum-likelihood logit of the accept/reject label on the design matrix
logreg = sm.Logit(endog=y, exog=X)
result = logreg.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.258031
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  303
Model:                          Logit   Df Residuals:                      300
Method:                           MLE   Df Model:                            2
Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6172
Time:                        02:14:03   Log-Likelihood:                -78.183
converged:                       True   LL-Null:                       -204.24
Covariance Type:            nonrobust   LLR p-value:                 1.792e-55
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
mean reviewer score     2.4563      0.295      8.337      0.000       1.879       3.034
ar

In [13]:
# hold out 30% of the observations (fixed seed) and score the classifier on them
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# unpenalised logistic regression fitted on the training split only
# NOTE(review): recent scikit-learn releases appear to expect penalty=None rather than
# the string 'none' -- confirm against the installed version before changing it.
# NOTE(review): X already has a 'constant' column, so fit_intercept=True adds a second
# intercept term -- confirm this is intended.
logreg = LogisticRegression(penalty='none', solver='newton-cg', fit_intercept=True)
result = logreg.fit(X_train, y_train)

y_pred = result.predict(X_test)

test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.91
