In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Load the paper-level dataset; the path is relative to the notebook's working directory.
DATASET_PATH = '../dataset_final.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# Collapse the raw decision labels into a binary outcome: 1 for every accept
# variant, 0 for everything else (workshop invitation, withdrawal, rejection).
accepted_labels = ('Accept (Poster)', 'Accept (Oral)', 'Accept (Talk)', 'Accept (Spotlight)')
not_accepted_labels = ('Invite to Workshop Track', 'Withdrawn', 'Reject')
decision_to_binary = {
    **dict.fromkeys(accepted_labels, 1),
    **dict.fromkeys(not_accepted_labels, 0),
}

In [4]:
# Build one observation per (2020 paper, distinct ranking value listed for it).
#
# Results left in the notebook namespace:
#   scores           - mean reviewer rating of the paper, repeated once per distinct rank
#   y                - binary decision looked up in decision_to_binary, repeated likewise
#   top_10_indicator - 1 when the rank lies in [1, 10], otherwise 0
#   counts           - [number of top-10 rows, number of non-top-10 rows]
top_10_indicator = []
counts = [0] * 2
scores = []
y = []
for rankings, ratings, decision, year in zip(data['csranking'], data['ratings'], data['decisions'], data['year']):
    # Keep only 2020 rows that have both a ranking string and a ratings string.
    if pd.isnull(rankings) or pd.isnull(ratings) or year != 2020:
        continue

    # get mean reviewer score (ratings are ';'-separated integers)
    rating_avg = np.average([int(x) for x in ratings.split(';')])

    # get distinct rankings; -1 is dropped (presumably the "no rank" placeholder - TODO confirm)
    ranks_set = {int(x) for x in rankings.split(';')}
    ranks_set.discard(-1)
    if not ranks_set:
        # Nothing to emit for this paper, exactly as before.
        continue

    # get decision; fail loudly on an unknown label instead of silently putting
    # None into y, which would only blow up later inside the model fit.
    binary_decision = decision_to_binary.get(decision)
    if binary_decision is None:
        raise ValueError('Unmapped decision {!r}: add it to decision_to_binary'.format(decision))

    # for each distinct rank
    for rank in ranks_set:
        scores.append(rating_avg)
        y.append(binary_decision)

        # compute top 10 indicator; counts[0] tallies top-10 rows, counts[1] the rest
        in_top_10 = 1 if 1 <= rank <= 10 else 0
        top_10_indicator.append(in_top_10)
        counts[1 - in_top_10] += 1


In [5]:
# Assemble the design matrix in one shot: reviewer score, top-10 dummy, and an
# explicit all-ones column that plays the role of the intercept.
X = pd.DataFrame({
    'mean reviewer score': scores,
    'top ten school?': top_10_indicator,
    'constant': [1] * len(scores),
})
# Outcome vector as a numpy array.
y = np.array(y)

In [6]:
# Sanity check: the design matrix and the outcome vector should have the same row count.
print(X.shape, y.shape, sep='\n')

(3139, 3)
(3139,)


In [7]:
# Maximum-likelihood logit of the binary decision on the design matrix.
# The intercept comes from the explicit 'constant' column of X.
logreg = sm.Logit(endog=y, exog=X)
result = logreg.fit()
print(result.summary())

# Row tallies for top-10 vs. non-top-10 schools, followed by their total.
total_rows = sum(counts)
print(counts, total_rows)

Optimization terminated successfully.
         Current function value: 0.212936
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 3139
Model:                          Logit   Df Residuals:                     3136
Method:                           MLE   Df Model:                            2
Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6429
Time:                        02:13:37   Log-Likelihood:                -668.41
converged:                       True   LL-Null:                       -1871.6
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
mean reviewer score     2.4350      0.097     25.118      0.000       2.245       2.625
to

In [8]:
# compute accuracy on a held-out test set (70/30 split, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# X already carries an explicit all-ones 'constant' column, so scikit-learn must not
# add a second intercept: in an unpenalised model the two would be perfectly collinear
# and neither coefficient would be identifiable. fit_intercept=False keeps a single
# intercept, matching the statsmodels specification fitted on the same X.
# NOTE(review): penalty='none' is the older scikit-learn spelling; recent releases
# expect penalty=None - confirm against the installed version.
logreg = LogisticRegression(solver='newton-cg', fit_intercept=False, penalty='none')
result = logreg.fit(X_train, y_train)

# Hard 0/1 predictions for the held-out rows (not used further in this cell).
y_pred = result.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.92


In [9]:
# Build one observation per (2020 paper, distinct ranking value listed for it),
# with a separate 0/1 indicator for each individual top rank.
#
# Results left in the notebook namespace:
#   scores          - mean reviewer rating of the paper, repeated once per distinct rank
#   y               - binary decision looked up in decision_to_binary, repeated likewise
#   chi_1 .. chi_10 - one-hot indicators: chi_k[i] == 1 iff row i has rank k
#   counts          - rows per rank in TOP_RANKS order, with every other rank in the last slot

# Ranks that get their own indicator. There is no entry for rank 9 (presumably
# because of a tie at rank 8 - TODO confirm); a rank of 9 lands in the "other" bucket.
TOP_RANKS = (1, 2, 3, 4, 5, 6, 7, 8, 10)
OTHER_BUCKET = len(TOP_RANKS)

counts = [0] * (len(TOP_RANKS) + 1)
chi_by_rank = {top_rank: [] for top_rank in TOP_RANKS}
# Expose each indicator list under its individual name; these are the same list
# objects as the values of chi_by_rank, so appends below are visible through both.
(chi_1, chi_2, chi_3, chi_4, chi_5,
 chi_6, chi_7, chi_8, chi_10) = (chi_by_rank[top_rank] for top_rank in TOP_RANKS)
# Position of each top rank inside `counts`.
bucket_of = {top_rank: position for position, top_rank in enumerate(TOP_RANKS)}

scores = []
y = []
for rankings, ratings, decision, year in zip(data['csranking'], data['ratings'], data['decisions'], data['year']):
    # Keep only 2020 rows that have both a ranking string and a ratings string.
    if pd.isnull(rankings) or pd.isnull(ratings) or year != 2020:
        continue

    # get mean reviewer score (ratings are ';'-separated integers)
    rating_avg = np.average([int(x) for x in ratings.split(';')])

    # get distinct rankings; -1 is dropped (presumably the "no rank" placeholder - TODO confirm)
    ranks_set = {int(x) for x in rankings.split(';')}
    ranks_set.discard(-1)
    if not ranks_set:
        # Nothing to emit for this paper, exactly as before.
        continue

    # get decision; fail loudly on an unknown label instead of silently putting
    # None into y, which would only blow up later inside the model fit.
    binary_decision = decision_to_binary.get(decision)
    if binary_decision is None:
        raise ValueError('Unmapped decision {!r}: add it to decision_to_binary'.format(decision))

    # for each distinct rank
    for rank in ranks_set:
        scores.append(rating_avg)
        y.append(binary_decision)

        # compute indicators for each of the top ranks: every list grows by one
        # entry per row, and at most one of them receives a 1
        for top_rank, indicator in chi_by_rank.items():
            indicator.append(1 if rank == top_rank else 0)
        counts[bucket_of.get(rank, OTHER_BUCKET)] += 1

In [10]:
# Assemble the per-rank design matrix in one shot: reviewer score, one dummy per
# top rank, and an explicit all-ones column that plays the role of the intercept.
X = pd.DataFrame({
    'mean reviewer score': scores,
    'rank 1': chi_1,
    'rank 2': chi_2,
    'rank 3': chi_3,
    'rank 4': chi_4,
    'rank 5': chi_5,
    'rank 6': chi_6,
    'rank 7': chi_7,
    'rank 8 (tied)': chi_8,
    'rank 10': chi_10,
    'constant': [1] * len(scores),
})
# Outcome vector as a numpy array.
y = np.array(y)

In [11]:
# Maximum-likelihood logit of the binary decision on the per-rank design matrix.
# The intercept comes from the explicit 'constant' column of X.
logreg = sm.Logit(endog=y, exog=X)
result = logreg.fit()
print(result.summary())

# Row tallies for each top-10 school and for non-top-10 schools, followed by their total.
total_rows = sum(counts)
print(counts, total_rows)

Optimization terminated successfully.
         Current function value: 0.211132
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 3139
Model:                          Logit   Df Residuals:                     3128
Method:                           MLE   Df Model:                           10
Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6459
Time:                        02:13:37   Log-Likelihood:                -662.74
converged:                       True   LL-Null:                       -1871.6
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
mean reviewer score     2.4600      0.099     24.949      0.000       2.267       2.653
ra

In [12]:

# compute accuracy on a held-out test set (70/30 split, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# X already carries an explicit all-ones 'constant' column, so scikit-learn must not
# add a second intercept: in an unpenalised model the two would be perfectly collinear
# and neither coefficient would be identifiable. fit_intercept=False keeps a single
# intercept, matching the statsmodels specification fitted on the same X.
# NOTE(review): penalty='none' is the older scikit-learn spelling; recent releases
# expect penalty=None - confirm against the installed version.
logreg = LogisticRegression(solver='newton-cg', fit_intercept=False, penalty='none')
result = logreg.fit(X_train, y_train)

# Hard 0/1 predictions for the held-out rows (not used further in this cell).
y_pred = result.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.92


In [13]:
# Build one observation per (2020 paper, distinct institution listed for it),
# with 0/1 indicators for Google, Facebook and Microsoft.
#
# Results left in the notebook namespace:
#   scores        - mean reviewer rating of the paper, repeated once per distinct institution
#   y             - binary decision looked up in decision_to_binary, repeated likewise
#   google_ind, facebook_ind, microsoft_ind - 1 when the row's institution is that company
scores = []
google_ind = []
facebook_ind = []
microsoft_ind = []
# Exact institution string -> indicator list that should receive the 1.
company_indicators = {
    'Google': google_ind,
    'Facebook': facebook_ind,
    'Microsoft': microsoft_ind,
}

y = []
for institution, ratings, decision, year in zip(data['institution'], data['ratings'], data['decisions'], data['year']):
    # Keep only 2020 rows that have both an institution string and a ratings string.
    if pd.isnull(institution) or pd.isnull(ratings) or year != 2020:
        continue

    # get mean reviewer score (ratings are ';'-separated integers)
    rating_avg = np.average([int(x) for x in ratings.split(';')])

    # get distinct institutions (';'-separated, compared verbatim)
    institution_set = set(institution.split(';'))

    # get decision; fail loudly on an unknown label instead of silently putting
    # None into y, which would only blow up later inside the model fit.
    binary_decision = decision_to_binary.get(decision)
    if binary_decision is None:
        raise ValueError('Unmapped decision {!r}: add it to decision_to_binary'.format(decision))

    # for each distinct institution
    for inst in institution_set:
        scores.append(rating_avg)
        y.append(binary_decision)

        # compute Google, Facebook, and Microsoft indicators: every list grows by
        # one entry per row, and at most one of them receives a 1
        for company, indicator in company_indicators.items():
            indicator.append(1 if inst == company else 0)


In [14]:
# Assemble the company design matrix in one shot: reviewer score, one dummy per
# company, and an explicit all-ones column that plays the role of the intercept.
X = pd.DataFrame({
    'mean reviewer score': scores,
    'google': google_ind,
    'facebook': facebook_ind,
    'microsoft': microsoft_ind,
    'constant': [1] * len(scores),
})

# Number of rows flagged for each company (Google, Facebook, Microsoft, in that order).
print(*(np.sum(indicator) for indicator in (google_ind, facebook_ind, microsoft_ind)))

303 110 92


In [15]:
# Maximum-likelihood logit of the binary decision on the company design matrix.
# The intercept comes from the explicit 'constant' column of X.
logreg = sm.Logit(endog=y, exog=X)
result = logreg.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.216186
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 4876
Model:                          Logit   Df Residuals:                     4871
Method:                           MLE   Df Model:                            4
Date:                Fri, 02 Oct 2020   Pseudo R-squ.:                  0.6339
Time:                        02:13:37   Log-Likelihood:                -1054.1
converged:                       True   LL-Null:                       -2879.6
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
mean reviewer score     2.3515      0.075     31.557      0.000       2.205       2.498
go

In [16]:
# compute accuracy on a held-out test set (70/30 split, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# X already carries an explicit all-ones 'constant' column, so scikit-learn must not
# add a second intercept: in an unpenalised model the two would be perfectly collinear
# and neither coefficient would be identifiable. fit_intercept=False keeps a single
# intercept, matching the statsmodels specification fitted on the same X.
# NOTE(review): penalty='none' is the older scikit-learn spelling; recent releases
# expect penalty=None - confirm against the installed version.
logreg = LogisticRegression(solver='newton-cg', fit_intercept=False, penalty='none')
result = logreg.fit(X_train, y_train)

# Hard 0/1 predictions for the held-out rows (not used further in this cell).
y_pred = result.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.90
