In [38]:
import os
import pandas as pd
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import copy 
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier

df = pd.read_csv("../feature_extraction/neurips.csv")

df = df.sample(frac=1)

In [6]:
def normalize(train, test):
    mean = np.mean(train,axis=0)
    std = np.std(train,axis=0)

    return (train-mean)/std,(test-mean)/std

In [7]:
x = copy.deepcopy(df)[["num_equations","mean_num_new_symbols_introduced","num_overall_unique_symbols","mean_num_unique_symbols","std_of_unique_symbols","max_representational_complexity"]]
y = copy.deepcopy(df).recommendation_avg

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0
                                                    )

X_train,X_test = normalize(X_train, X_test)

linreg = LinearRegression()
linreg.fit(X_train, y_train)

def norm(x,y):
    return np.mean((np.asarray(x)-np.asarray(y))**2)

print("Train error Linear Regression: {0}".format(norm(linreg.predict(X_train),y_train)))
print("Test error Linear Regression: {0}".format(norm(linreg.predict(X_test),y_test)))
print("Test error guess Avg: {0}".format(norm(np.mean(y_train),y_test)))
print("-------------------------------------------------------------------------")
cdf = pd.DataFrame(linreg.coef_, X_train.columns, columns=['Coefficients'])
print(cdf)

Train error Linear Regression: 0.46377212434568804
Test error Linear Regression: 0.42979916025462245
Test error guess Avg: 0.42989390445237585
-------------------------------------------------------------------------
                                 Coefficients
num_equations                        0.117641
mean_num_new_symbols_introduced      0.013894
num_overall_unique_symbols          -0.021387
mean_num_unique_symbols             -0.031399
std_of_unique_symbols                0.007026
max_representational_complexity     -0.042676


In [44]:

x = copy.deepcopy(df)[["num_equations","mean_num_new_symbols_introduced","num_overall_unique_symbols","mean_num_unique_symbols","std_of_unique_symbols","max_representational_complexity"]]
y = copy.deepcopy(df).status
y = np.asarray(y)
y[y == "Reject"] = 0
y[y == "Poster"] = 1
y[y == "Spotlight"] = 2
y[y == "Oral"] = 3


X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0
                                                    )

X_train,X_test = normalize(X_train, X_test)

y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

classifier = EasyEnsembleClassifier(random_state=0)
classifier.fit(X_train, y_train)
#logreg = LogisticRegression(max_iter=50000,solver="saga")
#logreg.fit(X_resampled, y_resampled)

def acc(x,y):
    return np.mean((np.asarray(x) == np.asarray(y)).astype(np.int32))

clf = pd.DataFrame({"Set" : ["Train", "Test"],
                    "Logistic Regression" : [acc(classifier.predict(X_train),y_train),acc(classifier.predict(X_test),y_test)],
                    "Random guess" : [acc(np.random.choice(y_train,size=y_train.size),y_train),
                                      acc(np.random.choice(y_train,size=y_test.size),y_test)],
                    "Predict class Reject" : [acc(0,y_train),acc(0,y_test)],
                    "Predict class Poster" : [acc(1,y_train),acc(1,y_test)],
                    "Predict class Spotlight" : [acc(2,y_train),acc(2,y_test)],
                    "Predict class Oral" : [acc(3,y_train),acc(3,y_test)],
        
                    "Random guess Reject or Poster" : [acc(np.random.choice(np.delete(y_train,(y_train == 2) | (y_train == 3)),size=y_train.size),y_train),
                                                       acc(np.random.choice(np.delete(y_train,(y_train == 2) | (y_train == 3)),size=y_test.size),y_test)],})
print("Accuracies: ")
print(clf)
print("-------------------------------------------------------------------------")
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)
clf = pd.DataFrame({"Set" : ["Train set", "Train prediction", "Test set", "Test prediction"],
                    "Reject" : [np.sum(y_train == 0),np.sum(y_train_pred == 0),np.sum(y_test == 0),np.sum(y_test_pred == 0)],
                    "Poster" : [np.sum(y_train == 1),np.sum(y_train_pred == 1),np.sum(y_test == 1),np.sum(y_test_pred == 1)],
                    "Spotlight" : [np.sum(y_train == 2),np.sum(y_train_pred == 2),np.sum(y_test == 2),np.sum(y_test_pred == 2)],
                    "Oral" : [np.sum(y_train == 3),np.sum(y_train_pred == 3),np.sum(y_test == 3),np.sum(y_test_pred == 3)],})

print("Prediction Distribution: ")
print(clf)
print("-------------------------------------------------------------------------")

"""
print("Coefficients: ")
clf = pd.DataFrame(columns=["equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"],data=classifier.coef_)
clf["status"] = ["Reject","Poster","Spotlight","Oral"]
clf = clf[["status","equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"]]
pd.set_option('display.width',1000)
print(clf)
"""


Accuracies: 
     Set  Logistic Regression  Random guess  Predict class Reject  Predict class Poster  Predict class Spotlight  Predict class Oral  Random guess Reject or Poster
0  Train             0.324652      0.695106              0.046700              0.831612                 0.101931            0.019758                       0.783116
1   Test             0.274346      0.687958              0.060733              0.808377                 0.112042            0.018848                       0.772775
-------------------------------------------------------------------------
Prediction Distribution: 
                Set  Reject  Poster  Spotlight  Oral
0         Train set     104    1852        227    44
1  Train prediction     441     706        287   793
2          Test set      58     772        107    18
3   Test prediction     211     291        102   351
-------------------------------------------------------------------------


'\nprint("Coefficients: ")\nclf = pd.DataFrame(columns=["equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"],data=classifier.coef_)\nclf["status"] = ["Reject","Poster","Spotlight","Oral"]\nclf = clf[["status","equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"]]\npd.set_option(\'display.width\',1000)\nprint(clf)\n'

In [45]:

x = copy.deepcopy(df)[df.status != "Withdraw"][["num_equations","mean_num_new_symbols_introduced","num_overall_unique_symbols","mean_num_unique_symbols","std_of_unique_symbols","max_representational_complexity"]]
y = copy.deepcopy(df)[df.status != "Withdraw"].status
y = np.asarray(y)
y[y == "Reject"] = 0
y[y == "Poster"] = 1
y[y == "Spotlight"] = 1
y[y == "Oral"] = 1



X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0
                                                    )

X_train,X_test = normalize(X_train, X_test)

y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

classifier = EasyEnsembleClassifier(random_state=0)

classifier.fit(X_train, y_train)

def acc(x,y):
    return np.mean((np.asarray(x) == np.asarray(y)).astype(np.int32))

clf = pd.DataFrame({"Set" : ["Train", "Test"],
                    "Logistic Regression" : [acc(classifier.predict(X_train),y_train),acc(classifier.predict(X_test),y_test)],
                    "Random guess" : [acc(np.random.choice(y_train,size=y_train.size),y_train),
                                      acc(np.random.choice(y_train,size=y_test.size),y_test)],
                    "Predict class Reject" : [acc(0,y_train),acc(0,y_test)],
                    "Predict class Accept" : [acc(1,y_train),acc(1,y_test)]})
print("Accuracies: ")
print(clf)
print("-------------------------------------------------------------------------")
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)
clf = pd.DataFrame({"Set" : ["Train set", "Train prediction", "Test set", "Test prediction"],
                    "Reject" : [np.sum(y_train == 0),np.sum(y_train_pred == 0),np.sum(y_test == 0),np.sum(y_test_pred == 0)],
                    "Accept" : [np.sum(y_train == 1),np.sum(y_train_pred == 1),np.sum(y_test == 1),np.sum(y_test_pred == 1)]})
print("Prediction Distribution: ")
print(clf)
print("-------------------------------------------------------------------------")
print("Coefficients: ")

"""
clf = pd.DataFrame(columns=["equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"],data=np.concatenate([-classifier.coef_,classifier.coef_]))
clf["status"] = ["Reject","Accept"]
clf = clf[["status","equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"]]
pd.set_option('display.width',1000)
print(clf)
"""


Accuracies: 
     Set  Logistic Regression  Random guess  Predict class Reject  Predict class Accept
0  Train             0.528963      0.900763              0.046700              0.953300
1   Test             0.507853      0.897382              0.060733              0.939267
-------------------------------------------------------------------------
Prediction Distribution: 
                Set  Reject  Accept
0         Train set     104    2123
1  Train prediction    1115    1112
2          Test set      58     897
3   Test prediction     476     479
-------------------------------------------------------------------------
Coefficients: 


'\nclf = pd.DataFrame(columns=["equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"],data=np.concatenate([-classifier.coef_,classifier.coef_]))\nclf["status"] = ["Reject","Accept"]\nclf = clf[["status","equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"]]\npd.set_option(\'display.width\',1000)\nprint(clf)\n'