In [36]:
import os
import pandas as pd
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import copy 

# Load the per-paper feature table produced by the feature-extraction step.
df = pd.read_csv("../feature_extraction/iclr.csv")

# Shuffle the rows once. The seed is fixed so that the row order -- and therefore
# every train/test split and reported metric derived from `df` -- is identical
# after a kernel restart.
df = df.sample(frac=1, random_state=0)

In [37]:
def normalize(train, test):
    """Standardize ``train`` and ``test`` with statistics taken from ``train`` only.

    The per-column mean and standard deviation are computed along axis 0 of
    ``train`` and applied to both inputs, so no information from ``test`` is
    used for the scaling.

    Columns whose training standard deviation is exactly zero (constant
    features) are only centered: their divisor is replaced by 1 so the result
    is 0 instead of NaN/inf from a division by zero.

    Args:
        train: array-like or DataFrame used to estimate mean and std.
        test: array-like or DataFrame with the same columns as ``train``.

    Returns:
        Tuple ``(train_scaled, test_scaled)``.
    """
    mean = np.mean(train, axis=0)
    std = np.std(train, axis=0)
    # Guard against constant columns; dividing by 0 would poison the model input.
    safe_std = np.where(std == 0, 1.0, std)

    return (train - mean) / safe_std, (test - mean) / safe_std

In [38]:
# Linear regression of the average reviewer recommendation on the
# equation/symbol features.
feature_names = [
    "num_equations",
    "mean_num_new_symbols_introduced",
    "num_overall_unique_symbols",
    "mean_num_unique_symbols",
    "std_of_unique_symbols",
    "max_representational_complexity",
]
# Work on independent copies so `df` itself is never touched by this cell.
x = df[feature_names].copy()
y = df["recommendation_avg"].copy()

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Scale both splits with the training-split statistics.
X_train, X_test = normalize(X_train, X_test)

linreg = LinearRegression()
linreg.fit(X_train, y_train)


def norm(x, y):
    """Return the mean squared difference between ``x`` and ``y``.

    Both arguments are converted with ``np.asarray`` and subtracted with NumPy
    broadcasting, so ``x`` may be a scalar compared against a whole vector.
    """
    difference = np.asarray(x) - np.asarray(y)
    return np.mean(difference ** 2)


train_error = norm(linreg.predict(X_train), y_train)
test_error = norm(linreg.predict(X_test), y_test)
# Baseline: always predict the mean recommendation seen in the training split.
baseline_error = norm(np.mean(y_train), y_test)

print("Train error Linear Regression: {0}".format(train_error))
print("Test error Linear Regression: {0}".format(test_error))
print("Test error Avg: {0}".format(baseline_error))
print("-------------------------------------------------------------------------")
cdf = pd.DataFrame(data=linreg.coef_, index=X_train.columns, columns=['Coefficients'])
print(cdf)

Train error Linear Regression: 1.7094292090965044
Test error Linear Regression: 1.6808854398560353
Test error Avg: 1.849003410858768
-------------------------------------------------------------------------
                                 Coefficients
num_equations                        0.113223
mean_num_new_symbols_introduced     -0.220332
num_overall_unique_symbols           0.174706
mean_num_unique_symbols             -0.094790
std_of_unique_symbols                0.186908
max_representational_complexity     -0.152643


In [39]:

# Multi-class logistic regression: predict the paper status from the
# equation/symbol features and compare against trivial baselines.
FEATURE_COLUMNS = [
    "num_equations",
    "mean_num_new_symbols_introduced",
    "num_overall_unique_symbols",
    "mean_num_unique_symbols",
    "std_of_unique_symbols",
    "max_representational_complexity",
]
# Integer class code for every status label (insertion order == code order).
STATUS_CODES = {
    "Withdraw": 0,
    "Reject": 1,
    "Desk Reject": 2,
    "Top-25%": 3,
    "Top-5%": 4,
    "Poster": 5,
}
SEED = 0

x = df[FEATURE_COLUMNS].copy()

# Fail early, with a readable message, if the data contains a status that has
# no class code (including missing values).
unknown_statuses = set(df["status"].unique()) - set(STATUS_CODES)
if unknown_statuses:
    raise ValueError("Unexpected status values: {0}".format(sorted(unknown_statuses, key=str)))
# `map` builds a new Series, so `df` is left untouched.
y = np.asarray(df["status"].map(STATUS_CODES))

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0
                                                    )

X_train, X_test = normalize(X_train, X_test)

y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

# saga is a stochastic solver; seed it so the fitted coefficients are reproducible.
logreg = LogisticRegression(max_iter=50000, solver="saga", random_state=SEED)
logreg.fit(X_train, y_train)


def acc(x, y):
    """Return the fraction of positions where ``x`` equals ``y``.

    Both arguments go through ``np.asarray`` and are compared with NumPy
    broadcasting, so ``x`` may be a single class code compared against a
    whole label vector.
    """
    return np.mean((np.asarray(x) == np.asarray(y)).astype(np.int32))


# Predict once and reuse the result for both tables below.
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)

# Seeded generator so the random-guess baselines are stable across runs.
rng = np.random.default_rng(SEED)
# Label pools for the restricted random baselines (drawn from the training labels).
reject_or_poster = y_train[(y_train == 1) | (y_train == 5)]
withdraw_reject_or_poster = y_train[(y_train == 0) | (y_train == 1) | (y_train == 5)]

accuracy_columns = {
    "Set": ["Train", "Test"],
    "Logistic Regression": [acc(y_train_pred, y_train), acc(y_test_pred, y_test)],
    "Random guess": [acc(rng.choice(y_train, size=y_train.size), y_train),
                     acc(rng.choice(y_train, size=y_test.size), y_test)],
}
# Constant-prediction baselines, one column per status.
for status_name, status_code in STATUS_CODES.items():
    accuracy_columns["Predict class {0}".format(status_name)] = [acc(status_code, y_train),
                                                                 acc(status_code, y_test)]
accuracy_columns["Random guess Reject or Poster"] = [
    acc(rng.choice(reject_or_poster, size=y_train.size), y_train),
    acc(rng.choice(reject_or_poster, size=y_test.size), y_test),
]
accuracy_columns["Random guess Withdraw, Reject or Poster"] = [
    acc(rng.choice(withdraw_reject_or_poster, size=y_train.size), y_train),
    acc(rng.choice(withdraw_reject_or_poster, size=y_test.size), y_test),
]
clf = pd.DataFrame(accuracy_columns)
print("Accuracies: ")
print(clf)
print("-------------------------------------------------------------------------")

# How many samples of each class are present vs. predicted, per split.
distribution_columns = {"Set": ["Train set", "Train prediction", "Test set", "Test prediction"]}
for status_name, status_code in STATUS_CODES.items():
    distribution_columns[status_name] = [np.sum(y_train == status_code),
                                         np.sum(y_train_pred == status_code),
                                         np.sum(y_test == status_code),
                                         np.sum(y_test_pred == status_code)]
clf = pd.DataFrame(distribution_columns)
print("Prediction Distribution: ")
print(clf)
print("-------------------------------------------------------------------------")
print("Coefficients: ")
clf = pd.DataFrame(columns=["equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"],data=logreg.coef_)
# Label the coefficient rows from the classes the model was actually fitted on
# instead of assuming all six statuses occur in the training split.
# With exactly two fitted classes scikit-learn stores a single coefficient row,
# which belongs to the second class.
code_to_status = {code: name for name, code in STATUS_CODES.items()}
if logreg.coef_.shape[0] == len(logreg.classes_):
    coef_classes = logreg.classes_
else:
    coef_classes = logreg.classes_[1:]
clf["status"] = [code_to_status[int(code)] for code in coef_classes]
clf = clf[["status","equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"]]
pd.set_option('display.width',1000)
print(clf)


Accuracies: 
     Set  Logistic Regression  Random guess  Predict class Withdraw  Predict class Reject  Predict class Desk Reject  Predict class Top-25%  Predict class Top-5%  Predict class Poster  Random guess Reject or Poster  Random guess Withdraw, Reject or Poster
0  Train             0.401009      0.296343                0.160151              0.361917                   0.006305               0.085750              0.023960              0.361917                       0.370744                                 0.325347
1   Test             0.442815      0.310850                0.131965              0.434018                   0.000000               0.064516              0.032258              0.337243                       0.410557                                 0.392962
-------------------------------------------------------------------------
Prediction Distribution: 
                Set  Withdraw  Reject  Desk Reject  Top-25%  Top-5%  Poster
0         Train set       127     287      

In [None]:

# Binary logistic regression: accept vs. reject, with withdrawn papers excluded.
FEATURE_COLUMNS = [
    "num_equations",
    "mean_num_new_symbols_introduced",
    "num_overall_unique_symbols",
    "mean_num_unique_symbols",
    "std_of_unique_symbols",
    "max_representational_complexity",
]
# 0 = rejected (incl. desk reject), 1 = accepted (any acceptance tier).
STATUS_TO_BINARY = {
    "Reject": 0,
    "Desk Reject": 0,
    "Top-25%": 1,
    "Top-5%": 1,
    "Poster": 1,
}
SEED = 0

# Boolean filtering already yields a new frame, so `df` is not modified below.
decided = df[df.status != "Withdraw"]
x = decided[FEATURE_COLUMNS].copy()

# Fail early, with a readable message, if a status has no binary label
# (including missing values).
unknown_statuses = set(decided["status"].unique()) - set(STATUS_TO_BINARY)
if unknown_statuses:
    raise ValueError("Unexpected status values: {0}".format(sorted(unknown_statuses, key=str)))
y = np.asarray(decided["status"].map(STATUS_TO_BINARY))

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0
                                                    )

X_train, X_test = normalize(X_train, X_test)

y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

logreg = LogisticRegression(max_iter=50000, solver="lbfgs")
logreg.fit(X_train, y_train)


def acc(x, y):
    """Return the fraction of positions where ``x`` equals ``y``.

    Both arguments go through ``np.asarray`` and are compared with NumPy
    broadcasting, so ``x`` may be a single class code compared against a
    whole label vector.
    """
    return np.mean((np.asarray(x) == np.asarray(y)).astype(np.int32))


# Predict once and reuse the result for both tables below.
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)

# Seeded generator so the random-guess baseline is stable across runs.
rng = np.random.default_rng(SEED)

clf = pd.DataFrame({"Set" : ["Train", "Test"],
                    "Logistic Regression" : [acc(y_train_pred, y_train), acc(y_test_pred, y_test)],
                    "Random guess" : [acc(rng.choice(y_train, size=y_train.size), y_train),
                                      acc(rng.choice(y_train, size=y_test.size), y_test)],
                    "Predict class Reject" : [acc(0, y_train), acc(0, y_test)],
                    "Predict class Accept" : [acc(1, y_train), acc(1, y_test)]})
print("Accuracies: ")
print(clf)
print("-------------------------------------------------------------------------")
# How many samples of each class are present vs. predicted, per split.
clf = pd.DataFrame({"Set" : ["Train set", "Train prediction", "Test set", "Test prediction"],
                    "Reject" : [np.sum(y_train == 0), np.sum(y_train_pred == 0), np.sum(y_test == 0), np.sum(y_test_pred == 0)],
                    "Accept" : [np.sum(y_train == 1), np.sum(y_train_pred == 1), np.sum(y_test == 1), np.sum(y_test_pred == 1)]})
print("Prediction Distribution: ")
print(clf)
print("-------------------------------------------------------------------------")
print("Coefficients: ")

# One row of coefficients, one column per (abbreviated) feature name.
clf = pd.DataFrame(columns=["equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"],data=logreg.coef_)
pd.set_option('display.width',1000)
print(clf)


Accuracies: 
     Set  Logistic Regression  Random guess  Predict class Reject  Predict class Accept
0  Train             0.586924      0.540862              0.462110              0.537890
1   Test             0.512111      0.456747              0.446367              0.553633
-------------------------------------------------------------------------
Prediction Distribution: 
                Set  Reject  Accept
0         Train set     311     362
1  Train prediction     265     408
2          Test set     129     160
3   Test prediction     104     185
-------------------------------------------------------------------------
Coefficients: 
   equations  mean_num_nsi  num_unique  mean_num_unique  std_unique  complexity
0   0.000566     -1.283642    0.003594         0.051053    0.181072     -0.0482
