In [27]:
import os
import pandas as pd
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import copy 

df = pd.read_csv("../feature_extraction/iclr.csv")

df = df.sample(frac=1)

In [30]:
x = copy.deepcopy(df)[["num_equations","mean_num_new_symbols_introduced","num_overall_unique_symbols","mean_num_unique_symbols","std_of_unique_symbols","max_representational_complexity"]]
y = copy.deepcopy(df).recommendation_avg

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1,
                                                    random_state=0
                                                    )

linreg = LinearRegression()
linreg.fit(X_train, y_train)

def norm(x,y):
    return np.mean((np.asarray(x)-np.asarray(y))**2)

print("Train error Linear Regression: {0}".format(norm(linreg.predict(X_train),y_train)))
print("Test error Linear Regression: {0}".format(norm(linreg.predict(X_test),y_test)))
print("Test error Avg: {0}".format(norm(np.mean(y_train),y_test)))
print("-------------------------------------------------------------------------")
cdf = pd.DataFrame(linreg.coef_, X_train.columns, columns=['Coefficients'])
print(cdf)

Traom error Linear Regression: 1.7319079524980154
Test error Linear Regression: 1.421142220880844
Test error Avg: 1.6133419337861847
-------------------------------------------------------------------------
                                 Coefficients
num_equations                        0.000256
mean_num_new_symbols_introduced     -1.313887
num_overall_unique_symbols           0.006394
mean_num_unique_symbols             -0.044681
std_of_unique_symbols                0.108664
max_representational_complexity     -0.024518


In [None]:

x = copy.deepcopy(df)[["num_equations","mean_num_new_symbols_introduced","num_overall_unique_symbols","mean_num_unique_symbols","std_of_unique_symbols","max_representational_complexity"]]
y = copy.deepcopy(df).status
y = np.asarray(y)
y[y == "Withdraw"] = 0
y[y == "Reject"] = 1
y[y == "Desk Reject"] = 2
y[y == "Top-25%"] = 3
y[y == "Top-5%"] = 4
y[y == "Poster"] = 5



X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                    random_state=0
                                                    )
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

logreg = LogisticRegression(max_iter=50000)
logreg.fit(X_train, y_train)

def acc(x,y):
    return np.mean((np.asarray(x) == np.asarray(y)).astype(np.int32))

clf = pd.DataFrame({"Accuracy" : ["Train", "Test"],
                    "Logistic Regression" : [acc(logreg.predict(X_train),y_train),acc(logreg.predict(X_test),y_test)],
                    "Random guess" : [acc(np.random.choice(y_train,size=y_train.size),y_train),acc(np.random.choice(y_train,size=y_test.size),y_test)],
                    "Predict class Withdraw" : [acc(0,y_train),acc(0,y_test)],
                    "Predict class Reject" : [acc(1,y_train),acc(1,y_test)],
                    "Predict class Desk Reject" : [acc(2,y_train),acc(2,y_test)],
                    "Predict class Top-25%" : [acc(3,y_train),acc(3,y_test)],
                    "Predict class Top-5%" : [acc(4,y_train),acc(4,y_test)],
                    "Predict class Poster" : [acc(5,y_train),acc(5,y_test)],
                    "Random guess Reject or Poster" : [acc(np.random.choice(np.delete(y_train,(y_train == 0) | (y_train == 2) | (y_train == 3) | (y_train == 4)),size=y_train.size),y_train),acc(np.random.choice(np.delete(y_train,(y_train == 0) | (y_train == 2) | (y_train == 3) | (y_train == 4)),size=y_test.size),y_test)],})
print("Accuracies: ")
print(clf)
print("-------------------------------------------------------------------------")
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)
clf = pd.DataFrame({"Set" : ["Train set", "Train prediction", "Test set", "Test prediction"],
                    "Withdraw" : [np.sum(y_train == 0),np.sum(y_train_pred == 0),np.sum(y_test == 0),np.sum(y_test_pred == 0)],
                    "Reject" : [np.sum(y_train == 1),np.sum(y_train_pred == 1),np.sum(y_test == 1),np.sum(y_test_pred == 1)],
                    "Desk Reject" : [np.sum(y_train == 2),np.sum(y_train_pred == 2),np.sum(y_test == 2),np.sum(y_test_pred == 2)],
                    "Top-25%" : [np.sum(y_train == 3),np.sum(y_train_pred == 3),np.sum(y_test == 3),np.sum(y_test_pred == 3)],
                    "Top-5%" : [np.sum(y_train == 4),np.sum(y_train_pred == 4),np.sum(y_test == 4),np.sum(y_test_pred == 4)],
                    "Poster" : [np.sum(y_train == 5),np.sum(y_train_pred == 5),np.sum(y_test == 5),np.sum(y_test_pred == 5)]})
print("Prediction Distribution: ")
print(clf)
print("-------------------------------------------------------------------------")
print(logreg.coef_.shape)
print("Coefficients: ")
clf = pd.DataFrame(columns=["equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"],data=logreg.coef_)
clf["status"] = ["Withdraw","Reject","Desk Reject","Top-25%","Top-5%","Poster"]
clf = clf[["status","equations","mean_num_nsi","num_unique","mean_num_unique","std_unique","complexity"]]
pd.set_option('display.width',1000)
print(clf)


Accuracies: 
  Accuracy  Logistic Regression  Random guess  Predict class Withdraw  Predict class Reject  Predict class Desk Reject  Predict class Top-25%  Predict class Top-5%  Predict class Poster  Random guess Reject or Poster
0    Train             0.442623      0.281211                0.155107              0.379571                   0.002522               0.085750              0.017654              0.359395                       0.354351
1     Test             0.395894      0.275660                0.143695              0.392962                   0.008798               0.064516              0.046921              0.343109                       0.325513
-------------------------------------------------------------------------
Prediction Distribution: 
                Set  Withdraw  Reject  Desk Reject  Top-25%  Top-5%  Poster
0         Train set       123     301            2       68      14     285
1  Train prediction        38     449            0        0       1     305
2       

STOP: TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
