In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Adam import AdamOptim
from SGD import SGD
from utils import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from Logreg import LogisticRegression
from irls_optimizer import IRLS
from sklearn.preprocessing import LabelEncoder
np.seterr(divide = 'ignore') 
np.seterr(invalid='ignore')
np.seterr(over ='ignore')
from sklearn.preprocessing import StandardScaler
from scipy.io import arff

# Biodeg

## Balanced accuracy

In [2]:
# Read the biodeg ARFF file. loadarff returns a (records, metadata) pair;
# only the records (element 0) are turned into a DataFrame.
arff_file = arff.loadarff('data/big/biodeg.arff')
df = pd.DataFrame(data=arff_file[0])

FileNotFoundError: [Errno 2] No such file or directory: 'data/big/biodeg.arff'

In [None]:
# Features: every column except the last one.
X = df.iloc[:, :-1].values
# Target: the last column cast to int and shifted down by one
# (presumably the raw labels are 1/2 and become 0/1 -- confirm against the data file).
y = df.iloc[:, -1].astype(int).values - 1

In [None]:
# Balanced accuracy of the Adam, SGD and IRLS fits on biodeg, one score per split seed.
splitting_seeds = [42, 43, 44, 45, 46]

balancedAdam, balancedSGD, balancedIRLS = [], [], []

for seed in splitting_seeds:
    # Hold out 20% for testing, then 20% of the remainder for validation.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)
    Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=seed)

    # Scaling statistics are estimated on the training part only.
    scaler = StandardScaler().fit(Xtrain)
    Xtrain, Xval, Xtest = (scaler.transform(part) for part in (Xtrain, Xval, Xtest))

    n_train, n_features = Xtrain.shape
    logAdam = LogisticRegression(input_dim=n_features)
    logSGD = LogisticRegression(input_dim=n_features)
    logIRLS = LogisticRegression(input_dim=n_features)

    # Keyword arguments common to all three training runs.
    shared = dict(epochs=500, X_val=Xval, y_val=yval, patience=5)
    logAdam.train(Xtrain, ytrain, optimizer=AdamOptim(eta=0.01), batch_size=32, **shared)
    logSGD.train(Xtrain, ytrain, optimizer=SGD(eta=0.01), batch_size=32, **shared)
    # IRLS is given the whole training set as a single batch.
    logIRLS.train(Xtrain, ytrain, optimizer=IRLS(), batch_size=n_train, **shared)

    predAdam = logAdam.predict(Xtest)
    predSGD = logSGD.predict(Xtest)
    predIRLS = logIRLS.predict(Xtest)

    # Outputs of predict() are rounded before being scored against the test labels.
    for scores, pred in ((balancedAdam, predAdam), (balancedSGD, predSGD), (balancedIRLS, predIRLS)):
        scores.append(balanced_accuracy_score(ytest, pred.round()))

biodegBalanced = dict(adam=balancedAdam, sgd=balancedSGD, irls=balancedIRLS)

In [None]:
# Show the per-seed balanced-accuracy lists collected for biodeg (keys: adam, sgd, irls).
biodegBalanced

## Convergence

In [None]:
# One dedicated split (seed 2) of the biodeg data for the convergence curves.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=2)
Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=2)

# Only the training and validation parts are standardised here; Xtest is left as split.
scaler = StandardScaler().fit(Xtrain)
Xtrain, Xval = scaler.transform(Xtrain), scaler.transform(Xval)

n_train, n_features = Xtrain.shape
logAdam = LogisticRegression(input_dim=n_features)
logSGD = LogisticRegression(input_dim=n_features)
logIRLS = LogisticRegression(input_dim=n_features)

shared = dict(epochs=500, X_val=Xval, y_val=yval, patience=5)
logAdam.train(Xtrain, ytrain, optimizer=AdamOptim(eta=0.01), batch_size=32, **shared)
logSGD.train(Xtrain, ytrain, optimizer=SGD(eta=0.01), batch_size=32, **shared)
# IRLS is given the whole training set as a single batch.
logIRLS.train(Xtrain, ytrain, optimizer=IRLS(), batch_size=n_train, **shared)

# Element 4 of get_params() is what gets stored per optimizer
# (presumably the recorded loss history -- confirm in Logreg.py).
biodegConvergence = {
    name: model.get_params()[4]
    for name, model in (("adam", logAdam), ("sgd", logSGD), ("irls", logIRLS))
}

In [None]:
# Loss plot for the Adam-trained model from the biodeg convergence run.
logAdam.plot_loss()

In [None]:
# Loss plot for the SGD-trained model from the biodeg convergence run.
logSGD.plot_loss()

In [None]:
# Loss plot for the IRLS-trained model from the biodeg convergence run.
logIRLS.plot_loss()

In [None]:
# Write one array per optimizer under results/loss/<optimizer>/
# (np.save appends the .npy extension to the given name).
for optimizer_name, recorded in biodegConvergence.items():
    target = f"results/loss/{optimizer_name}/biodeg"
    np.save(target, np.array(recorded))

## Comparison with other models

In [None]:
# Balanced accuracy of the comparison models on biodeg, one score per split seed.
# The logistic-regression scores are collected separately in biodegBalanced.
splitting_seeds = [42, 43, 44, 45, 46]

compLDA = []
compQDA = []
compTree = []
compForest = []

for seed in splitting_seeds:
    # No validation set is carved out here: 80% train, 20% test.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Scaling statistics are estimated on the training part only.
    scaler = StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    # fitComparisonModels is not defined in this notebook (presumably it comes from
    # utils); its four return values are scored below as test-set predictions.
    lda, qda, tree, forest = fitComparisonModels(Xtrain, ytrain, Xtest)

    # balanced_accuracy_score expects (y_true, y_pred). The metric averages recall over
    # the classes of y_true, so it is not symmetric and the true labels must come first.
    compLDA.append(balanced_accuracy_score(ytest, lda))
    compQDA.append(balanced_accuracy_score(ytest, qda))
    compTree.append(balanced_accuracy_score(ytest, tree))
    compForest.append(balanced_accuracy_score(ytest, forest))


biodegComp = {"lda": compLDA,
              "qda": compQDA,
              "dt": compTree,
              "rf": compForest}
    


In [None]:
# Show the per-seed balanced-accuracy lists of the comparison models on biodeg.
biodegComp

# Parkinson

## Balanced accuracy

In [None]:
parkinson = pd.read_csv("data/big/parkinsons.csv")
# Features: every column except 'name' and the target column 'status'.
X = parkinson.drop(columns=['name', 'status']).values
# Target: the 'status' column.
y = parkinson["status"].values

In [None]:
# Balanced accuracy of the Adam, SGD and IRLS fits on parkinson, one score per split seed.
splitting_seeds = [42, 43, 44, 45, 46]

balancedAdam, balancedSGD, balancedIRLS = [], [], []

for seed in splitting_seeds:
    # Hold out 20% for testing, then 20% of the remainder for validation.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)
    Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=seed)

    # Scaling statistics are estimated on the training part only.
    scaler = StandardScaler().fit(Xtrain)
    Xtrain, Xval, Xtest = (scaler.transform(part) for part in (Xtrain, Xval, Xtest))

    n_train, n_features = Xtrain.shape
    logAdam = LogisticRegression(input_dim=n_features)
    logSGD = LogisticRegression(input_dim=n_features)
    logIRLS = LogisticRegression(input_dim=n_features)

    # Keyword arguments common to all three training runs.
    shared = dict(epochs=500, X_val=Xval, y_val=yval, patience=5)
    logAdam.train(Xtrain, ytrain, optimizer=AdamOptim(eta=0.01), batch_size=32, **shared)
    logSGD.train(Xtrain, ytrain, optimizer=SGD(eta=0.01), batch_size=32, **shared)
    # IRLS is given the whole training set as a single batch.
    logIRLS.train(Xtrain, ytrain, optimizer=IRLS(), batch_size=n_train, **shared)

    predAdam = logAdam.predict(Xtest)
    predSGD = logSGD.predict(Xtest)
    predIRLS = logIRLS.predict(Xtest)

    # Outputs of predict() are rounded before being scored against the test labels.
    for scores, pred in ((balancedAdam, predAdam), (balancedSGD, predSGD), (balancedIRLS, predIRLS)):
        scores.append(balanced_accuracy_score(ytest, pred.round()))

parkinsonBalanced = dict(adam=balancedAdam, sgd=balancedSGD, irls=balancedIRLS)

In [None]:
# Show the per-seed balanced-accuracy lists collected for parkinson (keys: adam, sgd, irls).
# The dictionary is built at the end of the preceding cell; rebuilding it here was a
# copy-paste leftover that produced no output.
parkinsonBalanced

## Convergence

In [None]:
# Dedicated split (seed 1) of the parkinson data for the convergence curves.
# Split the raw X, y rather than whatever Xtrain/ytrain an earlier cell left behind:
# those leftovers are already standardised and already reduced by a validation split,
# and re-running this cell would shrink them further each time.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)
Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=1)

# Scaling statistics are estimated on the training part only.
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xval = scaler.transform(Xval)

logAdam = LogisticRegression(input_dim=Xtrain.shape[1])
logSGD = LogisticRegression(input_dim=Xtrain.shape[1])
logIRLS = LogisticRegression(input_dim=Xtrain.shape[1])
logAdam.train(Xtrain, ytrain, optimizer=AdamOptim(eta=0.01), epochs=500, batch_size=32, X_val=Xval, y_val=yval, patience=5)
logSGD.train(Xtrain, ytrain, optimizer=SGD(eta=0.01), epochs=500, batch_size=32, X_val=Xval, y_val=yval, patience=5)
# IRLS is given the whole training set as a single batch.
logIRLS.train(Xtrain, ytrain, optimizer=IRLS(), epochs=500, batch_size=Xtrain.shape[0], X_val=Xval, y_val=yval, patience=5)

In [None]:
# Loss plot for the Adam-trained model from the parkinson convergence run.
logAdam.plot_loss()

In [None]:
# Loss plot for the SGD-trained model from the parkinson convergence run.
logSGD.plot_loss()

In [None]:
# Loss plot for the IRLS-trained model from the parkinson convergence run.
logIRLS.plot_loss()

In [None]:
# Element 4 of get_params() is what gets stored per optimizer
# (presumably the recorded loss history -- confirm in Logreg.py).
parkinsonConvergence = {
    name: model.get_params()[4]
    for name, model in (("adam", logAdam), ("sgd", logSGD), ("irls", logIRLS))
}

In [None]:
# Write one array per optimizer under results/loss/<optimizer>/
# (np.save appends the .npy extension to the given name).
for optimizer_name, recorded in parkinsonConvergence.items():
    target = f"results/loss/{optimizer_name}/parkinson"
    np.save(target, np.array(recorded))

## Comparison with other models

In [None]:
# Balanced accuracy of the comparison models on parkinson, one score per split seed.
# The logistic-regression scores are collected separately in parkinsonBalanced.
splitting_seeds = [42, 43, 44, 45, 46]

compLDA = []
compQDA = []
compTree = []
compForest = []

for seed in splitting_seeds:
    # No validation set is carved out here: 80% train, 20% test.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Scaling statistics are estimated on the training part only.
    scaler = StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    # fitComparisonModels is not defined in this notebook (presumably it comes from
    # utils); its four return values are scored below as test-set predictions.
    lda, qda, tree, forest = fitComparisonModels(Xtrain, ytrain, Xtest)

    # balanced_accuracy_score expects (y_true, y_pred). The metric averages recall over
    # the classes of y_true, so it is not symmetric and the true labels must come first.
    compLDA.append(balanced_accuracy_score(ytest, lda))
    compQDA.append(balanced_accuracy_score(ytest, qda))
    compTree.append(balanced_accuracy_score(ytest, tree))
    compForest.append(balanced_accuracy_score(ytest, forest))


parkinsonComp = {"lda": compLDA,
                 "qda": compQDA,
                 "dt": compTree,
                 "rf": compForest}
    


In [None]:
# Show the per-seed balanced-accuracy lists of the comparison models on parkinson.
parkinsonComp

# Diabetes

## Balanced accuracy

In [None]:
diabetes = pd.read_csv("data/small/diabetes.csv")
# Features: every column except the last one
# (this relies on "Outcome" being the final column of the CSV -- confirm).
X = diabetes.iloc[:, :-1].values
# Target: the "Outcome" column.
y = diabetes["Outcome"].values

In [None]:
# Balanced accuracy of the Adam, SGD and IRLS fits on diabetes, one score per split seed.
splitting_seeds = [42, 43, 44, 45, 46]

balancedAdam, balancedSGD, balancedIRLS = [], [], []

for seed in splitting_seeds:
    # Hold out 20% for testing, then 20% of the remainder for validation.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)
    Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=seed)

    # Scaling statistics are estimated on the training part only.
    scaler = StandardScaler().fit(Xtrain)
    Xtrain, Xval, Xtest = (scaler.transform(part) for part in (Xtrain, Xval, Xtest))

    n_train, n_features = Xtrain.shape
    logAdam = LogisticRegression(input_dim=n_features)
    logSGD = LogisticRegression(input_dim=n_features)
    logIRLS = LogisticRegression(input_dim=n_features)

    # Keyword arguments common to all three training runs.
    shared = dict(epochs=500, X_val=Xval, y_val=yval, patience=5)
    logAdam.train(Xtrain, ytrain, optimizer=AdamOptim(eta=0.01), batch_size=32, **shared)
    logSGD.train(Xtrain, ytrain, optimizer=SGD(eta=0.01), batch_size=32, **shared)
    # IRLS is given the whole training set as a single batch.
    logIRLS.train(Xtrain, ytrain, optimizer=IRLS(), batch_size=n_train, **shared)

    predAdam = logAdam.predict(Xtest)
    predSGD = logSGD.predict(Xtest)
    predIRLS = logIRLS.predict(Xtest)

    # Outputs of predict() are rounded before being scored against the test labels.
    for scores, pred in ((balancedAdam, predAdam), (balancedSGD, predSGD), (balancedIRLS, predIRLS)):
        scores.append(balanced_accuracy_score(ytest, pred.round()))

diabetesBalanced = dict(adam=balancedAdam, sgd=balancedSGD, irls=balancedIRLS)

In [None]:
# Print the per-seed balanced-accuracy lists, one optimizer per line.
for label, scores in (("adam:", balancedAdam), ("sgd:", balancedSGD), ("IRLS:", balancedIRLS)):
    print(label, scores)

## Convergence

In [None]:
# Dedicated split (seed 1) of the diabetes data for the convergence curves.
# Split the raw X, y rather than whatever Xtrain/ytrain an earlier cell left behind:
# those leftovers are already standardised and already reduced by a validation split,
# and re-running this cell would shrink them further each time.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)
Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=1)

# Scaling statistics are estimated on the training part only.
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xval = scaler.transform(Xval)

logAdam = LogisticRegression(input_dim=Xtrain.shape[1])
logSGD = LogisticRegression(input_dim=Xtrain.shape[1])
logIRLS = LogisticRegression(input_dim=Xtrain.shape[1])
logAdam.train(Xtrain, ytrain, optimizer=AdamOptim(eta=0.01), epochs=500, batch_size=32, X_val=Xval, y_val=yval, patience=5)
logSGD.train(Xtrain, ytrain, optimizer=SGD(eta=0.01), epochs=500, batch_size=32, X_val=Xval, y_val=yval, patience=5)
# IRLS is given the whole training set as a single batch.
logIRLS.train(Xtrain, ytrain, optimizer=IRLS(), epochs=500, batch_size=Xtrain.shape[0], X_val=Xval, y_val=yval, patience=5)

In [None]:
# Loss plot for the Adam-trained model from the diabetes convergence run.
logAdam.plot_loss()

In [None]:
# Loss plot for the SGD-trained model from the diabetes convergence run.
logSGD.plot_loss()

In [None]:
# Loss plot for the IRLS-trained model from the diabetes convergence run.
logIRLS.plot_loss()

In [None]:
# Element 4 of get_params() is what gets stored per optimizer
# (presumably the recorded loss history -- confirm in Logreg.py).
diabetesConvergence = {
    name: model.get_params()[4]
    for name, model in (("adam", logAdam), ("sgd", logSGD), ("irls", logIRLS))
}

In [None]:
# Write one array per optimizer under results/loss/<optimizer>/
# (np.save appends the .npy extension to the given name).
for optimizer_name, recorded in diabetesConvergence.items():
    target = f"results/loss/{optimizer_name}/diabetes"
    np.save(target, np.array(recorded))

## Comparison with other models

In [None]:
# Balanced accuracy of the comparison models on diabetes, one score per split seed.
# The logistic-regression scores are collected separately in diabetesBalanced.
splitting_seeds = [42, 43, 44, 45, 46]

compLDA = []
compQDA = []
compTree = []
compForest = []

for seed in splitting_seeds:
    # No validation set is carved out here: 80% train, 20% test.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Scaling statistics are estimated on the training part only.
    scaler = StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    # fitComparisonModels is not defined in this notebook (presumably it comes from
    # utils); its four return values are scored below as test-set predictions.
    lda, qda, tree, forest = fitComparisonModels(Xtrain, ytrain, Xtest)

    # balanced_accuracy_score expects (y_true, y_pred). The metric averages recall over
    # the classes of y_true, so it is not symmetric and the true labels must come first.
    compLDA.append(balanced_accuracy_score(ytest, lda))
    compQDA.append(balanced_accuracy_score(ytest, qda))
    compTree.append(balanced_accuracy_score(ytest, tree))
    compForest.append(balanced_accuracy_score(ytest, forest))


diabetesComp = {"lda": compLDA,
                "qda": compQDA,
                "dt": compTree,
                "rf": compForest}
    


In [None]:
# Show the per-seed balanced-accuracy lists of the comparison models on diabetes.
diabetesComp

## Interactions

In [None]:
# Feature matrix used for the interaction experiment below. X is the matrix assigned
# most recently above (the diabetes features when the notebook is run top to bottom).
# createFeatureInteractions is not defined in this notebook (presumably it comes from
# utils); what exactly it adds to X cannot be seen from here.
Xint = createFeatureInteractions(X)

In [None]:
# Balanced accuracy of the Adam, SGD and IRLS fits on the interaction features (Xint),
# one score per split seed.
splitting_seeds = [42, 43, 44, 45, 46]

intAdam, intSGD, intIRLS = [], [], []

for seed in splitting_seeds:
    # Hold out 20% for testing, then 20% of the remainder for validation.
    Xtrain, Xtest, ytrain, ytest = train_test_split(Xint, y, test_size=0.2, random_state=seed)
    Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=seed)

    # Scaling statistics are estimated on the training part only.
    scaler = StandardScaler().fit(Xtrain)
    Xtrain, Xval, Xtest = (scaler.transform(part) for part in (Xtrain, Xval, Xtest))

    n_train, n_features = Xtrain.shape
    logAdam = LogisticRegression(input_dim=n_features)
    logSGD = LogisticRegression(input_dim=n_features)
    logIRLS = LogisticRegression(input_dim=n_features)

    # Keyword arguments common to all three training runs.
    shared = dict(epochs=500, X_val=Xval, y_val=yval, patience=5)
    logAdam.train(Xtrain, ytrain, optimizer=AdamOptim(eta=0.01), batch_size=32, **shared)
    logSGD.train(Xtrain, ytrain, optimizer=SGD(eta=0.01), batch_size=32, **shared)
    # IRLS is given the whole training set as a single batch.
    logIRLS.train(Xtrain, ytrain, optimizer=IRLS(), batch_size=n_train, **shared)

    predAdam = logAdam.predict(Xtest)
    predSGD = logSGD.predict(Xtest)
    predIRLS = logIRLS.predict(Xtest)

    # Outputs of predict() are rounded before being scored against the test labels.
    for scores, pred in ((intAdam, predAdam), (intSGD, predSGD), (intIRLS, predIRLS)):
        scores.append(balanced_accuracy_score(ytest, pred.round()))

diabetesInt = dict(adam=intAdam, sgd=intSGD, irls=intIRLS)

In [None]:
# Show the per-seed balanced-accuracy lists obtained with the interaction features.
diabetesInt

In [None]:
# No separate convergence run exists for the interaction features: the three models read
# here are whatever logAdam/logSGD/logIRLS currently hold, i.e. the fits from the last
# iteration of the interaction loop when the notebook is run top to bottom.
# Element 4 of get_params() is stored per optimizer
# (presumably the recorded loss history -- confirm in Logreg.py).
diabetesIntConvergence = {
    name: model.get_params()[4]
    for name, model in (("adam", logAdam), ("sgd", logSGD), ("irls", logIRLS))
}

In [None]:
# Write one array per optimizer under results/loss/<optimizer>/
# (np.save appends the .npy extension to the given name).
for optimizer_name, recorded in diabetesIntConvergence.items():
    target = f"results/loss/{optimizer_name}/diabetesInt"
    np.save(target, np.array(recorded))

# Wrapping up the results

In [None]:
# Empty results table: two identifying columns followed by one accuracy column per split seed.
seed_columns = [f"seed_{seed}_acc" for seed in (42, 43, 44, 45, 46)]
balancedAccuracy = pd.DataFrame(columns=["data_set", "optimizer", *seed_columns])

In [None]:
# Append one row per (data set, optimizer) holding the five per-seed scores.
balanced = dict(biodeg=biodegBalanced, parkinson=parkinsonBalanced, diabetes=diabetesBalanced)
for dataSetName, per_optimizer in balanced.items():
    for opt, acc in per_optimizer.items():
        # The current row count serves as the next free integer label.
        next_row = len(balancedAccuracy)
        balancedAccuracy.loc[next_row] = [f"{dataSetName}_data", opt, *acc]

In [None]:
# Append one row per (data set, comparison model) holding the five per-seed scores.
comp = dict(biodeg=biodegComp, parkinson=parkinsonComp, diabetes=diabetesComp)
for dataSetName, per_model in comp.items():
    for opt, acc in per_model.items():
        # The current row count serves as the next free integer label.
        next_row = len(balancedAccuracy)
        balancedAccuracy.loc[next_row] = [f"{dataSetName}_data", opt, *acc]

In [None]:
# Append the interaction-feature scores; the optimizer name gets a "+int" suffix.
for opt, acc in diabetesInt.items():
    next_row = len(balancedAccuracy)
    balancedAccuracy.loc[next_row] = ["diabetes_data", f"{opt}+int", *acc]

In [None]:
# Show the assembled results table.
balancedAccuracy

In [None]:
# Write the results table to disk without the integer row index.
balancedAccuracy.to_csv("results/balancedAccuracy.csv", index=False)