In [37]:
import warnings
from pathlib import Path

# Silence warnings before the heavier third-party imports below are executed.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from ucimlrepo import fetch_ucirepo

from utils import preprocess, drop_colinear
from models import LogisticRegressionIWLS, LogisticRegressionSGD, LogisticRegressionAdam

# Threshold forwarded as the second argument of every `preprocess(df, TH)` call below.
# NOTE(review): its exact meaning is defined in utils.preprocess — confirm there.
TH = 5
# Seed used for NumPy's global RNG and for every train_test_split(random_state=SEED).
SEED = 1337

# Seed NumPy's legacy global random state once, before any data is loaded.
np.random.seed(SEED)

In [38]:
# Registry filled by the cells below: dataset name -> (X, y) pair returned by preprocess().
datasets = {}

## Small datasets (9 features each before preprocessing)

In [39]:
# Breast Cancer Wisconsin (Original), UCI id 15:
# https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)
X, y = (
    breast_cancer_wisconsin_original.data.features,
    breast_cancer_wisconsin_original.data.targets,
)
# One frame holding the feature columns followed by the target column(s).
df = pd.concat((X, y), axis="columns")
df.head()

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [40]:
# Recode the target labels 2 / 4 to 0 / 1.
# The recoded data goes into a new frame instead of overwriting df["Class"], so
# this cell can be re-run safely: mapping an already recoded column a second
# time would turn every label into NaN.
breast_cancer_df = df.assign(Class=df["Class"].map({2: 0, 4: 1}))
X, y = preprocess(breast_cancer_df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["breast_cancer_wisconsin_original"] = (X, y)

Missing data: 0.23%
(683, 5) (683,) 0 2


In [41]:
# SAheart data from a local file. The first CSV column is loaded as the index
# and then thrown away in favour of a fresh 0..n-1 index.
saheart_raw = pd.read_csv("data/SAheart.data", index_col=0)
df = saheart_raw.reset_index(drop=True)
df.head()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,1
1,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
3,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,1
4,134,13.6,3.5,27.78,Present,60,25.99,57.34,49,1


In [42]:
# Encode the only textual column ("Present" / "Absent") as 1 / 0.
# A new frame is built instead of overwriting df["famhist"], which keeps the cell
# re-runnable: mapping the already encoded column again would yield only NaN.
saheart_df = df.assign(famhist=df["famhist"].map({"Present": 1, "Absent": 0}))
X, y = preprocess(saheart_df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["SAheart"] = (X, y)

Missing data: 0.00%
(462, 4) (462,) 0 2


In [43]:
# Tic-Tac-Toe Endgame, UCI id 101:
# https://archive.ics.uci.edu/dataset/101/tic+tac+toe+endgame
tic_tac_toe_endgame = fetch_ucirepo(id=101)
X, y = tic_tac_toe_endgame.data.features, tic_tac_toe_endgame.data.targets
# One frame holding the feature columns followed by the target column(s).
df = pd.concat((X, y), axis="columns")
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [44]:
# One lookup table for every column: board squares (x / o / b) and the class
# label (positive / negative). It is deliberately kept as a single dict so the
# resulting column dtypes stay exactly as before.
symbol_codes = {"x": 2.0, "o": 1.0, "b": 0.0, "positive": 1, "negative": 0}
# Encode into a new frame rather than overwriting df column by column; the cell
# can then be re-run without the second pass mapping the numeric codes to NaN.
tic_tac_toe_df = df.apply(lambda column: column.map(symbol_codes))
X, y = preprocess(tic_tac_toe_df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["tic_tac_toe_endgame"] = (X, y)

Missing data: 0.00%
(958, 9) (958,) 0 2


## Large datasets (more than 9 features before preprocessing)

In [45]:
# Musk (Version 1), UCI id 74:
# https://archive.ics.uci.edu/dataset/74/musk+version+1
musk_version_1 = fetch_ucirepo(id=74)
X, y = musk_version_1.data.features, musk_version_1.data.targets
# One frame holding the feature columns followed by the target column(s).
df = pd.concat((X, y), axis="columns")
df.head()

Unnamed: 0,molecule_name,conformation_name,f1,f2,f3,f4,f5,f6,f7,f8,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
0,MUSK-188,188_1+1,42,-198,-109,-75,-117,11,23,-88,...,-74,-129,-120,-38,30,48,-37,6,30,1.0
1,MUSK-188,188_1+2,42,-191,-142,-65,-117,55,49,-170,...,-302,60,-120,-39,31,48,-37,5,30,1.0
2,MUSK-188,188_1+3,42,-191,-142,-75,-117,11,49,-161,...,-73,-127,-120,-38,30,48,-37,5,31,1.0
3,MUSK-188,188_1+4,42,-198,-110,-65,-117,55,23,-95,...,-302,60,-120,-39,30,48,-37,6,30,1.0
4,MUSK-190,190_1+1,42,-198,-102,-75,-117,10,24,-87,...,-73,-127,51,128,144,43,-30,14,26,1.0


In [46]:
# Remove the two identifier columns (molecule and conformation names).
# drop() returns a new frame here instead of mutating df in place, so re-running
# the cell no longer raises a KeyError for columns that are already gone.
musk_df = df.drop(columns=["molecule_name", "conformation_name"])
X, y = preprocess(musk_df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["musk_version_1"] = (X, y)

Missing data: 0.00%
(476, 29) (476,) 0 2


In [48]:
# Connectionist Bench (Sonar, Mines vs. Rocks), UCI id 151:
# https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks
connectionist_bench_sonar_mines_vs_rocks = fetch_ucirepo(id=151)
X, y = (
    connectionist_bench_sonar_mines_vs_rocks.data.features,
    connectionist_bench_sonar_mines_vs_rocks.data.targets,
)
# One frame holding the feature columns followed by the target column(s).
df = pd.concat((X, y), axis="columns")
df.head()

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,...,Attribute52,Attribute53,Attribute54,Attribute55,Attribute56,Attribute57,Attribute58,Attribute59,Attribute60,class
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [49]:
# Encode the target: "M" -> 1, "R" -> 0.
# `class` is a Python keyword, hence the dict-unpacking form of assign(). A new
# frame is built so the cell stays re-runnable; mapping the already encoded
# column a second time would turn every label into NaN.
sonar_df = df.assign(**{"class": df["class"].map({"M": 1, "R": 0})})
X, y = preprocess(sonar_df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["connectionist_bench_sonar_mines_vs_rocks"] = (X, y)

Missing data: 0.00%


(208, 14) (208,) 0 2


In [50]:
# credit-g (OpenML id 31), read from a local ARFF file:
# https://www.openml.org/search?type=data&sort=runs&status=active&qualities.NumberOfFeatures=between_10_100&qualities.NumberOfClasses=%3D_2&id=31
raw_data = loadarff("data/dataset_31_credit-g.arff")
# The first element of loadarff's result holds the records; only that part is used.
arff_records = raw_data[0]
df = pd.DataFrame(arff_records)
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,b'<0',6.0,b'critical/other existing credit',b'radio/tv',1169.0,b'no known savings',b'>=7',4.0,b'male single',b'none',...,b'real estate',67.0,b'none',b'own',2.0,b'skilled',1.0,b'yes',b'yes',b'good'
1,b'0<=X<200',48.0,b'existing paid',b'radio/tv',5951.0,b'<100',b'1<=X<4',2.0,b'female div/dep/mar',b'none',...,b'real estate',22.0,b'none',b'own',1.0,b'skilled',1.0,b'none',b'yes',b'bad'
2,b'no checking',12.0,b'critical/other existing credit',b'education',2096.0,b'<100',b'4<=X<7',2.0,b'male single',b'none',...,b'real estate',49.0,b'none',b'own',1.0,b'unskilled resident',2.0,b'none',b'yes',b'good'
3,b'<0',42.0,b'existing paid',b'furniture/equipment',7882.0,b'<100',b'4<=X<7',2.0,b'male single',b'guarantor',...,b'life insurance',45.0,b'none',b'for free',1.0,b'skilled',2.0,b'none',b'yes',b'good'
4,b'<0',24.0,b'delayed previously',b'new car',4870.0,b'<100',b'1<=X<4',3.0,b'male single',b'none',...,b'no known property',53.0,b'none',b'for free',2.0,b'skilled',2.0,b'none',b'yes',b'bad'


In [51]:
# Collect the object-typed columns first, then replace each of them with the
# integer codes pandas assigns to its categories.
object_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]
for name in object_columns:
    as_category = df[name].astype("category")
    df[name] = as_category.cat.codes
X, y = preprocess(df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["credit-g"] = (X, y)

Missing data: 0.00%
(1000, 11) (1000,) 0 2


In [52]:
# Student Performance, UCI id 320:
# https://archive.ics.uci.edu/dataset/320/student+performance
student_performance = fetch_ucirepo(id=320)
X, y = student_performance.data.features, student_performance.data.targets
# One frame holding the feature columns followed by the target column(s).
df = pd.concat((X, y), axis="columns")
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [53]:
# Build the modelling frame without touching df, so the cell is safe to re-run:
#   * the original in-place drop raised a KeyError on a second run;
#   * re-thresholding an already binarised G3 silently set every label to 0.
# G1 and G2 are removed and G3 becomes the binary target (1 when G3 >= 10).
# NOTE(review): G1/G2 are presumably dropped because they would leak G3 — confirm.
student_df = (
    df
    .drop(columns=["G1", "G2"])
    .assign(G3=lambda frame: np.where(frame["G3"] >= 10, 1, 0))
)
# Replace every non-numeric column with pandas' integer category codes. Testing
# "not numeric" rather than `dtype == "object"` also catches pandas' dedicated
# string dtype, which does not compare equal to "object".
for col in student_df.columns:
    if not pd.api.types.is_numeric_dtype(student_df[col]):
        student_df[col] = student_df[col].astype("category").cat.codes
X, y = preprocess(student_df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["student_performance"] = (X, y)

Missing data: 0.00%
(649, 18) (649,) 0 2


In [54]:
# Statlog (Image Segmentation), UCI id 147:
# https://archive.ics.uci.edu/dataset/147/statlog+image+segmentation
statlog_image_segmentation = fetch_ucirepo(id=147)
X, y = (
    statlog_image_segmentation.data.features,
    statlog_image_segmentation.data.targets,
)
# One frame holding the feature columns followed by the target column(s).
df = pd.concat((X, y), axis="columns")
df.head()

Unnamed: 0,region-centroid-col,region-centroid-row,region-pixel-count,short-line-density-5,short-line-density-2,vedge-mean,vedge-sd,hedge-mean,hedge-sd,intensity-mean,rawred-mean,rawblue-mean,rawgreen-mean,exred-mean,exblue-mean,exgreen-mean,value-mean,saturation-mean,hue-mean,class
0,218,178,9,0.111111,0.0,0.833333,0.547722,1.111109,0.544331,59.62963,52.444443,75.22222,51.22222,-21.555555,46.77778,-25.222221,75.22222,0.318996,-2.040554,6
1,113,130,9,0.0,0.0,0.277778,0.250924,0.333333,0.365148,0.888889,0.0,2.555556,0.111111,-2.666667,5.0,-2.333333,2.555556,1.0,-2.123254,3
2,202,41,9,0.0,0.0,0.944448,0.772202,1.111112,1.025597,123.03704,111.888885,139.77779,117.44444,-33.444443,50.22222,-16.777779,139.77779,0.199347,-2.299918,2
3,32,173,9,0.0,0.0,1.722222,1.781593,9.0,6.749488,43.592594,39.555557,52.88889,38.333336,-12.111111,27.88889,-15.777778,52.88889,0.266914,-1.998857,6
4,61,197,9,0.0,0.0,1.444444,1.515353,2.611111,1.925463,49.592594,44.22222,61.555557,43.0,-16.11111,35.88889,-19.777779,61.555557,0.302925,-2.022274,6


In [55]:
# Collapse the multi-class target into a binary one: labels >= 4 become 1, the rest 0.
# The result goes into a new frame (`class` is a keyword, hence the dict form of
# assign) so the cell can be re-run: thresholding the already binarised column
# again would silently set every label to 0.
image_segmentation_df = df.assign(**{"class": np.where(df["class"] >= 4, 1, 0)})
X, y = preprocess(image_segmentation_df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["statlog_image_segmentation"] = (X, y)

Missing data: 0.00%
(2310, 11) (2310,) 0 2


In [56]:
# Waveform Database Generator (Version 1), UCI id 107:
# https://archive.ics.uci.edu/dataset/107/waveform+database+generator+version+1
waveform_database_generator_version_1 = fetch_ucirepo(id=107)
X, y = (
    waveform_database_generator_version_1.data.features,
    waveform_database_generator_version_1.data.targets,
)
# One frame holding the feature columns followed by the target column(s).
df = pd.concat((X, y), axis="columns")
df.head()

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,...,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20,Attribute21,class
0,-1.23,-1.56,-1.75,-0.28,0.6,2.22,0.85,0.21,-0.2,0.89,...,2.89,7.75,4.59,3.15,5.12,3.32,1.2,0.24,-0.56,2
1,-0.69,2.43,0.61,2.08,2.3,3.25,5.52,4.55,2.97,2.22,...,1.24,1.89,1.88,-1.34,0.83,1.41,1.78,0.6,2.42,1
2,-0.12,-0.94,1.29,2.59,2.42,3.55,4.94,3.25,1.9,2.07,...,2.5,0.12,1.41,2.78,0.64,0.62,-0.01,-0.79,-0.12,0
3,0.86,0.29,2.19,-0.02,1.13,2.51,2.37,5.45,5.45,4.84,...,2.58,1.4,1.24,1.41,1.07,-1.43,2.84,-1.18,1.12,1
4,1.16,0.37,0.4,-0.59,2.66,1.0,2.69,4.06,5.34,3.53,...,4.3,1.84,1.73,0.21,-0.18,0.13,-0.21,-0.8,-0.68,1


In [57]:
# Collapse the multi-class target into a binary one: labels >= 2 become 1, the rest 0.
# The result goes into a new frame (`class` is a keyword, hence the dict form of
# assign) so the cell can be re-run: thresholding the already binarised column
# again would silently set every label to 0.
waveform_df = df.assign(**{"class": np.where(df["class"] >= 2, 1, 0)})
X, y = preprocess(waveform_df, TH)
# Sanity check: shapes, number of missing values left in X, number of distinct labels.
print(X.shape, y.shape, X.isna().sum().sum(), len(y.unique()))
datasets["waveform_database_generator_version_1"] = (X, y)

Missing data: 0.00%
(5000, 14) (5000,) 0 2


In [58]:
# Experiment 1: score the three custom logistic-regression optimisers and four
# scikit-learn baselines on every registered dataset, for several test-set sizes.

# Fractions of the data held out for testing.
splits = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

# (method label, zero-argument model factory, does fit() return (history, iterations)?)
# The custom models return that pair from fit(); the scikit-learn estimators do
# not, so their "Iterations" / "History" entries are recorded as None.
# The tree-based models receive random_state=SEED so their scores no longer
# depend on how much of NumPy's global random stream earlier code has consumed.
model_specs = [
    ("IWLS", lambda: LogisticRegressionIWLS(max_iter=500, tol=1e-3), True),
    ("SGD", lambda: LogisticRegressionSGD(learning_rate=0.01, max_iter=500, tol=1e-3), True),
    ("Adam", lambda: LogisticRegressionAdam(learning_rate=0.01, max_iter=500, tol=1e-3), True),
    ("LDA", LinearDiscriminantAnalysis, False),
    ("QDA", QuadraticDiscriminantAnalysis, False),
    ("DecisionTree", lambda: DecisionTreeClassifier(random_state=SEED), False),
    ("RandomForest", lambda: RandomForestClassifier(random_state=SEED), False),
]

# One record per (dataset, method, split): [key, method, split, score, iterations, history].
results = []
for key in tqdm(list(datasets.keys())):
    X, y = datasets[key]
    for split in splits:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=SEED)

        # Fit the scaler on the training part only, then apply it to both parts.
        scaler = StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train))
        X_test = pd.DataFrame(scaler.transform(X_test))

        for method, make_model, returns_history in model_specs:
            model = make_model()
            fit_result = model.fit(X_train, y_train)
            hist, it = fit_result if returns_history else (None, None)
            preds = model.predict(X_test)
            acc = balanced_accuracy_score(y_test, preds)
            results.append([key, method, split, acc, it, hist])

  0%|          | 0/9 [00:00<?, ?it/s]

100%|██████████| 9/9 [01:46<00:00, 11.85s/it]


In [59]:
# Turn the list of per-run records into a table, one row per (dataset, method, split).
result_columns = ["Dataset", "Method", "Split", "Accuracy", "Iterations", "History"]
results = pd.DataFrame(data=results, columns=result_columns)
results

Unnamed: 0,Dataset,Method,Split,Accuracy,Iterations,History
0,breast_cancer_wisconsin_original,IWLS,0.1,0.945652,8.0,"[0.6931471805599453, 0.2166210628934026, 0.128..."
1,breast_cancer_wisconsin_original,SGD,0.1,0.945652,86.0,"[0.6931471805599453, 0.15543621681892533, 0.11..."
2,breast_cancer_wisconsin_original,Adam,0.1,0.945652,500.0,"[0.6931471805599453, 0.6757739686550137, 0.658..."
3,breast_cancer_wisconsin_original,LDA,0.1,0.945652,,
4,breast_cancer_wisconsin_original,QDA,0.1,0.934783,,
...,...,...,...,...,...,...
562,waveform_database_generator_version_1,Adam,0.5,0.853866,500.0,"[0.6931471805599454, 0.6712344131307395, 0.650..."
563,waveform_database_generator_version_1,LDA,0.5,0.871977,,
564,waveform_database_generator_version_1,QDA,0.5,0.858522,,
565,waveform_database_generator_version_1,DecisionTree,0.5,0.784820,,


In [60]:
# Persist the results of the first experiment. The target folder is created
# first because DataFrame.to_csv does not create missing parent directories and
# would otherwise fail only after the long experiment has finished.
results_csv = Path("../results/results.csv")
results_csv.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(results_csv, index=False)

In [61]:
# Experiment 2: on the first three registered datasets, compare each custom
# optimiser with and without `intersections=True`.
# NOTE(review): `intersections` presumably enables interaction terms — confirm in models.py.

# Fractions of the data held out for testing.
splits = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

# (label, model class, optimiser-specific keyword arguments)
optimizers = [
    ("IWLS", LogisticRegressionIWLS, {}),
    ("SGD", LogisticRegressionSGD, {"learning_rate": 0.01}),
    ("Adam", LogisticRegressionAdam, {"learning_rate": 0.01}),
]
# (label suffix, extra keyword arguments). The plain variant passes nothing
# extra, so the model's own default for `intersections` applies.
variants = [
    ("", {}),
    ("+INT", {"intersections": True}),
]

# One record per (dataset, method, split): [key, method, split, score, iterations, history].
results = []
# The slice relies on the insertion order of `datasets` (first three entries).
for key in tqdm(list(datasets.keys())[:3]):
    X, y = datasets[key]
    for split in splits:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=SEED)

        # A fresh scaler per split: previously this cell reused a `scaler` left
        # behind by an earlier cell's loop and failed when that cell had not run.
        scaler = StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train))
        X_test = pd.DataFrame(scaler.transform(X_test))

        # All plain variants first, then all "+INT" variants (same order as before).
        for suffix, variant_kwargs in variants:
            for label, model_cls, optimizer_kwargs in optimizers:
                model = model_cls(max_iter=500, tol=1e-3, **optimizer_kwargs, **variant_kwargs)
                hist, it = model.fit(X_train, y_train)
                preds = model.predict(X_test)
                # NOTE(review): plain accuracy here, whereas the first experiment
                # stores balanced_accuracy_score under the same "Accuracy" column.
                acc = (preds == y_test).mean()
                results.append([key, label + suffix, split, acc, it, hist])

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:46<00:00, 15.36s/it]


In [62]:
# Turn the list of per-run records into a table, one row per (dataset, method, split).
result_columns = ["Dataset", "Method", "Split", "Accuracy", "Iterations", "History"]
results = pd.DataFrame(data=results, columns=result_columns)
results

Unnamed: 0,Dataset,Method,Split,Accuracy,Iterations,History
0,breast_cancer_wisconsin_original,IWLS,0.1,0.956522,8,"[0.6931471805599453, 0.2166210628934026, 0.128..."
1,breast_cancer_wisconsin_original,SGD,0.1,0.956522,86,"[0.6931471805599453, 0.1535567473141377, 0.116..."
2,breast_cancer_wisconsin_original,Adam,0.1,0.956522,500,"[0.6931471805599453, 0.6757739686550137, 0.658..."
3,breast_cancer_wisconsin_original,IWLS+INT,0.1,0.956522,10,"[0.6931471805599453, 0.19229651468922895, 0.11..."
4,breast_cancer_wisconsin_original,SGD+INT,0.1,0.956522,389,"[0.6931471805599453, 0.14974109442423242, 0.11..."
...,...,...,...,...,...,...
157,tic_tac_toe_endgame,SGD,0.5,0.686848,9,"[0.6931471805599453, 0.6352383521771281, 0.627..."
158,tic_tac_toe_endgame,Adam,0.5,0.686848,98,"[0.6931471805599453, 0.6888184746659177, 0.684..."
159,tic_tac_toe_endgame,IWLS+INT,0.5,0.776618,6,"[0.6931471805599453, 0.46071362510733394, 0.42..."
160,tic_tac_toe_endgame,SGD+INT,0.5,0.770355,61,"[0.6931471805599453, 0.498939962042412, 0.4639..."


In [63]:
# Persist the results of the second experiment. The target folder is created
# first because DataFrame.to_csv does not create missing parent directories.
results2_csv = Path("../results/results2.csv")
results2_csv.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(results2_csv, index=False)