## Load packages

In [None]:
import numpy as np
import pylab as plt
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import colorcet
import umap
import scanpy as sc
import pickle
import time
import itertools
import warnings

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score

## Load data

In [None]:
dataset = sc.read_h5ad(DATA_DIR /"hochgerner_2018.h5ad")

In [None]:
labels = dataset.obs.labels
data = dataset.X.toarray()

In [None]:
dataset

In [None]:
labels.value_counts()

In [None]:
labels_copy = labels.copy()

In [None]:
# Encode the labels as integer class ids. The fitted encoder `le` is kept
# so that ids can be mapped back to the original names with
# `le.inverse_transform` in the figure cells.
le = LabelEncoder()
labels = le.fit_transform(labels)

## Normalization of data

In [None]:
from scipy import stats

#z-score (normalization on cell level)
data = stats.zscore(data, axis=1)

## Non cross-validated analysis

### Split into training and test set

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=.2, random_state=2022)

### Performance on unreduced dataset

#### SVM

In [None]:
%%time
svm = SVC(random_state=1)
svm.fit(x_train, y_train)
svm_raw_yhat = svm.predict(x_test)

In [None]:
svm_raw_accuracy = accuracy_score(y_test, svm_raw_yhat)
svm_raw_f1 = f1_score(y_test, svm_raw_yhat, average="macro")
(svm_raw_accuracy, svm_raw_f1)

In [None]:
with open('v2_output/svm_raw_acc.pkl', 'wb') as f:
    pickle.dump(svm_raw_accuracy, f)
    
with open('v2_output/svm_raw_f1.pkl', 'wb') as f:
    pickle.dump(svm_raw_f1, f)

In [None]:
with open('v2_output/svm_raw_yhat.pkl', 'wb') as f:
    pickle.dump(svm_raw_yhat, f)

#### MLR

In [None]:
%%time
mlr = LogisticRegression(random_state=1)
mlr.fit(x_train, y_train)
mlr_raw_yhat = mlr.predict(x_test)

In [None]:
mlr_raw_accuracy = accuracy_score(y_test, mlr_raw_yhat)
mlr_raw_f1 = f1_score(y_test, mlr_raw_yhat, average="macro")
(mlr_raw_accuracy, mlr_raw_f1)

In [None]:
with open('v2_output/mlr_raw_acc.pkl', 'wb') as f:
    pickle.dump(mlr_raw_accuracy, f)
    
with open('v2_output/mlr_raw_f1.pkl', 'wb') as f:
    pickle.dump(mlr_raw_f1, f)

In [None]:
with open('v2_output/mlr_raw_yhat.pkl', 'wb') as f:
    pickle.dump(mlr_raw_yhat, f)

#### LDA

In [None]:
%%time
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)
lda_raw_yhat = lda.predict(x_test)

In [None]:
lda_raw_accuracy = accuracy_score(y_test, lda_raw_yhat)
lda_raw_f1 = f1_score(y_test, lda_raw_yhat, average="macro")
(lda_raw_accuracy, lda_raw_f1)

In [None]:
with open('v2_output/lda_raw_acc.pkl', 'wb') as f:
    pickle.dump(lda_raw_accuracy, f)
    
with open('v2_output/lda_raw_f1.pkl', 'wb') as f:
    pickle.dump(lda_raw_f1, f)

In [None]:
with open('v2_output/lda_raw_yhat.pkl', 'wb') as f:
    pickle.dump(lda_raw_yhat, f)

#### QDA

In [None]:
%%time
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)
qda_raw_yhat = qda.predict(x_test)

In [None]:
qda_raw_accuracy = accuracy_score(y_test, qda_raw_yhat)
qda_raw_f1 = f1_score(y_test, qda_raw_yhat, average="macro")
(qda_raw_accuracy, qda_raw_f1)

In [None]:
with open('v2_output/qda_raw_acc.pkl', 'wb') as f:
    pickle.dump(qda_raw_accuracy, f)
    
with open('v2_output/qda_raw_f1.pkl', 'wb') as f:
    pickle.dump(qda_raw_f1, f)

In [None]:
with open('v2_output/qda_raw_yhat.pkl', 'wb') as f:
    pickle.dump(qda_raw_yhat, f)

#### KNN

In [None]:
%%time
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
knn_raw_yhat = knn.predict(x_test)

In [None]:
knn_raw_accuracy = accuracy_score(y_test, knn_raw_yhat)
knn_raw_f1 = f1_score(y_test, knn_raw_yhat, average="macro")
(knn_raw_accuracy, knn_raw_f1)

In [None]:
with open('v2_output/knn5_raw_acc.pkl', 'wb') as f:
    pickle.dump(knn_raw_accuracy, f)
    
with open('v2_output/knn5_raw_f1.pkl', 'wb') as f:
    pickle.dump(knn_raw_f1, f)

In [None]:
with open('v2_output/knn5_raw_yhat.pkl', 'wb') as f:
    pickle.dump(knn_raw_yhat, f)

#### RF

In [None]:
%%time
rf = RandomForestClassifier(random_state=1)
rf.fit(x_train, y_train)
rf_raw_yhat = rf.predict(x_test)

In [None]:
rf_raw_accuracy = accuracy_score(y_test, rf_raw_yhat)
rf_raw_f1 = f1_score(y_test, rf_raw_yhat, average="macro")
(rf_raw_accuracy, rf_raw_f1)

In [None]:
with open('v2_output/rf_raw_acc.pkl', 'wb') as f:
    pickle.dump(rf_raw_accuracy, f)
    
with open('v2_output/rf_raw_f1.pkl', 'wb') as f:
    pickle.dump(rf_raw_f1, f)

In [None]:
with open('v2_output/rf_raw_yhat.pkl', 'wb') as f:
    pickle.dump(rf_raw_yhat, f)

### Performance on PCA embeddings

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=40)
pca.fit(x_train)

In [None]:
train_comp = pca.transform(x_train)

In [None]:
test_comp = pca.transform(x_test)

In [None]:
pc40 = (train_comp, test_comp)

In [None]:
with open('v2_output/pc40.pkl', 'wb') as f:
    pickle.dump(pc40, f)

#### SVM/MLR/LDA/QDA

In [None]:
svm_pca_acc = []
svm_pca_f1 = []

mlr_pca_acc = []
mlr_pca_f1 = []

lda_pca_acc = []
lda_pca_f1 = []

qda_pca_acc = []
qda_pca_f1 = []

In [None]:
def compute_accuracy_pca(pc, y_train, y_test, acc, f1, classifier):
    """Fit one classifier on PCA-reduced data and record its test scores.

    Parameters
    ----------
    pc : sequence
        Pair ``(train_components, test_components)``; ``pc[0]`` is used for
        fitting and ``pc[1]`` for prediction.
    y_train, y_test : array-like
        Labels belonging to ``pc[0]`` and ``pc[1]`` respectively.
    acc, f1 : list
        Result lists, modified in place: the accuracy is appended to ``acc``
        and the macro-averaged F1 score to ``f1``.
    classifier : str
        One of ``"svm"``, ``"mlr"``, ``"lda"`` or ``"qda"``.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names. (Previously an
        unknown name fell through every branch and failed later with an
        unrelated ``UnboundLocalError``.)
    """
    # Lambdas defer construction so only the requested estimator is built.
    factories = {
        "svm": lambda: SVC(random_state=1),
        "mlr": lambda: LogisticRegression(random_state=1),
        "lda": lambda: LinearDiscriminantAnalysis(),
        "qda": lambda: QuadraticDiscriminantAnalysis(),
    }

    if classifier not in factories:
        raise ValueError(
            f"unknown classifier {classifier!r}; expected one of {sorted(factories)}"
        )

    model = factories[classifier]()
    model.fit(pc[0], y_train)
    raw_yhat = model.predict(pc[1])

    acc.append(accuracy_score(y_test, raw_yhat))
    f1.append(f1_score(y_test, raw_yhat, average="macro"))

In [None]:
%%time
compute_accuracy_pca(pc1, y_train, y_test, svm_pca_acc, svm_pca_f1, "svm")
compute_accuracy_pca(pc2, y_train, y_test, svm_pca_acc, svm_pca_f1, "svm")
compute_accuracy_pca(pc10, y_train, y_test, svm_pca_acc, svm_pca_f1, "svm")
compute_accuracy_pca(pc25, y_train, y_test, svm_pca_acc, svm_pca_f1, "svm")
compute_accuracy_pca(pc40, y_train, y_test, svm_pca_acc, svm_pca_f1, "svm")

In [None]:
with open('v2_output/svm_pca_acc.pkl', 'wb') as f:
    pickle.dump(svm_pca_acc, f)

with open('v2_output/svm_pca_f1.pkl', 'wb') as f:
    pickle.dump(svm_pca_f1, f)

In [None]:
%%time
compute_accuracy_pca(pc1, y_train, y_test, mlr_pca_acc, mlr_pca_f1, "mlr")
compute_accuracy_pca(pc2, y_train, y_test, mlr_pca_acc, mlr_pca_f1, "mlr")
compute_accuracy_pca(pc10, y_train, y_test, mlr_pca_acc, mlr_pca_f1, "mlr")
compute_accuracy_pca(pc25, y_train, y_test, mlr_pca_acc, mlr_pca_f1, "mlr")
compute_accuracy_pca(pc40, y_train, y_test, mlr_pca_acc, mlr_pca_f1, "mlr")

In [None]:
with open('v2_output/mlr_pca_acc.pkl', 'wb') as f:
    pickle.dump(mlr_pca_acc, f)

with open('v2_output/mlr_pca_f1.pkl', 'wb') as f:
    pickle.dump(mlr_pca_f1, f)

In [None]:
%%time
compute_accuracy_pca(pc1, y_train, y_test, lda_pca_acc, lda_pca_f1, "lda")
compute_accuracy_pca(pc2, y_train, y_test, lda_pca_acc, lda_pca_f1, "lda")
compute_accuracy_pca(pc10, y_train, y_test, lda_pca_acc, lda_pca_f1, "lda")
compute_accuracy_pca(pc25, y_train, y_test, lda_pca_acc, lda_pca_f1, "lda")
compute_accuracy_pca(pc40, y_train, y_test, lda_pca_acc, lda_pca_f1, "lda")

In [None]:
with open('v2_output/lda_pca_acc.pkl', 'wb') as f:
    pickle.dump(lda_pca_acc, f)

with open('v2_output/lda_pca_f1.pkl', 'wb') as f:
    pickle.dump(lda_pca_f1, f)

In [None]:
%%time
compute_accuracy_pca(pc1, y_train, y_test, qda_pca_acc, qda_pca_f1, "qda")
compute_accuracy_pca(pc2, y_train, y_test, qda_pca_acc, qda_pca_f1, "qda")
compute_accuracy_pca(pc10, y_train, y_test, qda_pca_acc, qda_pca_f1, "qda")
compute_accuracy_pca(pc25, y_train, y_test, qda_pca_acc, qda_pca_f1, "qda")
compute_accuracy_pca(pc40, y_train, y_test, qda_pca_acc, qda_pca_f1, "qda")

In [None]:
with open('v2_output/qda_pca_acc.pkl', 'wb') as f:
    pickle.dump(qda_pca_acc, f)

with open('v2_output/qda_pca_f1.pkl', 'wb') as f:
    pickle.dump(qda_pca_f1, f)

#### RF

In [None]:
rf_pca_accuracy = []
rf_pca_f1 = []

In [None]:
def compute_accuracy_pca(pc, y_train, y_test, acc, f1):
    """Score a random forest on PCA-reduced data.

    A ``RandomForestClassifier(random_state=1)`` is fitted on ``pc[0]`` with
    ``y_train`` and evaluated on ``pc[1]`` against ``y_test``. The accuracy
    is appended to ``acc`` and the macro-averaged F1 score to ``f1``; both
    lists are modified in place and nothing is returned.

    NOTE: this redefines the 6-argument ``compute_accuracy_pca`` from the
    SVM/MLR/LDA/QDA section of the notebook.
    """
    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(random_state=1)
    forest.fit(pc[0], y_train)
    predicted = forest.predict(pc[1])

    for scores, value in (
        (acc, accuracy_score(y_test, predicted)),
        (f1, f1_score(y_test, predicted, average="macro")),
    ):
        scores.append(value)

In [None]:
%%time
compute_accuracy_pca(pc1, y_train, y_test, rf_pca_accuracy, rf_pca_f1)
compute_accuracy_pca(pc2, y_train, y_test, rf_pca_accuracy, rf_pca_f1)
compute_accuracy_pca(pc10, y_train, y_test, rf_pca_accuracy, rf_pca_f1)
compute_accuracy_pca(pc25, y_train, y_test, rf_pca_accuracy, rf_pca_f1)
compute_accuracy_pca(pc40, y_train, y_test, rf_pca_accuracy, rf_pca_f1)

#### KNN

In [None]:
knn_pca_accuracy = []
knn_pca_f1 = []

In [None]:
def compute_accuracy_pca(pc, y_train, y_test, acc, f1):
    """Score a 5-nearest-neighbour classifier on PCA-reduced data.

    A ``KNeighborsClassifier(n_neighbors=5)`` is fitted on ``pc[0]`` with
    ``y_train`` and evaluated on ``pc[1]`` against ``y_test``. The accuracy
    is appended to ``acc`` and the macro-averaged F1 score to ``f1``; both
    lists are modified in place and nothing is returned.

    NOTE: this redefines the random-forest ``compute_accuracy_pca`` from the
    preceding section of the notebook.
    """
    from sklearn.neighbors import KNeighborsClassifier

    neighbours_model = KNeighborsClassifier(n_neighbors=5)
    neighbours_model.fit(pc[0], y_train)
    predicted = neighbours_model.predict(pc[1])

    for scores, value in (
        (acc, accuracy_score(y_test, predicted)),
        (f1, f1_score(y_test, predicted, average="macro")),
    ):
        scores.append(value)

In [None]:
compute_accuracy_pca(pc1, y_train, y_test, knn_pca_accuracy, knn_pca_f1)
compute_accuracy_pca(pc2, y_train, y_test, knn_pca_accuracy, knn_pca_f1)
compute_accuracy_pca(pc10, y_train, y_test, knn_pca_accuracy, knn_pca_f1)
compute_accuracy_pca(pc25, y_train, y_test, knn_pca_accuracy, knn_pca_f1)
compute_accuracy_pca(pc40, y_train, y_test, knn_pca_accuracy, knn_pca_f1)

In [None]:
with open('v2_output/rf_pca_acc.pkl', 'wb') as f:
    pickle.dump(rf_pca_accuracy, f)

In [None]:
with open('v2_output/rf_pca_f1.pkl', 'wb') as f:
    pickle.dump(rf_pca_f1, f)

# The KNN (k=5) PCA scores are computed in this section but were never
# written to disk. Persist them as well, following the `knn5_*` file naming
# used for the raw-data KNN results.
with open('v2_output/knn5_pca_acc.pkl', 'wb') as f:
    pickle.dump(knn_pca_accuracy, f)

with open('v2_output/knn5_pca_f1.pkl', 'wb') as f:
    pickle.dump(knn_pca_f1, f)

### Performance on UMAP embeddings

#### Compute embeddings

In [None]:
#compute training and test embeddings
#embeddings are stored in dict_emb

def compute_embeddings(x_train, x_test, dimensions, neigbors, seed, dict_emb):
    """Fit UMAP on the training data and store the train/test embeddings.

    A UMAP model with ``n_components=dimensions``, ``n_neighbors=neigbors``
    and ``random_state=seed`` is fitted on ``x_train``; ``x_test`` is then
    projected with the fitted model. The pair
    ``(train_embedding, test_embedding)`` is stored in ``dict_emb`` under the
    key ``"d<dimensions>n<neigbors>"`` and the key is printed as a progress
    marker. Nothing is returned.

    The seed is not part of the key, so calling this again with the same
    ``dimensions``/``neigbors`` but another seed overwrites the entry.
    """
    reducer = umap.UMAP(
        n_neighbors=neigbors,
        n_components=dimensions,
        random_state=seed,
    )
    reducer.fit(x_train)

    embedded_train = reducer.embedding_
    embedded_test = reducer.transform(x_test)

    key = f"d{dimensions}n{neigbors}"
    dict_emb[key] = (embedded_train, embedded_test)

    print(key)
    

In [None]:
import random
random.seed(2678136)

#Generate 20 random numbers and list them
seed_list = random.sample(range(10**0, 10**6), 20)

#for Hochgerner only one seed is used because it takes too long to repeat for multiple seed numbers
seed=[seed_list[0]]

In [None]:
dim=[1,2,10,25,40]
neigh=[2, 5, 15, 25, 50, 75, 100]

In [None]:
#create combinations of number of dimensions and number of neighbors 
a = [dim,neigh,seed]
test_paramters=list(itertools.product(*a))

In [None]:
test_paramters

In [None]:
#create empty dictionary to store embeddings
dict_emb = {}

In [None]:
%%time
# One UMAP fit per (dimensions, neighbors, seed) combination; results are
# collected in dict_emb.
for n_dims, n_neigh, umap_seed in test_paramters:
    compute_embeddings(x_train, x_test, n_dims, n_neigh, umap_seed, dict_emb)

In [None]:
#save the dictionary, which contains a training embedding and a test embedding for every hyperparameter combination
with open('v2_output/Hochgerner_emb_0.pkl', 'wb') as f:
    pickle.dump(dict_emb, f)

#### Compute accuracy and F1-score

##### SVM/MLR/LDA/QDA

In [None]:
#create empty dictionaries to store accuracy and F1 scores

dict_svm_acc = {}
dict_mlr_acc = {}
dict_lda_acc = {}
dict_qda_acc = {}

dict_svm_f1 = {}
dict_mlr_f1 = {}
dict_lda_f1 = {}
dict_qda_f1 = {}

In [None]:
#fit classification models and compute accuracy and f1 scores

def compute_accuracy_scores(key, train_embedding, test_embedding, y_train, y_test, 
                            svm_acc, mlr_acc, lda_acc, qda_acc, 
                            svm_f1, mlr_f1, lda_f1, qda_f1):
    """Score SVM, MLR, LDA and QDA on one embedding (assigning variant).

    Each classifier is fitted on ``train_embedding``/``y_train`` and used to
    predict ``test_embedding``. For every classifier the accuracy and the
    macro-averaged F1 score against ``y_test`` are stored under ``key`` in
    the matching ``*_acc`` / ``*_f1`` dictionary as a one-element list,
    replacing any value already stored for that key. Nothing is returned.

    NOTE: the next cell of the notebook redefines this name with a variant
    that appends to existing lists instead of assigning.
    """
    # (estimator, accuracy dict, F1 dict), in fitting order
    jobs = (
        (SVC(random_state=1), svm_acc, svm_f1),
        (LogisticRegression(random_state=1), mlr_acc, mlr_f1),
        (LinearDiscriminantAnalysis(), lda_acc, lda_f1),
        (QuadraticDiscriminantAnalysis(), qda_acc, qda_f1),
    )

    # Fit and predict with all models before any score is written.
    predicted = []
    for estimator, acc_out, f1_out in jobs:
        estimator.fit(train_embedding, y_train)
        predicted.append((estimator.predict(test_embedding), acc_out, f1_out))

    for y_hat, acc_out, f1_out in predicted:
        acc_out[key] = [accuracy_score(y_test, y_hat)]
        f1_out[key] = [f1_score(y_test, y_hat, average="macro")]


In [None]:
#fit classification models and compute accuracy and f1 scores

def compute_accuracy_scores(key, train_embedding, test_embedding, y_train, y_test, 
                            svm_acc, mlr_acc, lda_acc, qda_acc, 
                            svm_f1, mlr_f1, lda_f1, qda_f1):
    """Score SVM, MLR, LDA and QDA on one embedding (appending variant).

    Each classifier is fitted on ``train_embedding``/``y_train`` and used to
    predict ``test_embedding``. For every classifier the accuracy and the
    macro-averaged F1 score against ``y_test`` are appended to the list
    stored under ``key`` in the matching ``*_acc`` / ``*_f1`` dictionary.
    Nothing is returned.

    The list is created on first use. The original ``dict[key].append(...)``
    raised ``KeyError`` when called with empty dictionaries, which is what
    happens when the notebook is run top to bottom, because this definition
    shadows the assigning variant defined in the previous cell.
    """
    # (estimator, accuracy dict, F1 dict), in fitting order
    jobs = (
        (SVC(random_state=1), svm_acc, svm_f1),
        (LogisticRegression(random_state=1), mlr_acc, mlr_f1),
        (LinearDiscriminantAnalysis(), lda_acc, lda_f1),
        (QuadraticDiscriminantAnalysis(), qda_acc, qda_f1),
    )

    # Fit and predict with all models first, so a failing fit leaves the
    # score dictionaries untouched.
    predicted = []
    for estimator, acc_out, f1_out in jobs:
        estimator.fit(train_embedding, y_train)
        predicted.append((estimator.predict(test_embedding), acc_out, f1_out))

    for y_hat, acc_out, f1_out in predicted:
        acc_out.setdefault(key, []).append(accuracy_score(y_test, y_hat))
        f1_out.setdefault(key, []).append(
            f1_score(y_test, y_hat, average="macro")
        )


In [None]:
with open('v2_output/Hochgerner_emb_0.pkl', 'rb') as f:
    dict_emb = pickle.load(f)

In [None]:
test_parameters_accuracy = list(dict_emb.keys())
test_parameters_accuracy

In [None]:
#run the function for every hyperparameter combination
for emb_key in test_parameters_accuracy:
    # each entry is indexed as [0] = training embedding, [1] = test embedding
    emb_pair = dict_emb[emb_key]
    compute_accuracy_scores(
        emb_key, emb_pair[0], emb_pair[1],
        y_train, y_test,
        dict_svm_acc, dict_mlr_acc, dict_lda_acc, dict_qda_acc,
        dict_svm_f1, dict_mlr_f1, dict_lda_f1, dict_qda_f1,
    )


In [None]:
#save the dictionaries which store the performance scores for each hyperparameter combination

with open('v2_output/svm_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_svm_acc, f)
with open('v2_output/svm_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_svm_f1, f)
    
with open('v2_output/mlr_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_mlr_acc, f)
with open('v2_output/mlr_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_mlr_f1, f)
    
with open('v2_output/lda_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_lda_acc, f)
with open('v2_output/lda_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_lda_f1, f)
    
with open('v2_output/qda_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_qda_acc, f)
with open('v2_output/qda_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_qda_f1, f)


##### RF/KNN

In [None]:
#rf+KNN
dict_rf_acc = {}
dict_knn2_acc = {}
dict_knn5_acc = {}
dict_knn15_acc = {}
dict_knn25_acc = {}
dict_knn50_acc = {}
dict_knn75_acc = {}
dict_knn100_acc = {}

In [None]:
#rf+KNN
dict_rf_f1 = {}
dict_knn2_f1 = {}
dict_knn5_f1 = {}
dict_knn15_f1 = {}
dict_knn25_f1 = {}
dict_knn50_f1 = {}
dict_knn75_f1 = {}
dict_knn100_f1 = {}

In [None]:
#fit classification models and compute accuracy and f1 scores


def compute_accuracy_scores(key, train_embedding, test_embedding, y_train, y_test, 
                            rf_acc, knn2_acc, knn5_acc, knn15_acc, 
                            knn25_acc, knn50_acc, knn75_acc, knn100_acc,
                            rf_f1, knn2_f1, knn5_f1, knn15_f1, 
                            knn25_f1, knn50_f1, knn75_f1, knn100_f1):
    """Score a random forest and seven KNN models on one embedding (assigning variant).

    A ``RandomForestClassifier(random_state=1)`` and ``KNeighborsClassifier``
    models with k = 2, 5, 15, 25, 50, 75 and 100 are fitted on
    ``train_embedding``/``y_train`` and used to predict ``test_embedding``.
    For every model the accuracy and the macro-averaged F1 score against
    ``y_test`` are stored under ``key`` in the matching ``*_acc`` / ``*_f1``
    dictionary as a one-element list, replacing any value already stored for
    that key. Nothing is returned.

    NOTE: the next cell of the notebook redefines this name with a variant
    that appends to existing lists instead of assigning.
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier

    # (k, accuracy dict, F1 dict) for every KNN configuration, in fitting order
    knn_targets = (
        (2, knn2_acc, knn2_f1),
        (5, knn5_acc, knn5_f1),
        (15, knn15_acc, knn15_f1),
        (25, knn25_acc, knn25_f1),
        (50, knn50_acc, knn50_f1),
        (75, knn75_acc, knn75_f1),
        (100, knn100_acc, knn100_f1),
    )

    jobs = [(RandomForestClassifier(random_state=1), rf_acc, rf_f1)]
    for k, acc_out, f1_out in knn_targets:
        jobs.append((KNeighborsClassifier(n_neighbors=k), acc_out, f1_out))

    # Fit and predict with all models before any score is written.
    predicted = []
    for estimator, acc_out, f1_out in jobs:
        estimator.fit(train_embedding, y_train)
        predicted.append((estimator.predict(test_embedding), acc_out, f1_out))

    for y_hat, acc_out, f1_out in predicted:
        acc_out[key] = [accuracy_score(y_test, y_hat)]
        f1_out[key] = [f1_score(y_test, y_hat, average="macro")]



In [None]:
#fit classification models and compute accuracy and f1 scores

def compute_accuracy_scores(key, train_embedding, test_embedding, y_train, y_test, 
                            rf_acc, knn2_acc, knn5_acc, knn15_acc, 
                            knn25_acc, knn50_acc, knn75_acc, knn100_acc,
                            rf_f1, knn2_f1, knn5_f1, knn15_f1, 
                            knn25_f1, knn50_f1, knn75_f1, knn100_f1):
    """Score a random forest and seven KNN models on one embedding (appending variant).

    A ``RandomForestClassifier(random_state=1)`` and ``KNeighborsClassifier``
    models with k = 2, 5, 15, 25, 50, 75 and 100 are fitted on
    ``train_embedding``/``y_train`` and used to predict ``test_embedding``.
    For every model the accuracy and the macro-averaged F1 score against
    ``y_test`` are appended to the list stored under ``key`` in the matching
    ``*_acc`` / ``*_f1`` dictionary. Nothing is returned.

    The list is created on first use. The original ``dict[key].append(...)``
    raised ``KeyError`` when called with empty dictionaries, which is what
    happens when the notebook is run top to bottom, because this definition
    shadows the assigning variant defined in the previous cell.
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier

    # (estimator, accuracy dict, F1 dict), in fitting order
    jobs = [
        (RandomForestClassifier(random_state=1), rf_acc, rf_f1),
        (KNeighborsClassifier(n_neighbors=2), knn2_acc, knn2_f1),
        (KNeighborsClassifier(n_neighbors=5), knn5_acc, knn5_f1),
        (KNeighborsClassifier(n_neighbors=15), knn15_acc, knn15_f1),
        (KNeighborsClassifier(n_neighbors=25), knn25_acc, knn25_f1),
        (KNeighborsClassifier(n_neighbors=50), knn50_acc, knn50_f1),
        (KNeighborsClassifier(n_neighbors=75), knn75_acc, knn75_f1),
        (KNeighborsClassifier(n_neighbors=100), knn100_acc, knn100_f1),
    ]

    # Fit and predict with all models first, so a failing fit leaves the
    # score dictionaries untouched.
    predicted = []
    for estimator, acc_out, f1_out in jobs:
        estimator.fit(train_embedding, y_train)
        predicted.append((estimator.predict(test_embedding), acc_out, f1_out))

    for y_hat, acc_out, f1_out in predicted:
        acc_out.setdefault(key, []).append(accuracy_score(y_test, y_hat))
        f1_out.setdefault(key, []).append(
            f1_score(y_test, y_hat, average="macro")
        )

In [None]:
test_parameters_accuracy = list(dict_emb.keys())
test_parameters_accuracy

In [None]:
with open('v2_output/Hochgerner_emb_0.pkl', 'rb') as f:
    dict_emb = pickle.load(f)

In [None]:
for emb_key in test_parameters_accuracy:
    # each entry is indexed as [0] = training embedding, [1] = test embedding
    emb_pair = dict_emb[emb_key]
    compute_accuracy_scores(
        emb_key, emb_pair[0], emb_pair[1],
        y_train, y_test,
        dict_rf_acc, dict_knn2_acc, dict_knn5_acc, dict_knn15_acc,
        dict_knn25_acc, dict_knn50_acc, dict_knn75_acc, dict_knn100_acc,
        dict_rf_f1, dict_knn2_f1, dict_knn5_f1, dict_knn15_f1,
        dict_knn25_f1, dict_knn50_f1, dict_knn75_f1, dict_knn100_f1,
    )


In [None]:
with open('v2_output/rf_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_rf_acc, f)
with open('v2_output/rf_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_rf_f1, f)
    
with open('v2_output/knn2_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_knn2_acc, f)
with open('v2_output/knn2_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_knn2_f1, f)
    
with open('v2_output/knn5_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_knn5_acc, f)
with open('v2_output/knn5_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_knn5_f1, f)
    
with open('v2_output/knn15_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_knn15_acc, f)
with open('v2_output/knn15_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_knn15_f1, f)
    
with open('v2_output/knn25_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_knn25_acc, f)
with open('v2_output/knn25_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_knn25_f1, f)
    
with open('v2_output/knn50_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_knn50_acc, f)
with open('v2_output/knn50_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_knn50_f1, f)
    
with open('v2_output/knn75_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_knn75_acc, f)
with open('v2_output/knn75_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_knn75_f1, f)
    
with open('v2_output/knn100_umap_acc.pkl', 'wb') as f:
    pickle.dump(dict_knn100_acc, f)
with open('v2_output/knn100_umap_f1.pkl', 'wb') as f:
    pickle.dump(dict_knn100_f1, f)


### Plot results

In [None]:
def create_table(scores):
    """Arrange per-embedding scores into a dimensions-by-neighbours table.

    ``scores`` must map every key ``"d<D>n<N>"`` with D in (1, 2, 10, 25, 40)
    and N in (2, 5, 15, 25, 50, 75, 100) to a value; a missing key raises
    ``KeyError``. Returns a ``pandas.DataFrame`` with rows ``d1`` .. ``d40``
    and columns ``n2`` .. ``n100`` whose cells hold the corresponding
    values unchanged.
    """
    dims = (1, 2, 10, 25, 40)
    neighbours = (2, 5, 15, 25, 50, 75, 100)

    # one column per neighbour count, filled top to bottom in `dims` order
    columns = {
        f"n{n}": [scores[f"d{d}n{n}"] for d in dims]
        for n in neighbours
    }

    return pd.DataFrame(columns, index=[f"d{d}" for d in dims])

In [None]:
def plot(table, accuracy_raw, accuracy_pca, name, path):
    """Plot UMAP scores against the number of neighbours and save the figure.

    Parameters
    ----------
    table : pandas.DataFrame
        Read positionally: rows 0-4 are drawn as the curves ``d1``, ``d2``,
        ``d10``, ``d25`` and ``d40``. Each cell must be an iterable of
        scores; the cells of a row are flattened and plotted against the
        seven neighbour counts 2, 5, 15, 25, 50, 75, 100.
    accuracy_raw : float
        Score on the unreduced data, drawn as a grey dash-dot line.
    accuracy_pca : sequence
        Five PCA scores, drawn as dashed lines in the colours of the
        corresponding UMAP curves.
    name : str
        Unused; kept so existing calls keep working.
    path : str
        Destination passed to ``plt.savefig``.

    The y-axis label is fixed to "F1 score" regardless of which metric is
    passed in. The figure is saved but neither shown nor closed.
    """
    # The notebook imports `itertools` but never binds `chain`, so the
    # original calls to `chain.from_iterable` raised NameError.
    from itertools import chain

    t = np.array([2, 5, 15, 25, 50, 75, 100])
    colors = ['red', 'orange', 'green', 'dodgerblue', 'violet']
    umap_labels = ['d1', 'd2', 'd10', 'd25', 'd40']
    pca_labels = ['pca1', 'pca2', 'pca10', 'pca25', 'pca40']

    plt.figure(figsize=(7, 4))
    ax = plt.gca()

    # solid curves: one per UMAP dimensionality
    for row, (color, label) in enumerate(zip(colors, umap_labels)):
        ax.plot(t, list(chain.from_iterable(table.iloc[row, :])),
                color=color, label=label, lw=1.5, marker='.')

    # dashed reference lines: PCA with the same number of components
    for idx, (color, label) in enumerate(zip(colors, pca_labels)):
        plt.axhline(y=accuracy_pca[idx], color=color, linestyle='--', label=label)

    # dash-dot reference line: classifier on the unreduced data
    plt.axhline(y=accuracy_raw, color='gray', linestyle='-.', label='raw')

    plt.ylim(0, 1.01)

    plt.xlabel("Number of neigbors")
    plt.ylabel("F1 score")

    plt.savefig(path, facecolor='white', bbox_inches='tight', dpi=300)
    

In [None]:
with open('v2_output/qda_raw_f1.pkl', 'rb') as f:
    raw = pickle.load(f)

In [None]:
with open('v2_output/qda_pca_f1.pkl', 'rb') as f:
    pca = pickle.load(f)

In [None]:
with open('v2_output/qda_umap_f1.pkl', 'rb') as f:
    table_dict = pickle.load(f)

In [None]:
table_dict = create_table(table_dict)

In [None]:
plot(table_dict, raw, pca, '-', 'v2_output/qda_plot_f1_w')

## Figures

In [None]:
y_testt = le.inverse_transform(y_test)

In [None]:
with open('v2_output/pc2.pkl', 'rb') as f:
    pc = pickle.load(f)

In [None]:
with open('v2_output/Hochgerner_emb_0.pkl', 'rb') as f:
    emb = pickle.load(f)

In [None]:
labels_legend = list(labels_copy.unique())
labels_legend

In [None]:
color = ['#00B7FF', '#004DFF', '#00FFFF', '#826400', '#580041', '#FF00FF', '#00FF00', '#C500FF', '#B4FFD7',  
 '#FFCA00', '#969600', '#B4A2FF', '#C20078', '#0000C1', '#FF8B00', '#FFC8FF', '#666666', '#FF0000',  
 '#CCCCCC', '#009E8F', '#D7A870', '#8200FF', '#960000', '#BBFF00', '#FFFF00', '#006F00']

In [None]:
#color.remove('#ffff9a')

In [None]:
import random
random.seed(3)
random.shuffle(labels_legend)

In [None]:
labels_legend

In [None]:
# Pair every (shuffled) label name with the colour at the same position.
keys = labels_legend
values = color
color_dict = {}
for i, label_name in enumerate(keys):
    color_dict[label_name] = values[i]
print(color_dict)

### training vs test data

In [None]:
y_trainn = le.inverse_transform(y_train)

In [None]:
# Side-by-side scatter plots of the 2-D embedding stored under 'd2n5':
# training data on the left, test data on the right.

# create figure
fig, axes = plt.subplots(1, 2, figsize=(20, 10), sharey=False)


# add subplots
sns.scatterplot(ax=axes[0], x=emb['d2n5'][0][:,0], y=emb['d2n5'][0][:,1], hue = y_trainn, palette=color_dict)
axes[0].axis('equal')
axes[0].set_title("Embedding of training data", fontsize=45)
axes[0].tick_params(axis='both', which='major', labelsize=22)

sns.scatterplot(ax=axes[1], x=emb['d2n5'][1][:,0], y=emb['d2n5'][1][:,1], hue = y_testt, palette=color_dict)
axes[1].axis('equal')
# This panel shows the test embedding; it was previously mislabelled as
# training data.
axes[1].set_title("Embedding of test data", fontsize=45)
axes[1].tick_params(axis='both', which='major', labelsize=22)

#hide legend in subplots
for ax in axes:
    ax.legend([],[], frameon=False)


# Handles/labels for an optional shared figure legend. They are bound to
# dedicated names so that the global `labels` (the encoded class ids used by
# train_test_split) is not overwritten when this cell runs.
legend_handles, legend_labels = axes[0].get_legend_handles_labels()
#fig.legend(legend_handles, legend_labels, loc='upper center',  ncol=12, bbox_to_anchor=(0.5, -0.01))


fig.tight_layout()

plt.savefig('v2_output/prediction_plots/train_vs_test2', facecolor='white', bbox_inches='tight', dpi=100)


plt.show()

###  2 dimensions vs 3 dimensions

In [None]:
x = test_emb[:,0]
y = test_emb[:,1]
z = test_emb[:,2]

#### True labels

In [None]:
#subplots true

# create figure
fig, axes = plt.subplots(1, 3, figsize=(30, 10), sharey=False)


# add subplots
#plt.figure(figsize = (10,10) )
sns.scatterplot(ax=axes[0], x=x, y=y, hue = y_testt, palette=color_dict)
axes[0].axis('equal')
axes[0].set_title("dim 1 vs dim 2", fontsize=35)
axes[0].tick_params(axis='both', which='major', labelsize=22)

sns.scatterplot(ax=axes[1], x=y, y=z, hue = y_testt, palette=color_dict)
#axes[0].figure(figsize = (10,10) )
axes[1].axis('equal')
axes[1].set_title("dim 2 vs dim 3", fontsize=35)
axes[1].tick_params(axis='both', which='major', labelsize=22)

#plt.figure(figsize = (10,10) )
sns.scatterplot(ax=axes[2], x=x, y=z, hue = y_testt, palette=color_dict)
axes[2].axis('equal')
axes[2].set_title("dim 1 vs dim 3", fontsize=35)
axes[2].tick_params(axis='both', which='major', labelsize=22)



#hide legend in subplots
for ax in axes:
    ax.legend([],[], frameon=False)

    
# add legend
handles, labels = axes[0].get_legend_handles_labels()
#fig.legend(handles, labels, loc='lower center',  ncol=12, bbox_to_anchor=(0.5, -0.05))


fig.tight_layout()

plt.savefig('v2_output/prediction_plots/mlr_umap_d3n15_true_sub1', facecolor='white', bbox_inches='tight', dpi=None )


plt.show()

In [None]:
#3d true: 3-D scatter of the test embedding coloured by the true labels
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

x = test_emb[:,0]
y = test_emb[:,1]
z = test_emb[:,2]

df_3d = pd.DataFrame()
df_3d['x'] = x
df_3d['y'] = y
df_3d['z'] = z
df_3d['label'] = y_testt

# axes instance
fig = plt.figure(figsize=(10,10))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)


# plot
# The scatter handle is deliberately not named `sc`: that name is the scanpy
# module alias imported at the top of the notebook, and rebinding it here
# would break any later `sc.read_h5ad(...)` call in the same session.
scatter_3d = ax.scatter(x, y, z, s= 30, marker='o', alpha=1,  edgecolors = 'white', linewidths = 0.5, 
                        c=df_3d['label'].map(color_dict))
ax.set_xlabel('dim 1')
ax.set_ylabel('dim 2')
ax.set_zlabel('dim 3')



# legend
#plt.legend(*scatter_3d.legend_elements(), bbox_to_anchor=(0.1, 10))
plt.savefig('v2_output/prediction_plots/mlr_umap_d3n15_true_2', facecolor='white', bbox_inches='tight', dpi=100)

#### Predicted labels from d2n15

In [None]:
%%time
mlr_umap = LogisticRegression(random_state=1)
mlr_umap.fit(emb['d2n15'][0], y_train)
mlr_umap_yhat = mlr_umap.predict(emb['d2n15'][1])
mlr_umap_yhatt = le.inverse_transform(mlr_umap_yhat)

In [None]:
# Macro-averaged F1 of the logistic-regression predictions against y_test.
mlr_f1 = f1_score(y_true=y_test, y_pred=mlr_umap_yhat, average="macro")
mlr_f1

In [None]:
#subplots d2n15 pred

# Three pairwise 2-D projections of the coordinates x, y, z (assigned in an
# earlier cell), coloured by the decoded predictions in mlr_umap_yhatt.
fig, axes = plt.subplots(1, 3, figsize=(30, 10), sharey=False)

projections = [
    (x, y, "dim 1 vs dim 2"),
    (y, z, "dim 2 vs dim 3"),
    (x, z, "dim 1 vs dim 3"),
]
for ax, (panel_x, panel_y, panel_title) in zip(axes, projections):
    sns.scatterplot(ax=ax, x=panel_x, y=panel_y, hue=mlr_umap_yhatt, palette=color_dict)
    ax.axis('equal')
    ax.set_title(panel_title, fontsize=35)
    ax.tick_params(axis='both', which='major', labelsize=22)
    # Replace the per-panel legend with an empty, frameless one.
    ax.legend([], [], frameon=False)

# Legend entries of the first panel. They are stored as `legend_labels`
# (not `labels`) so the notebook-level `labels` array of encoded class labels
# is not overwritten when this cell runs.
handles, legend_labels = axes[0].get_legend_handles_labels()
# Optional shared legend below the panels:
#fig.legend(handles, legend_labels, loc='lower center',  ncol=7, bbox_to_anchor=(0.5, -0.05))

fig.tight_layout()

# Save through the figure object so the right figure is written even if
# another figure has become current in the meantime.
fig.savefig('v2_output/prediction_plots/mlr_umap_d3n15_pred_d2n15_sub1', facecolor='white', bbox_inches='tight', dpi=None)

plt.show()

In [None]:
#3d d2n15 pred
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

# Coordinates of the test points along the three embedding dimensions.
x = test_emb[:,0]
y = test_emb[:,1]
z = test_emb[:,2]

# Table of coordinates plus the predicted label of each test point.
df_3d = pd.DataFrame({'x': x, 'y': y, 'z': z})
df_3d['label'] = mlr_umap_yhatt

# axes instance
fig = plt.figure(figsize=(10,10))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)

# One colour per point, looked up from its predicted label.
point_colors = df_3d['label'].map(color_dict)

# plot
# The handle is named `scatter_3d` rather than `sc`: `sc` is the scanpy module
# alias imported at the top of the notebook and must not be rebound here.
scatter_3d = ax.scatter(x, y, z, s=30, marker='o', alpha=1, edgecolors='white',
                        linewidths=0.5, c=point_colors)
ax.set_xlabel('dim 1')
ax.set_ylabel('dim 2')
ax.set_zlabel('dim 3')

# legend (disabled)
#plt.legend(*scatter_3d.legend_elements(), bbox_to_anchor=(0.1, 10))
fig.savefig('v2_output/prediction_plots/mlr_umap_d3n15_pred_d2n15_2', facecolor='white', bbox_inches='tight', dpi=100)

#### Predicted labels from d3n15

In [None]:
%%time
embedder = umap.UMAP(n_components=3, n_neighbors=15, random_state=seed_list[0]).fit(x_train)
train_emb = embedder.embedding_
test_emb = embedder.transform(x_test)

In [None]:
# Cache both embeddings on disk so the UMAP fit does not have to be repeated.
for _target, _embedding in (
    ('v2_output/3-dim/train_emb_d3n15.pkl', train_emb),
    ('v2_output/3-dim/test_emb_d3n15.pkl', test_emb),
):
    with open(_target, 'wb') as f:
        pickle.dump(_embedding, f)

In [None]:
# Restore the cached training embedding written by the dump cell.
with open('v2_output/3-dim/train_emb_d3n15.pkl', mode='rb') as cache_file:
    train_emb = pickle.load(cache_file)

In [None]:
# Restore the cached test embedding written by the dump cell.
with open('v2_output/3-dim/test_emb_d3n15.pkl', mode='rb') as cache_file:
    test_emb = pickle.load(cache_file)

In [None]:
%%time
mlr_umap = LogisticRegression(random_state=1)
mlr_umap.fit(train_emb, y_train)
mlr_umap_yhat = mlr_umap.predict(test_emb)
mlr_umap_yhatt = le.inverse_transform(mlr_umap_yhat)

In [None]:
# Macro-averaged F1 of the logistic-regression predictions against y_test.
mlr_f1 = f1_score(y_true=y_test, y_pred=mlr_umap_yhat, average="macro")
mlr_f1

In [None]:
#subplots d3n15 pred

# Re-derive the plotting coordinates from the current test_emb. The classifier
# whose predictions are shown was evaluated on test_emb, and x, y, z may still
# hold values from before test_emb was recomputed or reloaded.
x = test_emb[:,0]
y = test_emb[:,1]
z = test_emb[:,2]

# Three pairwise 2-D projections of the test embedding, coloured by the
# decoded predictions in mlr_umap_yhatt.
fig, axes = plt.subplots(1, 3, figsize=(30, 10), sharey=False)

projections = [
    (x, y, "dim 1 vs dim 2"),
    (y, z, "dim 2 vs dim 3"),
    (x, z, "dim 1 vs dim 3"),
]
for ax, (panel_x, panel_y, panel_title) in zip(axes, projections):
    sns.scatterplot(ax=ax, x=panel_x, y=panel_y, hue=mlr_umap_yhatt, palette=color_dict)
    ax.axis('equal')
    ax.set_title(panel_title, fontsize=35)
    ax.tick_params(axis='both', which='major', labelsize=22)
    # Replace the per-panel legend with an empty, frameless one.
    ax.legend([], [], frameon=False)

# Legend entries of the first panel. They are stored as `legend_labels`
# (not `labels`) so the notebook-level `labels` array of encoded class labels
# is not overwritten when this cell runs.
handles, legend_labels = axes[0].get_legend_handles_labels()
# Optional shared legend below the panels:
#fig.legend(handles, legend_labels, loc='lower center',  ncol=10, bbox_to_anchor=(0.5, -0.05))

fig.tight_layout()

# Save through the figure object so the right figure is written even if
# another figure has become current in the meantime.
fig.savefig('v2_output/prediction_plots/mlr_umap_d3n15_pred_sub1', facecolor='white', bbox_inches='tight', dpi=None)

plt.show()

In [None]:
#3d d3n15 pred
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

# Coordinates of the test points along the three embedding dimensions.
x = test_emb[:,0]
y = test_emb[:,1]
z = test_emb[:,2]

# Table of coordinates plus the predicted label of each test point.
df_3d = pd.DataFrame({'x': x, 'y': y, 'z': z})
df_3d['label'] = mlr_umap_yhatt

# axes instance
fig = plt.figure(figsize=(10,10))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)

# One colour per point, looked up from its predicted label.
point_colors = df_3d['label'].map(color_dict)

# plot
# The handle is named `scatter_3d` rather than `sc`: `sc` is the scanpy module
# alias imported at the top of the notebook and must not be rebound here.
scatter_3d = ax.scatter(x, y, z, s=30, marker='o', alpha=1, edgecolors='white',
                        linewidths=0.5, c=point_colors)
ax.set_xlabel('dim 1')
ax.set_ylabel('dim 2')
ax.set_zlabel('dim 3')

# legend (disabled)
#plt.legend(*scatter_3d.legend_elements(), bbox_to_anchor=(0.1, 10))
fig.savefig('v2_output/prediction_plots/mlr_umap_d3n15_pred', facecolor='white', bbox_inches='tight', dpi=100)

### RF vs LR PCA

In [None]:
# Random forest on the PCA representation: pc[0] is fitted against y_train,
# pc[1] is the set that gets predicted.
rf_pca = RandomForestClassifier(random_state=1).fit(X=pc[0], y=y_train)
rf_pca_yhat = rf_pca.predict(X=pc[1])
# Decode the integer predictions back to the original label values.
rf_pca_yhatt = le.inverse_transform(rf_pca_yhat)

In [None]:
# Logistic regression on the PCA representation: pc[0] is fitted against
# y_train, pc[1] is the set that gets predicted.
mlr_pca = LogisticRegression(random_state=1).fit(X=pc[0], y=y_train)
mlr_pca_yhat = mlr_pca.predict(X=pc[1])
# Decode the integer predictions back to the original label values.
mlr_pca_yhatt = le.inverse_transform(mlr_pca_yhat)

In [None]:
#subplots RF vs LR PCA

# The same scatter of the first two columns of pc[1], drawn three times:
# coloured by the true labels, the random-forest predictions and the
# logistic-regression predictions.
fig, axes = plt.subplots(1, 3, figsize=(30, 10), sharey=False)

colourings = [
    (y_testt, "True labels"),
    (rf_pca_yhatt, "Predicted labels by Random Forest"),
    (mlr_pca_yhatt, "Predicted labels by Logistic Regression"),
]
for ax, (panel_hue, panel_title) in zip(axes, colourings):
    sns.scatterplot(ax=ax, x=pc[1][:,0], y=pc[1][:,1], hue=panel_hue, palette=color_dict)
    ax.axis('equal')
    ax.set_title(panel_title, fontsize=35)
    ax.tick_params(axis='both', which='major', labelsize=22)
    # Replace the per-panel legend with an empty, frameless one.
    ax.legend([], [], frameon=False)

# Legend entries of the first panel. They are stored as `legend_labels`
# (not `labels`) so the notebook-level `labels` array of encoded class labels
# is not overwritten when this cell runs.
handles, legend_labels = axes[0].get_legend_handles_labels()
# Optional shared legend:
#fig.legend(handles, legend_labels, loc='upper right', ncol=1, bbox_to_anchor=(0.94, 0.89))

fig.tight_layout()

# Save through the figure object so the right figure is written even if
# another figure has become current in the meantime.
fig.savefig('v2_output/prediction_plots/rf_mlr_pca1', facecolor='white', bbox_inches='tight', dpi=100)

plt.show()

### RF vs LR UMAP

In [None]:
%%time
rf_umap = RandomForestClassifier(random_state=1)
rf_umap.fit(emb['d2n15'][0], y_train)
rf_umap_yhat = rf_umap.predict(emb['d2n15'][1])
rf_umap_yhatt = le.inverse_transform(rf_umap_yhat)

In [None]:
# Macro-averaged F1 of the random-forest predictions against y_test.
rf_f1 = f1_score(y_true=y_test, y_pred=rf_umap_yhat, average="macro")
rf_f1

In [None]:
%%time
mlr_umap = LogisticRegression(random_state=1)
mlr_umap.fit(emb['d2n15'][0], y_train)
mlr_umap_yhat = mlr_umap.predict(emb['d2n15'][1])
mlr_umap_yhatt = le.inverse_transform(mlr_umap_yhat)

In [None]:
# Macro-averaged F1 of the logistic-regression predictions against y_test.
mlr_f1 = f1_score(y_true=y_test, y_pred=mlr_umap_yhat, average="macro")
mlr_f1

In [None]:
#subplots RF vs LR UMAP

# The same scatter of the first two columns of emb['d2n15'][1], drawn three
# times: coloured by the true labels, the random-forest predictions and the
# logistic-regression predictions.
fig, axes = plt.subplots(1, 3, figsize=(30, 10), sharey=False)

colourings = [
    (y_testt, "True labels"),
    (rf_umap_yhatt, "Predicted labels by Random Forest"),
    (mlr_umap_yhatt, "Predicted labels by Logistic Regression"),
]
for ax, (panel_hue, panel_title) in zip(axes, colourings):
    sns.scatterplot(ax=ax, x=emb['d2n15'][1][:,0], y=emb['d2n15'][1][:,1],
                    hue=panel_hue, palette=color_dict)
    ax.axis('equal')
    ax.set_title(panel_title, fontsize=35)
    ax.tick_params(axis='both', which='major', labelsize=22)
    # Replace the per-panel legend with an empty, frameless one.
    ax.legend([], [], frameon=False)

# Legend entries of the first panel. They are stored as `legend_labels`
# (not `labels`) so the notebook-level `labels` array of encoded class labels
# is not overwritten when this cell runs.
handles, legend_labels = axes[0].get_legend_handles_labels()
# Optional shared legend below the panels:
#fig.legend(handles, legend_labels, loc='lower center',  ncol=12, bbox_to_anchor=(0.5, -0.05))

fig.tight_layout()

# Save through the figure object so the right figure is written even if
# another figure has become current in the meantime.
fig.savefig('v2_output/prediction_plots/rf_mlr_umap_d2n15_1', facecolor='white', bbox_inches='tight', dpi=100)

plt.show()