### Normalization

In [None]:
from os import listdir
from os.path import isfile, join

data_fn = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 100/Data/"
onlyfiles = [f for f in listdir(data_fn) if isfile(join(data_fn, f))]

In [None]:
import torch
from torchvision.datasets.folder import pil_loader
import torchvision.transforms as transforms
from tqdm.notebook import tqdm

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
transform = transforms.ToTensor()

# Running per-channel totals over every pixel of every image:
# `sum` holds the sum of values, `sq_sum` the sum of squared values and
# `count` the number of pixels per channel seen so far.
# NOTE(review): `sum` shadows the builtin; the name is kept because the next
# cell reads it.
sum = torch.zeros(3, device=device)
sq_sum = torch.zeros(3, device=device)
count = 0
for fn in tqdm(onlyfiles):
    image = transform(pil_loader(data_fn + fn)).to(device)
    # image is (channels, height, width); reduce over the two spatial axes.
    height, width = image.shape[1:]
    sum += image.sum(dim=(1, 2))
    sq_sum += image.pow(2).sum(dim=(1, 2))
    count += height * width

In [None]:
means = sum/count
stds = torch.sqrt(sq_sum/count - means**2)

In [None]:
means, stds

### Δεδομένα στο Drive.

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

### Dataset

* transforms
* τρόπος που φορτώνονται τα δεδομένα
* augmentation

In [None]:
import torch.utils.data
import torchvision.transforms as transforms
from torchvision.datasets.folder import pil_loader

class Artists(torch.utils.data.Dataset):
    """Image-classification dataset backed by a CSV index file.

    The index file is read once at construction time. Its first line is
    treated as a header and skipped; every following non-blank line must start
    with ``<file name>,<integer class id>``.

    Args:
        base_path: Prefix prepended (by plain string concatenation) to both
            ``image_ids_fn`` and ``images_dir``, so it is expected to end with
            a path separator.
        image_ids_fn: Name of the CSV index file under ``base_path``.
        images_dir: Sub-directory of ``base_path`` that holds the images; it is
            concatenated as-is as well.
        train: If truthy, samples go through the randomly augmenting 'train'
            pipeline, otherwise through the deterministic 'val' pipeline.
    """

    # Mean / std arguments handed to ``transforms.Normalize`` in both pipelines.
    _NORM_MEAN = [0.5138, 0.4915, 0.4315]
    _NORM_STD = [0.2675, 0.2572, 0.2626]

    def __init__(self, base_path, image_ids_fn, images_dir, train):
        self.base_path = base_path
        self.image_ids_fn = image_ids_fn
        self.images_dir = images_dir
        self.fnames, self.img_class_ids = self._read_index(base_path + image_ids_fn)
        self.img_ids = list(range(len(self.fnames)))
        self.train = train
        self.transform = {
            'train': transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(self._NORM_MEAN, self._NORM_STD)
            ]),
            'val': transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(self._NORM_MEAN, self._NORM_STD)
            ]),
        }

    @staticmethod
    def _read_index(index_path):
        """Parse the CSV index into ``(file_names, class_ids)`` lists.

        The first line (header) is skipped. Blank lines, such as a trailing
        newline at the end of the file, are ignored instead of raising.

        Raises:
            IndexError: If a non-blank row has fewer than two comma-separated
                fields.
            ValueError: If the second field is not an integer.
        """
        file_names, class_ids = [], []
        with open(index_path, 'r') as fp:
            next(fp, None)  # skip the header row (no-op on an empty file)
            for row in fp:
                row = row.strip()
                if not row:
                    continue
                fields = row.split(',')
                file_names.append(fields[0])
                class_ids.append(int(fields[1]))
        return file_names, class_ids

    def __getitem__(self, index):
        """Return ``(image, class_id)`` for the sample at ``index``."""
        img_fname = self.fnames[index]
        image = pil_loader(self.base_path + self.images_dir + img_fname)

        # ``transform`` may be set to None from outside to get the raw PIL image.
        if self.transform is not None:
            image = self.transform['train' if self.train else 'val'](image)
        return image, self.img_class_ids[index]

    def __len__(self):
        """Number of samples listed in the index file."""
        return len(self.img_ids)


### Μοντέλο

* αρχιτεκτονική
* βάθος CNN
* βάθος FC
* regularization
* fine-tuning/freeze


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

class ResNet34Small(nn.Module):
    """Truncated pretrained ResNet-34 followed by a linear classifier.

    The last three child modules of torchvision's ``resnet34`` are dropped.
    The remaining layers feed an adaptive average pool and a
    ``Linear(256, num_classes)`` head.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = models.resnet34(pretrained=True)
        # Keep every child module except the final three.
        kept_layers = list(backbone.children())[:-3]
        self.features = nn.Sequential(*kept_layers)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        pooled = self.avgpool(self.features(x))
        return self.fc(torch.flatten(pooled, 1))


class RegNet(nn.Module):
    """Pretrained RegNetY-800MF backbone with a small two-layer classifier head.

    ``features`` is a flat ``nn.Sequential`` made of the backbone's stem
    followed by each child of its ``trunk_output``. The backbone starts out
    frozen; call :meth:`finetune` to make the last trunk stages trainable
    again. The head is ``LazyLinear(256) -> LeakyReLU -> Linear(256, num_classes)``.
    """

    def __init__(self, num_classes):
        super(RegNet, self).__init__()
        original_model = models.regnet_y_800mf(pretrained=True)
        self.features = nn.Sequential(original_model.stem, *list(original_model.trunk_output.children()))
        # Freeze the backbone. ``requires_grad_`` updates every parameter;
        # assigning ``module.requires_grad = False`` would only create an
        # unused attribute on the Module and leave all weights trainable.
        self.features.requires_grad_(False)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Sequential(
            nn.LazyLinear(256),
            nn.LeakyReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

    def finetune(self, n_layers):
        """Unfreeze the last ``n_layers`` trunk stages of the backbone.

        ``features[0]`` is the stem and is never unfrozen here; the remaining
        children are the trunk stages. ``n_layers <= 0`` is a no-op, and a
        value larger than the number of stages unfreezes all of them.
        Optimizers created before this call do not pick up the newly
        trainable parameters; build a new one afterwards.
        """
        if n_layers <= 0:
            # Guard needed because ``stages[-0:]`` would select every stage.
            return
        trunk_stages = list(self.features.children())[1:]
        for stage in trunk_stages[-n_layers:]:
            stage.requires_grad_(True)


### Training

In [None]:
from torchvision.transforms.transforms import ToTensor
import numpy as np
from sklearn import metrics
from tqdm import tqdm
import torch.nn.functional as F
import torch
from torch.optim import lr_scheduler

import copy
import os
import argparse


def run(net, device, loader, optimizer, scheduler, split='val', epoch=0, train=False,
        dry_run=False):
    """Run one pass over ``loader`` and return the mean per-sample loss.

    Args:
        net: Model called as ``net(imgs)``; switched to train or eval mode here.
        device: Device the image and label batches are moved to.
        loader: Iterable of ``(imgs, img_class_ids)`` batches.
        optimizer: Optimizer stepped after every batch; only used when ``train``.
        scheduler: LR scheduler stepped once after the pass; only used when
            ``train``. May be ``None``.
        split: Label used in the printed summary line.
        epoch: Epoch number shown in the progress bar and summary line.
        train: If true, back-propagate and update the weights.
        dry_run: If true, stop after the first batch.

    Returns:
        Cross-entropy loss (label smoothing 1e-2) averaged over the samples
        that were actually processed.
    """
    if train:
        net.train()
    else:
        net.eval()

    loader = tqdm(
        loader,
        ncols=0,
        desc='{1} E{0:02d}'.format(epoch, 'train' if train else 'val')
    )

    running_loss = 0
    n_samples = 0
    preds_all = []
    labels_all = []
    for (imgs, img_class_ids) in loader:
        imgs, img_class_ids = (
            imgs.to(device), img_class_ids.to(device).long()
        )

        if train:
            optimizer.zero_grad()

        # Scoped switch only: autograd's global state is restored on exit, so
        # an evaluation pass no longer leaves gradients disabled for the caller.
        with torch.set_grad_enabled(train):
            output = net(imgs)
            loss = F.cross_entropy(output, img_class_ids, label_smoothing=1e-2)

        _, preds = torch.max(output, 1)

        if train:
            loss.backward()
            optimizer.step()

        # ``loss`` is a batch mean; weight it by the batch size so the final
        # division by the sample count gives a true per-sample average.
        batch_size = imgs.size(0)
        running_loss += loss.item() * batch_size
        n_samples += batch_size
        labels_all.extend(img_class_ids.cpu().numpy())
        preds_all.extend(preds.cpu().numpy())

        if dry_run:
            break

    if train and scheduler is not None:
        scheduler.step()

    bal_acc = metrics.balanced_accuracy_score(labels_all, preds_all)
    # Divide by samples seen (not by number of batches); max() avoids a
    # ZeroDivisionError if no batch was processed.
    mean_loss = running_loss / max(n_samples, 1)

    print('Epoch: {}.. '.format(epoch),
        '{} Loss: {:.3f}.. '.format(split, mean_loss),
        '{} Accuracy: {:.3f}.. '.format(split, bal_acc),
    )

    return mean_loss


def train(net, base_path, train_ids_fn, val_ids_fn, images_dir, model_fname,
          batch_size=16, lr=1e-2, warmup=0, n_layers = 0, epochs=10,
          device='cpu', num_workers=6, dry_run=False):
    """Train ``net`` on the Artists data and keep the best checkpoint.

    Training has two phases. First ``warmup`` epochs are run with whatever
    parameters are currently trainable. Then ``net.finetune(n_layers)`` is
    called (if the model defines it), a fresh optimizer/scheduler is built and
    ``epochs`` more epochs are run. During the second phase the state dict is
    written to ``model_fname`` whenever the validation loss improves.

    Args:
        net: Model to train; must already live on ``device``.
        base_path, train_ids_fn, val_ids_fn, images_dir: Forwarded to
            ``Artists`` to build the train and validation datasets.
        model_fname: Path the best state dict is saved to (overwritten).
        batch_size, num_workers: DataLoader settings.
        lr: Learning rate for SGD (momentum 0.9, StepLR step 7 / gamma 0.1).
        warmup: Number of epochs in the first phase.
        n_layers: Argument passed to ``net.finetune``.
        epochs: Number of epochs in the second phase.
        device: ``torch.device`` or device string.
        dry_run: If true, process a single batch per pass and a single epoch
            per phase.
    """
    train_dataset = Artists(base_path, train_ids_fn, images_dir, True)
    val_dataset = Artists(base_path, val_ids_fn, images_dir, False)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )

    # Normalise through torch.device so both the string 'cuda' and a
    # torch.device('cuda') enable the cuDNN autotuner.
    if torch.device(device).type == 'cuda':
        torch.backends.cudnn.benchmark = True

    cur_best_val_loss = np.inf

    optimizer, scheduler = _make_optimizer(net, lr)

    for epoch in range(warmup):
        run(net, device, train_loader, optimizer, scheduler, split='train',
            epoch=epoch, train=True, dry_run=dry_run)
        # Validation during warm-up is for logging only; no checkpointing.
        run(net, device, val_loader, optimizer, scheduler, split='val',
            epoch=epoch, train=False, dry_run=dry_run)
        if dry_run:
            break

    # Not every model defines ``finetune``; skip the unfreeze step for those
    # that do not instead of raising AttributeError.
    if hasattr(net, 'finetune'):
        net.finetune(n_layers)
    # Rebuild so that parameters unfrozen above are handed to the optimizer.
    optimizer, scheduler = _make_optimizer(net, lr)

    for epoch in range(epochs):
        run(net, device, train_loader, optimizer, scheduler, split='train',
            epoch=epoch, train=True, dry_run=dry_run)
        val_loss = run(net, device, val_loader, optimizer, scheduler, split='val',
                       epoch=epoch, train=False, dry_run=dry_run)

        if cur_best_val_loss > val_loss:
            # torch.save overwrites the previous best checkpoint, so no
            # explicit (and failure-prone) os.remove is needed first.
            torch.save(net.state_dict(), model_fname)
            cur_best_val_loss = val_loss

        if dry_run:
            break


def _make_optimizer(net, lr):
    """Build SGD over the currently trainable parameters plus its StepLR schedule."""
    trainable = [p for p in net.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable, lr=lr, momentum=0.9)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    return optimizer, scheduler

In [None]:
base_path = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 100/"
# base_path = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/"
train_ids_fname = "train_100.csv"
val_ids_fname = "val_100.csv"
images_dir = "Data/"
model_fname = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 100/resnet.pt"
# model_fname = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/resnet.pt"

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

net = ResNet34Small(20).to(device)

train(net, base_path, train_ids_fname, val_ids_fname,
      images_dir, model_fname, device=device, warmup=2,
      epochs=10, lr=5e-5, batch_size=16, dry_run=False)

In [None]:
base_path = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 500/"
# base_path = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/"
train_ids_fname = "train_500.csv"
val_ids_fname = "val_500.csv"
images_dir = "Data/"
model_fname = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 500/regnet_800_finetuned.pt"
# model_fname = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/regnet.pt"

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

net = RegNet(20).to(device)

train(net, base_path, train_ids_fname, val_ids_fname, images_dir,
      model_fname, device=device, warmup=3, n_layers=4,
      epochs=20, lr=1e-3, batch_size=16, dry_run=False)

In [None]:
base_path = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 100/"
# base_path = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/"
train_ids_fname = "train_100.csv"
val_ids_fname = "val_100.csv"
images_dir = "Data/"
model_fname = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 100/regnet_4layers.pt"
# model_fname = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/regnet.pt"

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

net = RegNet(20).to(device)

train(net, base_path, train_ids_fname, val_ids_fname, images_dir,
      model_fname, device=device, warmup=2, n_layers=4,
      epochs=24, lr=5e-4, batch_size=16, dry_run=False)

In [None]:
print(torch.cuda.memory_summary())

### Feature extraction από CNN

In [None]:
import pickle

base_path = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/"
train_pkl = "train_100.pkl"
val_pkl = "val_100.pkl"
test_pkl = "test_100.pkl"
net = ResNet34Small(20)
# The model stays on the CPU (features are converted with .numpy()), so map
# the checkpoint there even if it was saved from a GPU run.
checkpoint = torch.load("/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/resnet.pt",
                        map_location="cpu")
net.load_state_dict(checkpoint)
# Inference mode: use the stored BatchNorm statistics instead of per-batch ones.
net.eval()

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(base_path + train_pkl, "rb") as f:
    train_dataset = pickle.load(f)
with open(base_path + test_pkl, "rb") as f:
    test_dataset = pickle.load(f)
with open(base_path + val_pkl, "rb") as f:
    val_dataset = pickle.load(f)


def extract_painting_features(model, dataset):
    """Return ``[(flat_feature_vector, label), ...]`` for every sample of ``dataset``.

    Each sample is given a batch axis, passed through ``model.features`` and the
    output (not the input) is flattened to a 1-D NumPy array.
    Assumes ``dataset`` yields ``(image_tensor, label)`` pairs - TODO confirm
    against the code that produced the pickles.
    """
    # no_grad: .numpy() raises on tensors that are part of an autograd graph.
    with torch.no_grad():
        return [(model.features(x.unsqueeze(0)).flatten().numpy(), y) for x, y in dataset]


painting_features_train = extract_painting_features(net, train_dataset)
painting_features_val = extract_painting_features(net, val_dataset)
painting_features_test = extract_painting_features(net, test_dataset)
paintings = {"train": painting_features_train, "test": painting_features_test, "val": painting_features_val}

In [None]:
# Persist the features gathered in `paintings` (the original dumped the
# undefined name `datasets`). The context manager closes the file even if
# pickling fails.
with open('painting_features.pkl', 'wb') as output:
    pickle.dump(paintings, output)
!cp painting_features.pkl '/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/'

In [None]:
import torchvision.models as models

In [None]:
regnet_y_400mf = models.regnet_y_400mf(pretrained=True)

In [None]:
dir(regnet_y_400mf)

In [None]:
[x for x, _ in regnet_y_400mf.named_children()]

In [None]:
regnet_y_400mf.avgpool

In [None]:
[x for x, _ in regnet_y_400mf.trunk_output.named_children()]

In [None]:
list(regnet_y_400mf.trunk_output.block1[0].named_children())

In [None]:
regnet_y_400mf.stem

In [None]:
base_path = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 100/"
# base_path = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/"
train_ids_fn = "train_100.csv"
val_ids_fn = "val_100.csv"
images_dir = "Data/"
model_fname = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 100/regnet.pt"
train_dataset = Artists(base_path, train_ids_fn, images_dir, True)
val_dataset = Artists(base_path, val_ids_fn, images_dir, False)
batch_size = 16
num_workers = 8

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
)

In [None]:
X, y = next(iter(train_loader))

In [None]:
X.shape

In [None]:
original_model = models.regnet_y_800mf()
len(list(original_model.trunk_output.named_children()))

In [None]:
base_path = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 500/"
# base_path = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/"
# train_ids_fname = "train_500.csv"
# val_ids_fname = "val_500.csv"
test_ids_fname = "test_500.csv"

images_dir = "Data/"
model_fname = "/home/jason/Desktop/projects/masters/ml/group_project/Dataset 500/regnet_800_long_train.pt"
# model_fname = "/content/gdrive/My Drive/art_recognition/Datasets/Dataset 100/regnet.pt"

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

net = RegNet(20).to(device)
# map_location remaps the stored tensors to the active device, so a checkpoint
# saved from a CUDA run also loads on a CPU-only machine.
net.load_state_dict(torch.load(
    '/home/jason/Desktop/projects/masters/ml/group_project/Dataset 500/regnet_800_finetuned.pt',
    map_location=device,
))

# train_dataset = Artists(base_path, train_ids_fname, images_dir, False)
# val_dataset = Artists(base_path, val_ids_fname, images_dir, False)
test_dataset = Artists(base_path, test_ids_fname, images_dir, False)


# train_loader = torch.utils.data.DataLoader(
#     train_dataset,
#     batch_size=16,
#     num_workers=8,
#     shuffle=False,
# )
# val_loader = torch.utils.data.DataLoader(
#     val_dataset,
#     batch_size=16,
#     num_workers=8,
#     shuffle=False,
# )
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=16,
    num_workers=8,
    shuffle=False,
)
    

In [None]:
run(net, device, test_loader, None, None)

In [None]:
X, y = next(iter(test_loader))

In [None]:
X = X.to(device)

In [None]:
X_feat = torch.flatten(net.avgpool(net.features(X)), 1)

In [None]:
X_feat.shape

In [None]:
X_feat.to('cpu').detach().shape

In [None]:
_X_feat = X_feat.to('cpu').detach()

In [None]:
_X_feat = torch.cat((torch.Tensor(), X_feat.to('cpu').detach()))

In [None]:
_X_feat.shape

In [None]:
# Pooled backbone features for every batch of `train_loader`.
# NOTE(review): the `train_loader` built earlier in this notebook shuffles and
# uses the random 'train' augmentations - confirm that is intended for
# feature extraction.
net.eval()  # do not rely on an earlier cell having switched to eval mode
feature_chunks, label_chunks = [], []
with torch.no_grad():  # no autograd graph is needed for feature extraction
    for X, y in train_loader:
        X = torch.flatten(net.avgpool(net.features(X.to(device))), 1)
        feature_chunks.append(X.to('cpu'))
        label_chunks.append(y)

# Concatenate once at the end instead of re-copying the growing tensor on every
# batch. Labels keep the dtype the loader yields.
X_train = torch.cat(feature_chunks) if feature_chunks else torch.Tensor()
y_train = torch.cat(label_chunks) if label_chunks else torch.Tensor()

In [None]:
# Pooled backbone features for every batch of `test_loader`.
net.eval()  # do not rely on an earlier cell having switched to eval mode
feature_chunks, label_chunks = [], []
with torch.no_grad():  # no autograd graph is needed for feature extraction
    for X, y in test_loader:
        X = torch.flatten(net.avgpool(net.features(X.to(device))), 1)
        feature_chunks.append(X.to('cpu'))
        label_chunks.append(y)

# Concatenate once at the end instead of re-copying the growing tensor on every
# batch. Labels keep the dtype the loader yields.
X_test = torch.cat(feature_chunks) if feature_chunks else torch.Tensor()
y_test = torch.cat(label_chunks) if label_chunks else torch.Tensor()

In [None]:
X_test, y_test = X_test.numpy(), y_test.numpy()

In [None]:
y_test.shape

In [None]:
import pickle

with open("test_features.pickle", 'wb') as f:
    pickle.dump((X_test, y_test), f)

In [None]:
import pickle

with open("train_features.pickle", 'wb') as f:
    pickle.dump((X_train, y_train), f)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [None]:
from sklearn.dummy import DummyClassifier

clf = DummyClassifier()
strategy = ['stratified', 'most_frequent', 'prior', 'uniform']
param_grid = [
    {'clf__strategy': ['stratified', 'most_frequent', 'prior', 'uniform']},
    {'clf__strategy': ['constant'], 'clf__constant': [0, 1]},
]

pipe = Pipeline(steps=[('clf', clf)], memory='tmp')
estimator = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

In [None]:
estimator.fit(X_train, y_train)

In [None]:
pd.DataFrame(estimator.cv_results_)[
    ['mean_fit_time', 'params', 'mean_test_score', 'std_test_score', 'rank_test_score']
].sort_values(by=['rank_test_score'])

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
param_grid = [{
    'clf__solver': ['liblinear'],
    'clf__penalty': ['l2', 'l1'],
    'clf__C': [0.1, 1.0, 10.0]
#     }, {
#     'clf__solver': ['lbfgs'],
#     'clf__penalty': ['l2'],
#     'clf__C': [0.1, 1.0, 10.0]
#     }, {
#     'clf__solver': ['lbfgs'],
#     'clf__penalty': ['none']
#     }, {
#     'clf__solver': ['sag'],
#     'clf__penalty': ['l2'],
#     'clf__C': [0.1, 1.0, 10.0]
#     }, {
#     'clf__solver': ['sag'],
#     'clf__penalty': ['none']
#     }, {
#     'clf__solver': ['saga'],
#     'clf__penalty': ['l2', 'l1'],
#     'clf__C': [0.1, 1.0, 10.0]
#     }, {
#     'clf__solver': ['saga'],
#     'clf__penalty': ['none']
#     }, {
#     'clf__solver': ['saga'],
#     'clf__penalty': ['elasticnet'],
#     'clf__C': [0.1, 1.0, 10.0],
#     'clf__l1_ratio': [0.2, 0.5, 0.8]
    }
]


pipe = Pipeline(steps=[('clf', clf)])
estimator = GridSearchCV(pipe, param_grid, cv=10, scoring=['accuracy', 'f1_macro'],
                         verbose=10, error_score="raise", n_jobs=-1, refit=False)
estimator.fit(X_train, y_train)

In [None]:
import pickle
import pandas as pd

with open('lr_results_2.pickle', 'rb') as f:
    lr_results = pickle.load(f)

In [None]:
lr_results.rename(axis=1, inplace=True, mapper={
    'param_pca__n_components': 'pca'
})


In [None]:
lr_results[
    ['mean_fit_time', 'solver', 'penalty', 'C', 'param_clf__l1_ratio', 'pca', 'mean_test_accuracy', 'std_test_accuracy',
    'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).groupby('solver').head(10)

In [None]:
lr_results[
    ['mean_fit_time', 'solver', 'penalty', 'C', 'param_clf__l1_ratio', 'pca', 'mean_test_f1_macro', 'std_test_f1_macro',
    'rank_test_f1_macro']
].sort_values(by=['rank_test_f1_macro']).groupby('solver').head(5)

In [None]:
import pickle

with open('train_features.pickle', 'rb') as f:
    X_train, y_train = pickle.load(f)

In [None]:
pca = PCA(n_components=100)
clf = LogisticRegression(max_iter=1000, solver="liblinear", penalty="l1", C=0.2)
pipe = Pipeline(steps=[('pca', pca), ('clf', clf)], memory='sklearn_tmp')
pipe.fit(X_train, y_train)

In [None]:
with open('test_features.pickle', 'rb') as f:
    X_test, y_test = pickle.load(f)

In [None]:
y_pred = pipe.predict(X_test)

print(sk.metrics.balanced_accuracy_score(y_test, y_pred))

In [None]:
knn_results[
    ['mean_fit_time', 'metric', 'neighbors', 'pca', 'mean_test_f1_macro', 'std_test_f1_macro',
    'rank_test_f1_macro']
].sort_values(by=['rank_test_f1_macro']).groupby('metric').head(4)

In [None]:
with open('knn_results_1.pickle', 'rb') as f:
    knn_results = pickle.load(f)

knn_results[
    ['mean_fit_time', 'metric', 'neighbors', 'pca', 'mean_test_accuracy', 'std_test_accuracy',
    'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).groupby('metric').head(5)

In [None]:
with open('knn_results_5.pickle', 'rb') as f:
    knn_results = pickle.load(f)

knn_results[
    ['mean_fit_time', 'metric', 'neighbors', 'pca', 'mean_test_accuracy', 'std_test_accuracy',
    'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).groupby('pca').head(4)

In [None]:
std = StandardScaler()
pca = PCA(n_components=50)
clf = KNeighborsClassifier(metric="euclidean", n_neighbors=13)

pipe = Pipeline(steps=[('std', std), ('pca', pca), ('clf', clf)], memory='sklearn_tmp')
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

print(sk.metrics.balanced_accuracy_score(y_test, y_pred))

In [None]:
import pickle

with open('tree_results_2.pickle', 'rb') as f:
    tree_results = pickle.load(f)

tree_results[
    ['mean_fit_time', 'criterion', 'pca', 'mean_test_accuracy', 'std_test_accuracy',
    'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).groupby('criterion').head(8)

In [None]:
std = StandardScaler()
pca = PCA(n_components=68)
clf = DecisionTreeClassifier(criterion="gini")

pipe = Pipeline(steps=[('std', std), ('pca', pca), ('clf', clf)], memory='sklearn_tmp')
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

print(sk.metrics.balanced_accuracy_score(y_test, y_pred))

In [None]:
import pickle

with open('nb_results_1.pickle', 'rb') as f:
    tree_results = pickle.load(f)

tree_results[
    ['mean_fit_time','pca', 'mean_test_accuracy', 'std_test_accuracy',
    'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).head(10)

In [None]:
pca = PCA(n_components=184)
qt = QuantileTransformer()
clf = GaussianNB()

pipe = Pipeline(steps=[('pca', pca), ('qt', qt), ('clf', clf)], memory='sklearn_tmp')
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

print(sk.metrics.balanced_accuracy_score(y_test, y_pred))

In [None]:
std = StandardScaler()
pca = PCA(n_components=68)
clf = DecisionTreeClassifier(criterion="gini")

pipe = Pipeline(steps=[('std', std), ('pca', pca), ('clf', clf)], memory='sklearn_tmp')
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

print(sk.metrics.balanced_accuracy_score(y_test, y_pred))

In [None]:
import pickle

with open('adaboost_results_1.pickle', 'rb') as f:
    tree_results = pickle.load(f)

tree_results[
    ['mean_fit_time', 'estimators', 'lr', 'pca', 'mean_test_accuracy', 'std_test_accuracy',
    'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).groupby('lr').head(5)

In [None]:
import pickle

with open('adaboost_results_4.pickle', 'rb') as f:
    tree_results = pickle.load(f)

tree_results[
    ['mean_fit_time', 'estimators', 'lr', 'pca', 'mean_test_accuracy', 'std_test_accuracy',
    'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).groupby('lr').head(5)

In [None]:
import pickle

with open('adaboost_results_5.pickle', 'rb') as f:
    tree_results = pickle.load(f)

tree_results[
    ['mean_fit_time', 'estimators', 'lr', 'pca', 'mean_test_accuracy', 'std_test_accuracy',
    'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).groupby('lr').head(5)

In [None]:
std = StandardScaler()
pca = PCA(n_components=30)
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=1800, learning_rate=2.0)

pipe = Pipeline(steps=[('std', std), ('pca', pca), ('clf', clf)], memory='sklearn_tmp')
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

print(sk.metrics.balanced_accuracy_score(y_test, y_pred))

In [None]:
import pickle

with open('svm_results_1.pickle', 'rb') as f:
    svm_results = pickle.load(f)

svm_results[
    ['mean_fit_time', 'pca #components', 'kernel', 'poly degree', 'gamma', 'coef0',
     'mean_test_accuracy', 'std_test_accuracy', 'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).head(25)

In [None]:
import pickle

with open('svm_results_2.pickle', 'rb') as f:
    svm_results = pickle.load(f)

svm_results[
    ['mean_fit_time', 'pca #components', 'kernel', 'poly degree', 'gamma', 'coef0', 'C',
     'mean_test_accuracy', 'std_test_accuracy', 'rank_test_accuracy']
].sort_values(by=['rank_test_accuracy']).groupby('kernel').head(10)

In [None]:
std = StandardScaler()
pca = PCA(n_components=250)
clf = SVC(cache_size=8000, kernel="poly", degree=4, gamma=0.001, coef0=0.5,  C=0.5)
pipe = Pipeline(steps=[('std', std), ('pca', pca), ('clf', clf)], memory='sklearn_tmp')
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

print(sk.metrics.balanced_accuracy_score(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

fig = plt.figure(figsize=(10,10))
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=labels)
disp.plot(ax=fig.gca(), xticks_rotation="vertical")
plt.savefig("conf_matrix.png")
plt.show()

In [None]:
y_score = pipe.decision_function(X_test)

# One-vs-rest indicator matrix of the true labels, one column per class.
# Assumes y_score is (n_samples, 20) with column i scoring class id i -
# TODO confirm against pipe.classes_.
y_test_onehot = label_binarize(y_test, classes=list(range(20)))

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(20):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_score[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area.
# Both arguments must have one entry per (sample, class) pair, so the
# binarized labels are flattened rather than the raw class ids.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_onehot.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])


In [None]:
lw=2
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(20)]))

# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(20):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= 20

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure(figsize=(10,10))
# plt.plot(
#     fpr["micro"],
#     tpr["micro"],
#     label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
#     color="deeppink",
#     linestyle=":",
#     linewidth=4,
# )

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
    color="navy",
    linestyle=":",
    linewidth=4,
)

colors = cycle(["aqua", "darkorange", "cornflowerblue", "green"])
for i, color in zip(range(20), colors):
    plt.plot(
        fpr[i],
        tpr[i],
#         color=color,
        lw=lw,
        label="ROC curve of artist {0} (area = {1:0.2f})".format(labels[i], roc_auc[i]),
    )

plt.plot([0, 1], [0, 1], "k--", lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.65, 1.01])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.savefig("roc_curve.png")
plt.show()

In [None]:
import sklearn as sk
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.decomposition import PCA



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Artist display names used for plot tick labels and legends; position i in
# the list is the name shown for class id i.
labels = [
    "Martiros Saryan",
    "Albrecht Durer",
    "Zdislav Beksinski",
    "Claude Monet",  # was misspelled "Claude Monte"
    "Pyotr Konchalovsky",
    "Raphael Kirchner",
    "Rembrandt",
    "Gustave Dore",
    "Boris Kustodiev",
    "Ivan Aivazovsky",
    "Pablo Picasso",
    "Marc Chagall",
    "Ivan Shishkin",
    "Paul Cezanne",
    "Camille Pissarro",
    "Pierre-Auguste Renoir",
    "Paul Gauguin",
    "Ilya Repin",
    "Giovanni Battista Piranesi",
    "John Singer Sargent",
]