In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Folder holding the competition files (path is relative to the working directory).
INPUT_DIR = 'drive/MyDrive/atmaCup/#11/dataset_atmaCup11/'

# Read every competition table into its own DataFrame.
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
techniques = pd.read_csv(os.path.join(INPUT_DIR, 'techniques.csv'))
materials = pd.read_csv(os.path.join(INPUT_DIR, 'materials.csv'))
submission = pd.read_csv(os.path.join(INPUT_DIR, 'atmaCup#11_sample_submission.csv'))

In [None]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,object_id,sorting_date,art_series_id,target
0,002bff09b09998d0be65,1631,509357f67692a6a45626,1
1,00309fb1ef05416f9c1f,1900,7987b47bbe5dc3039179,3
2,003a1562e97f79ba96dc,1834,ded7c3c9636708e5b14c,3
3,004890880e8e7431147b,1743,582ac2d7f0cef195b605,2
4,00718c32602425f504c1,1885,64c907f0c08dce4fb8e8,3


In [None]:
# Preview the first rows of the test table.
test.head()

Unnamed: 0,object_id
0,0009e50b94be36ccad39
1,000bd5e82eb22f199f44
2,0015f44de1854f617516
3,00388a678879ba1efa27
4,005e1e7c6496902d23f3


In [None]:
# Preview the first rows of the techniques table.
techniques.head()

Unnamed: 0,name,object_id
0,pen,002bff09b09998d0be65
1,brush,00309fb1ef05416f9c1f
2,counterproof,004890880e8e7431147b
3,brush,007f5e3620b458d77212
4,brush,00bf812ffe8a62d45661


In [None]:
# Preview the first rows of the materials table.
materials.head()

Unnamed: 0,name,object_id
0,ink,002bff09b09998d0be65
1,paper,002bff09b09998d0be65
2,pencil,002bff09b09998d0be65
3,watercolor (paint),00309fb1ef05416f9c1f
4,paper,00309fb1ef05416f9c1f


In [None]:
# Preview the first rows of the sample submission.
submission.head()

Unnamed: 0,target
0,3
1,1
2,3
3,3
4,0


In [None]:
# Count how many rows carry each technique name (most frequent first).
techniques['name'].value_counts()

brush                   2179
pen                     1569
counterproof              17
chalk                      3
snipping                   2
pencil                     2
engraving                  2
sewing                     1
letterpress printing       1
scoring                    1
Name: name, dtype: int64

In [None]:
# Count how many rows carry each material name (most frequent first).
materials['name'].value_counts()

paper                          3746
ink                            1589
chalk                          1509
pencil                         1135
watercolor (paint)              543
deck paint                      130
graphite (mineral)              122
prepared paper                   91
parchment (animal material)      61
paint (coating)                  43
gouache (paint)                  38
oil paint (paint)                33
cardboard                        18
tracing paper                     8
linen (material)                  3
India ink (ink)                   2
leather                           2
velvet (fabric weave)             1
varnish                           1
zinc                              1
metal                             1
gold leaf                         1
palm leaf (material)              1
wood (plant material)             1
bristol board                     1
Name: name, dtype: int64

Stratified Group k-Fold

In [None]:
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [None]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` (train, test) index splits that keep groups whole and labels balanced.

    Each distinct value of ``groups`` is assigned to exactly one fold, so a
    group never appears on both sides of a split.  Groups are placed greedily:
    a group goes to the fold for which adding its label counts gives the lowest
    score, where the score is the mean over labels of the standard deviation
    over folds of "fraction of that label's samples held by the fold".

    Parameters
    ----------
    X : any
        Not read by this function.
    y : sequence of non-negative int
        Class label of every sample; labels index count arrays of length
        ``max(y) + 1``.
    groups : sequence of hashable
        Group id of every sample, aligned with ``y``.
    k : int
        Number of folds to produce.
    seed : optional
        Passed to ``random.Random`` to shuffle the groups before they are
        ordered; ``None`` leaves the shuffle unseeded.

    Yields
    ------
    tuple(list of int, list of int)
        ``(train_indices, test_indices)`` as positions into ``groups``; one
        pair per fold.
    """
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Only score labels that actually occur.  A label inside 0..max(y) with no
    # samples would contribute 0 / 0 = nan, the fold score would become nan,
    # `fold_eval < min_eval` would never be true, and every group would be
    # dumped into fold 0.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively add the group's counts to `fold`, score the resulting
        # balance, then undo the addition so the caller's state is unchanged.
        y_counts_per_fold[fold] += y_counts
        std_per_label = [
            np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            for label in present_labels
        ]
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    # Shuffle first: the sort below is stable, so groups whose label counts
    # have equal spread are visited in this shuffled order.
    random.Random(seed).shuffle(groups_and_y_counts)

    # Place groups with the most uneven label counts first.
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for fold in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, fold)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = fold
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        test_groups = groups_per_fold[fold]
        train_groups = all_groups - test_groups

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [None]:
# Features are every column except the label; labels and group ids are taken as NumPy arrays.
train_x = train.drop(columns='target')
train_y = train['target'].values
groups = np.array(train['art_series_id'].values)

def get_distribution(y_vals):
    """Return the share of each label 0..max(y_vals) as a list of '12.34%' strings."""
    label_counts = Counter(y_vals)
    total = sum(label_counts.values())
    shares = []
    for label in range(np.max(y_vals) + 1):
        # Labels that never occur count as 0 and are reported as '0.00%'.
        shares.append(f'{label_counts[label] / total:.2%}')
    return shares

In [None]:
# Fixed seed so the fold assignment (and the table below) is identical on every
# run; without it stratified_group_k_fold shuffles the groups with an unseeded RNG.
FOLD_SEED = 42
N_FOLDS = 5

# One row of per-label percentages for the full training set, then two rows
# (development / validation) per fold.
distrs = [get_distribution(train_y)]
index = ['training set']

for fold_ind, (dev_ind, val_ind) in enumerate(
        stratified_group_k_fold(train_x, train_y, groups, k=N_FOLDS, seed=FOLD_SEED)):
    dev_y, val_y = train_y[dev_ind], train_y[val_ind]
    dev_groups, val_groups = groups[dev_ind], groups[val_ind]

    # A group must never be split between the development and validation side.
    assert len(set(dev_groups) & set(val_groups)) == 0

    distrs.append(get_distribution(dev_y))
    index.append(f'development set - fold {fold_ind}')
    distrs.append(get_distribution(val_y))
    index.append(f'validation set - fold {fold_ind}')

display('Distribution per class:')
pd.DataFrame(distrs, index=index, columns=[f'Label {l}' for l in range(np.max(train_y) + 1)])

'Distribution per class:'

Unnamed: 0,Label 0,Label 1,Label 2,Label 3
training set,12.07%,22.76%,38.38%,26.80%
development set - fold 0,12.07%,22.74%,38.37%,26.81%
validation set - fold 0,12.04%,22.81%,38.40%,26.74%
development set - fold 1,12.06%,22.76%,38.38%,26.79%
validation set - fold 1,12.07%,22.74%,38.37%,26.81%
development set - fold 2,12.06%,22.76%,38.38%,26.79%
validation set - fold 2,12.07%,22.74%,38.37%,26.81%
development set - fold 3,12.06%,22.76%,38.38%,26.79%
validation set - fold 3,12.07%,22.74%,38.37%,26.81%
development set - fold 4,12.06%,22.76%,38.38%,26.79%


写真の数

In [None]:
import os

In [None]:
# Directory that holds the image files.
DIR = os.path.join(INPUT_DIR, 'photos')

# Print how many directory entries are regular files.
print(len([name for name in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, name))]))

9856


In [None]:
# !pip install ipyplot

In [None]:
# import ipyplot

# plots = train[train.target == 0].sample(30)['object_id'].map(lambda x:'photos/'+x+'.jpg').values
# ipyplot.plot_images(plots)