In [None]:
from indianpines.dataset import load as load_pines
from s3vm_pines.module import recategorize17to10_csv, train_test_split, colored_map, labeled_unlabeled_test_split

# Load the Indian Pines data set used by every cell below.
# NOTE(review): pca=20 presumably keeps 20 principal components and
# include_background=True presumably keeps background pixels — confirm in indianpines.dataset.
pines = load_pines(
    pca=20,
    recategorize_rule=recategorize17to10_csv,
    include_background=True,
    gt_gic=False,
)

実験1では，Backgroundラベル10705インスタンスを除く10320インスタンスから，$10\%$をトレーニングデータとして選び，それをラベル付きとラベル無しに分割する（$p:1-p$）．トレーニングデータ以外の$90\%$を評価用のテストデータとする．

ここで，トレーニングデータは，無作為に選ばれたものではなく，ラベルごとに空間的に密集していることに注意されたい．これは土地被覆分類におけるアノテーションの特性を反映したものである．

トレーニングデータを @fig_train に示す．

In [None]:
# Share of the 10320 non-background instances that is used for training.
prop_train = 1348 / 10320
print(prop_train)

status, status_name = train_test_split(
    prop_train=prop_train,
    recategorize_rule=recategorize17to10_csv,
    gt_gic=False,
)

# Report how many pixels carry each status code.
for code, name in enumerate(status_name):
    count = len(status[status == code])
    print(f'{code}:{name}:{count}')

In [None]:
#| label: fig_train
#| fig-cap: "Selected Training Data"

from matplotlib import pyplot as plt

# Draw only the pixels whose status code is 2 (the ones plotted as training data).
fig, ax = plt.subplots()
train_mask = status == 2
colored_map(
    ax,
    pines.target[train_mask],
    pines.cordinates[train_mask],
    recategorize_rule=recategorize17to10_csv,
    gt_gic=False,
)
ax.set_title(f'training data {prop_train = }')
fig.tight_layout()
fig.savefig("exp01_fig01.png")

In [None]:
def counter2hms(c):
    """Split a duration given in seconds into whole hours, minutes and seconds.

    Parameters
    ----------
    c : int or float
        Duration in seconds.

    Returns
    -------
    tuple of int
        ``(hours, minutes, seconds)``; any fractional part of the seconds is
        truncated by the final ``int`` conversion.
    """
    hours, remainder = divmod(c, 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    return int(hours), int(minutes), int(seconds)

## ラベル付きトレーニングデータの割合に対する評価


In [None]:
#| label: fig_test_with_various_labeled
#| fig-cap: "Testing with Various Labeled Data Proportion"
#| output: true

from qns3vm import QN_S3VM_OVR
from qns3vm.tools import classification_error
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import random
from matplotlib import style
# Apply the 'science' and 'ieee' matplotlib style sheets to every figure created below.
# NOTE(review): these names are presumably provided by an extra style package rather than
# matplotlib itself — confirm it is installed (and imported, if it needs registering).
style.use(['science','ieee'])
from time import sleep, perf_counter

# Seeds for the labeled / unlabeled draws made by labeled_unlabeled_test_split.
seed_l = 83988848
seed_u = 83988848

# Seed the generator handed to QN_S3VM_OVR as well: the splits are already
# deterministic, so an unseeded generator would be the only thing making the
# reported scores differ between "Restart & Run All" executions.
seed_rg = 83988848
rg = random.Random(seed_rg)

# Labeled share of the training data evaluated in this experiment.
prop_train_l_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# One score per evaluated proportion.
acc = list()
err = list()
f1 = list()
# One figure per proportion, three axes (labeled / unlabeled / predicted) per figure.
fig2 = [0 for _ in prop_train_l_list]
ax2 = [0 for _ in prop_train_l_list] * 3

# Train and evaluate one S3VM per labeled-data proportion, saving a diagnostic
# figure for each run and collecting the scores for the summary plot.
t_start = perf_counter()
t_old = t_start
for i, prop_train_l in enumerate(prop_train_l_list):
    print('='*3 + f'{prop_train_l = }' + '='*3)
    l_u_t_status, l_u_t_status_name = labeled_unlabeled_test_split(
        prop_train_l=prop_train_l,
        status=status,
        unlabeled_type='from_train',
        seed_l=seed_l,
        seed_u=seed_u,
        recategorize_rule=recategorize17to10_csv,
        gt_gic=False,
    )

    for j, s in enumerate(l_u_t_status_name):
        n = l_u_t_status[l_u_t_status==j].shape[0]
        print(f'{j}:{s}:{n}')

    # Status codes as used here: 3 -> labeled training, 2 -> unlabeled training, 1 -> test.
    X_train_l = pines.features[l_u_t_status==3].tolist()
    L_train_l = pines.target[l_u_t_status==3].tolist()
    X_train_u = pines.features[l_u_t_status==2].tolist()
    L_train_u = pines.target[l_u_t_status==2].tolist()
    X_test = pines.features[l_u_t_status==1].tolist()
    L_test = pines.target[l_u_t_status==1].tolist()

    fig2[i] = plt.figure(figsize=(12.8,19.2), constrained_layout=True)
    # labeled
    ax2[3*i+0] = fig2[i].add_subplot(4,2,1)
    colored_map(ax2[3*i+0], L_train_l, pines.cordinates[l_u_t_status==3], recategorize_rule=recategorize17to10_csv, gt_gic=False)
    ax2[3*i+0].set_title(f'labeled area {prop_train_l =}')
    # unlabeled
    ax2[3*i+1] = fig2[i].add_subplot(4,2,3)
    colored_map(ax2[3*i+1], L_train_u, pines.cordinates[l_u_t_status==2], recategorize_rule=recategorize17to10_csv, gt_gic=False)
    ax2[3*i+1].set_title(f'unlabeled area {prop_train_l =}')

    clf = QN_S3VM_OVR(X_train_l, L_train_l, X_train_u, random_generator=rg, lam=0.008765625, lamU=0.8, sigma=0.5, kernel_type="RBF", estimate_r=0.0)
    clf.train()

    preds = clf.predict(X_test)
    err_ = classification_error(preds,L_test)
    acc_ = accuracy_score(L_test, preds)
    f1_ = f1_score(L_test, preds, average="micro")

    # predicted labels on the test area
    ax2[3*i+2] = fig2[i].add_subplot(2,1,2)
    colored_map(ax2[3*i+2], preds, pines.cordinates[l_u_t_status==1], recategorize_rule=recategorize17to10_csv, gt_gic=False)
    ax2[3*i+2].set_title(f'predicted label for tested area {prop_train_l =}, {acc_ = }, {f1_ = }')

    fig2[i].tight_layout(rect=[0,0,1,0.96])
    fig2name = f'exp01_fig02_{i:0d}.png'
    fig2[i].savefig(fig2name)

    err.append(err_)
    acc.append(acc_)
    f1.append(f1_)

    # counter: lap time + time elapsed before this lap, and estimated remaining time
    t_now = perf_counter()
    t_lap = t_now-t_old
    t_ave = (t_now-t_start)/(i+1)
    t_rest = t_ave * (len(prop_train_l_list)-(i+1))
    # perf_counter() has an undefined reference point, so only differences are
    # meaningful: report the time elapsed up to the start of this lap.
    ho,mo,so = counter2hms(t_old-t_start)
    hl,ml,sl = counter2hms(t_lap)
    hr,mr,sr = counter2hms(t_rest)
    print(f'lap {i+1} | {hl:2d}:{ml:2d}:{sl:2d} + {ho:2d}:{mo:2d}:{so:2d}, est {hr:2d}:{mr:2d}:{sr:2d}')
    t_old = t_now

# Summary plot: accuracy and micro-F1 against the labeled-data proportion.
fig3, ax3 = plt.subplots()
ax3.plot(prop_train_l_list, acc, 'o', label='accuracy score')
ax3.plot(prop_train_l_list, f1, 's--', label='f1 score')
ax3.legend()
ax3.set(
    xlabel=r'Amount of Labeled Data [$\%$]',
    ylabel=r'Score[$\%$]',
    title='Testing with Various Labeled Data Proportion',
)
fig3.tight_layout(rect=[0,0,1,0.96])
# NOTE(review): unlike the other saved figures, this file name has no "fig" prefix.
fig3.savefig("exp01_03.png")

@fig_test_with_various_labeled に示すように，ラベル付きデータがトレーニングデータの20%以上のときにエラー率が低くなるので，以降はラベル付きデータの割合を20%に固定する．

その上で，ラベル無しデータの割合を10%から80%まで10%刻みで変化させる．結果を @fig_test_with_various_unlabeled に示す．

In [None]:
#| label: fig_test_with_various_unlabeled
#| fig-cap: "Testing with Various Unlabeled Data Proportion"
#| output: true

import numpy as np

# Keep the labeled share fixed and sweep the unlabeled share instead.
prop_train_l = 0.2
prop_train_u_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# The labeled and the unlabeled draw use the same seed value.
seed_l = seed_u = 83988848

l_u_t_status, l_u_t_status_name = labeled_unlabeled_test_split(
    prop_train_l=prop_train_l,
    status=status,
    unlabeled_type='from_train',
    seed_l=seed_l,
    seed_u=seed_u,
    recategorize_rule=recategorize17to10_csv,
    gt_gic=False,
)

# Status code 3 feeds the labeled training set, status code 1 the test set.
is_labeled = l_u_t_status == 3
is_test = l_u_t_status == 1
X_train_l = pines.features[is_labeled].tolist()
L_train_l = pines.target[is_labeled].tolist()
X_test = pines.features[is_test].tolist()
L_test = pines.target[is_test].tolist()

def select_unlabeled_in_proportion(prop_train_u,l_u_t_status,status,target):
    """Pick, class by class, a share of the unlabeled training pixels.

    For every distinct value in ``target`` the unlabeled pixels of that class
    (``l_u_t_status == 2``) are taken in index order, and the first
    ``ceil(prop_train_u * n_train)`` of them are marked as selected, where
    ``n_train`` is the number of training pixels (``status == 2``) of the
    *same* class. Counting the training pixels of all classes together, as
    was done before, made the per-class cap far too large, so nearly every
    unlabeled pixel was selected regardless of ``prop_train_u``.

    Parameters
    ----------
    prop_train_u : float
        Share of each class's training pixels to use as unlabeled data.
    l_u_t_status : array-like of int
        Per-pixel split code; the value 2 marks unlabeled training pixels.
    status : array-like of int
        Per-pixel train/test code; the value 2 marks training pixels.
    target : array-like
        Per-pixel class label, aligned with the two status arrays.

    Returns
    -------
    selected : numpy.ndarray
        Same shape as ``l_u_t_status``: 2 for selected unlabeled pixels,
        1 for unlabeled pixels left out, 0 for everything else.
    selected_name : list of str
        Names of the codes 0, 1 and 2 in ``selected``.
    """
    target = np.asarray(target)
    status = np.asarray(status)
    l_u_t_status = np.asarray(l_u_t_status)

    selected = np.zeros_like(l_u_t_status)
    is_train = status == 2
    is_unlabeled = l_u_t_status == 2
    for t in np.unique(target):
        in_class = target == t
        # Indices of this class's unlabeled pixels, in ascending order.
        idx = np.flatnonzero(in_class & is_unlabeled)
        selected[idx] = 1
        n_train = int(np.count_nonzero(in_class & is_train))
        n_selected = int(np.ceil(prop_train_u * n_train))
        # Slicing past the end is harmless: at most all of idx gets selected.
        selected[idx[:n_selected]] = 2
    selected_name = ['un-traininig_or_labeled', 'unselected_unlabeled', 'selected_unlabeled']
    return selected, selected_name

# Fresh score accumulators for the unlabeled-proportion sweep.
acc, err, f1 = [], [], []
# One figure per proportion and three axes per figure.
fig4 = [0] * len(prop_train_u_list)
ax4 = [0] * (3 * len(prop_train_u_list))
# Train and evaluate one S3VM per unlabeled-data proportion, saving a diagnostic
# figure for each run and collecting the scores for the summary plot.
t_start = perf_counter()
t_old = t_start
for i, prop_train_u in enumerate(prop_train_u_list):
    print('='*3 + f'{prop_train_u = }' + '='*3)
    selected, selected_name = select_unlabeled_in_proportion(prop_train_u, l_u_t_status, status, target=pines.target)

    # Same report as for the split itself, except that the unlabeled count (code 2)
    # is replaced by the number of unlabeled pixels actually selected for this run.
    for j, s in enumerate(l_u_t_status_name):
        n = l_u_t_status[l_u_t_status==j].shape[0]
        if j == 2:
            n = selected[selected == 2].shape[0]
        print(f'{j}:{s}:{n}')

    X_train_u = pines.features[selected==2].tolist()
    l_train_u = pines.target[selected==2].tolist()

    fig4[i] = plt.figure(figsize=(12.8,19.2), constrained_layout=True)

    # labeled
    ax4[3*i+0] = fig4[i].add_subplot(4,2,1)
    colored_map(ax4[3*i+0], pines.target[l_u_t_status==3], pines.cordinates[l_u_t_status==3], recategorize_rule=recategorize17to10_csv, gt_gic=False)
    ax4[3*i+0].set_title(f'labeled train area {prop_train_u =}')

    # selected unlabeled
    ax4[3*i+1] = fig4[i].add_subplot(4,2,3)
    colored_map(ax4[3*i+1], pines.target[selected==2], pines.cordinates[selected==2], recategorize_rule=recategorize17to10_csv, gt_gic=False)
    ax4[3*i+1].set_title(f'unlabeled train area {prop_train_u =}')

    clf = QN_S3VM_OVR(X_train_l, L_train_l, X_train_u, random_generator=rg, lam=0.008765625, lamU=0.8, sigma=0.5, kernel_type="RBF", estimate_r=0.0)
    clf.train()
    preds = clf.predict(X_test)

    # predicted labels on the test area
    ax4[3*i+2] = fig4[i].add_subplot(2,1,2)
    colored_map(ax4[3*i+2], preds, pines.cordinates[l_u_t_status==1], recategorize_rule=recategorize17to10_csv, gt_gic=False)
    # Title the prediction axes (index 3*i+2); titling 3*i+1 would overwrite the
    # unlabeled-area title and leave this panel untitled.
    ax4[3*i+2].set_title(f'predicted label for test data {prop_train_u =}')

    fig4[i].tight_layout(rect=[0,0,1,0.96])
    fig4name=f'exp01_fig04_{i:0d}.png'
    fig4[i].savefig(fig4name)

    err_ = classification_error(preds,L_test)
    err.append(err_)
    acc_ = accuracy_score(L_test, preds)
    # Append the score of this run, not the accumulator list itself.
    acc.append(acc_)
    f1_ = f1_score(L_test, preds, average="micro")
    f1.append(f1_)

    # counter: lap time + time elapsed before this lap, and estimated remaining time
    t_now = perf_counter()
    t_lap = t_now-t_old
    t_ave = (t_now-t_start)/(i+1)
    # Remaining laps are counted against the list this loop iterates over.
    t_rest = t_ave * (len(prop_train_u_list)-(i+1))
    # perf_counter() has an undefined reference point, so only differences are
    # meaningful: report the time elapsed up to the start of this lap.
    ho,mo,so = counter2hms(t_old-t_start)
    hl,ml,sl = counter2hms(t_lap)
    hr,mr,sr = counter2hms(t_rest)
    print(f'lap {i+1} | {hl:2d}:{ml:2d}:{sl:2d} + {ho:2d}:{mo:2d}:{so:2d}, est {hr:2d}:{mr:2d}:{sr:2d}')
    t_old = t_now

# Summary plot: accuracy and micro-F1 against the unlabeled-data proportion.
fig5 = plt.figure()
ax5 = fig5.add_subplot(1,1,1)
ax5.plot(prop_train_u_list, acc, 'o', label='accuracy score')
ax5.plot(prop_train_u_list, f1, 's--', label='f1 score')
ax5.legend()
# Axis labels are not %-formatted, so '%%' would be emitted verbatim (and a bare
# '%' starts a comment if LaTeX text rendering is active). Use the escaped
# '$\%$' form, as the labeled-data summary plot in this notebook does.
ax5.set_xlabel(r'Amount of Unlabeled Data [$\%$]')
ax5.set_ylabel(r'Accuracy Score [$\%$]')
ax5.set_title('Testing with Various Unlabeled Data Proportion')
fig5.tight_layout(rect=[0,0,1,0.96])
fig5.savefig("exp01_fig05.png")