# Two-Step Techniques (1-DNF & iterative SVM)

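This method learns $P(y|x)$ with a two-step technique: step 1 uses 1-DNF to pick reliable negatives from the unlabeled data, step 2 trains an iterative SVM, and a final step selects the best classifier by F1'.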

In [32]:
import numpy as np
import pandas as pd

### Load the SCAR dataset

In [33]:
from data import load_scar, load_sar, load_pg

# load_scar() is a project helper from data.py; its result is unpacked here into
# the train / valid / test splits plus `c`.
# NOTE(review): `c` is presumably the SCAR label frequency P(s=1|y=1) — confirm
# against data.py. Neither `valid` nor `c` is referenced in the cells shown below.
train, valid, test, c = load_scar()

### Step1: 1-DNF

#### Find strong positive features based on the bias of the labeled data.
In this section, we consider one strong positive feature per axis. Specifically, for the $i$-th axis, let "$x_i \ge B_i$" be the strong positive feature, where the threshold $B_i$ satisfies the following two conditions:
1. At least 99% of the labeled data $x$ has the feature "$x_i \ge B_i$".
2. Among the $B_i$ that satisfy 1, we choose the one with the largest $r$, where $r$ is the product of the fraction of labeled data that satisfies "$x_i \ge B_i$" and the fraction of unlabeled data that does not satisfy it.

In [34]:
train_xs, train_ys, train_ss, train_es = train

# Minimum fraction of the labeled samples that must satisfy a strong positive
# feature "x_i >= B_i" for B_i to be accepted as a threshold.
MIN_LABELED_COVERAGE = 0.99


def find_strong_positive_features(xs, ss, min_labeled_coverage=0.99):
    """Find one strong-positive-feature threshold B_i per feature axis.

    For every axis the candidate thresholds are the values of the labeled
    samples (s == 1). A candidate B is admissible when at least
    ``min_labeled_coverage`` of the labeled samples have ``x >= B``. Among the
    admissible candidates the one maximizing

        r = (fraction of labeled with x >= B) * (fraction of unlabeled with x < B)

    is returned. When no candidate improves on r = 0, the smallest value on the
    axis is used.

    Parameters
    ----------
    xs : 2-D array, indexed as ``xs[:, axis]``
        Feature matrix.
    ss : 1-D array
        Labeling indicator per sample (1: labeled, 0: unlabeled).
    min_labeled_coverage : float, default 0.99
        Required fraction of labeled samples lying at or above the threshold.

    Returns
    -------
    list
        ``thresholds[i]`` is the threshold B_i for axis ``i``.

    Raises
    ------
    ValueError
        If there is no labeled or no unlabeled sample (the ratios would
        otherwise divide by zero).
    """
    n_labeled_total = len(ss[ss == 1])
    n_unlabeled_total = len(ss[ss == 0])
    if n_labeled_total == 0 or n_unlabeled_total == 0:
        raise ValueError(
            "need at least one labeled and one unlabeled sample "
            "(got {} labeled, {} unlabeled)".format(n_labeled_total, n_unlabeled_total)
        )

    thresholds = []
    for axis in range(xs.shape[-1]):
        # Sweep the samples in ascending order of the feature value; ties are
        # ordered with unlabeled (s == 0) before labeled (s == 1).
        ordered = sorted(zip(xs[:, axis], ss))

        # Counts of labeled / unlabeled samples not yet passed by the sweep.
        n_labeled, n_unlabeled = n_labeled_total, n_unlabeled_total
        # Before the sweep nothing lies below the threshold, so r starts at 0.
        best_r = (n_labeled / n_labeled_total) * (1 - n_unlabeled / n_unlabeled_total)
        best_b = ordered[0][0]
        prev_x = -float('inf')
        for x, s in ordered:
            if s == 0:
                n_unlabeled -= 1
            elif s == 1:
                coverage = n_labeled / n_labeled_total
                r = coverage * (1 - n_unlabeled / n_unlabeled_total)
                # `x != prev_x` skips values tied with the previous sample, so
                # every sample already passed is strictly below the candidate.
                if x != prev_x and coverage >= min_labeled_coverage and best_r < r:
                    best_r = r
                    best_b = x
                n_labeled -= 1
            prev_x = x
        thresholds.append(best_b)
    return thresholds


N_unlabeled = len(train_ss[train_ss == 0])
N_labeled = len(train_ss[train_ss == 1])

# Strong Positive Feature (SPF)
# SPF[i] is the threshold B_i
SPF = find_strong_positive_features(train_xs, train_ss, MIN_LABELED_COVERAGE)

In [35]:
# Show the per-axis thresholds B_i selected by the cell above.
print(SPF)

[0.64934146, 0.7462863]


In [36]:
# Build the labels used in step 2 (pos: 1, neg: 0, unlabeled: -1):
#   - labeled samples (s == 1) become positive;
#   - unlabeled samples lying below the threshold on every axis, i.e. having
#     none of the strong positive features, become negative;
#   - every other sample stays unlabeled.
is_labeled = train_ss == 1
has_no_spf = (train_xs < np.asarray(SPF)).all(axis=-1)
new_train_ys = np.where(is_labeled, 1, np.where(has_no_spf, 0, -1))

### Visualize the newly created dataset

In [37]:
from utils import plot_x_y, plot_x_s, plot_x_y_list

In [38]:
# Plot the training points against the labeling indicator s.
# (plot_x_s comes from utils; its exact rendering is not visible here.)
plot_x_s(train_xs, train_ss)

In [39]:
# Show only the samples that received a definite label in step 1
# (1 = positive, 0 = negative); samples still marked -1 are left out.
has_label = new_train_ys != -1
plot_x_y(train_xs[has_label], new_train_ys[has_label])

### Step2: Apply iterative SVM to the semi-supervised dataset
Now, we have positive-labeled data, negative-labeled data, and unlabeled data.
We iterate the following steps until the classifier (SVM) converges.
1. We train an SVM classifier using only the positive-labeled and negative-labeled data.
2. We give a negative label to every unlabeled sample that the SVM classifies as negative.
3. If no sample received a new label in step 2, the SVM has converged. Otherwise, go back to step 1.

In [40]:
from sklearn.svm import SVC
# Iterative SVM: fit on the currently labeled samples, then move every
# still-unlabeled sample predicted as negative into the negative set. Each
# iteration's classifier is kept so the best one can be selected afterwards.
clfs = []
new_train_ys_ = new_train_ys.copy()
converged = False
while not converged:
    has_label = new_train_ys_ != -1
    clf = SVC(kernel='linear', random_state=42)
    clf.fit(train_xs[has_label], new_train_ys_[has_label])
    new_train_ys_hat = clf.predict(train_xs)
    clfs.append(clf)

    # Unlabeled samples that the current classifier assigns to the negative class.
    newly_negative = (new_train_ys_ == -1) & (new_train_ys_hat == 0)
    if newly_negative.any():
        new_train_ys_[newly_negative] = 0
    else:
        # Nothing new was labeled, so further iterations would not change the fit.
        converged = True

The upper figure shows the classification results by SVM obtained at the first iteration, and the lower figure shows the classification results by SVM obtained at convergence.

In [41]:
# Compare the predictions of the first classifier with those of the last
# (converged) one.
first_clf, last_clf = clfs[0], clfs[-1]
plot_x_y_list(train_xs, [first_clf.predict(train_xs), last_clf.predict(train_xs)])

### Step3: Choose the best classifier in terms of F1'
The SVM obtained at convergence is not necessarily the best. Therefore, we evaluate each SVM obtained during the iterations with F1' and choose the best one.

In [42]:
from utils import f1_prime
# Score every intermediate classifier with F1' on the training set (computed
# from the labeling indicator s and the predictions), then keep the index of
# the first classifier that reaches the highest score.
f1_primes = [f1_prime(train_ss, clf.predict(train_xs)) for clf in clfs]
optim_clf_idx = max(range(len(f1_primes)), key=f1_primes.__getitem__)
plot_x_y(train_xs, clfs[optim_clf_idx].predict(train_xs))

### Predict

In [43]:
# Unpack the test split and label it with the classifier selected by F1'.
test_xs, test_ys, test_ss, test_es = test

best_clf = clfs[optim_clf_idx]
test_ys_hat = best_clf.predict(test_xs)

In [44]:
# Training-set predictions of the selected classifier.
# NOTE(review): this repeats the plot drawn right after the F1' selection; the
# test-set predictions are plotted in the final cell — confirm this is intended.
plot_x_y(train_xs, clfs[optim_clf_idx].predict(train_xs))

### Evaluate the performance

In [54]:
from sklearn.metrics import f1_score
# One line per intermediate classifier: its F1' on the training set next to its
# F1 on the test set. The classifier picked by F1' gets a trailing marker.
marked_template = "f1' for train: {:.3f}\tf1 for test: {:.3f}\tbest f1' for train"
plain_template = "f1' for train: {:.3f}\tf1 for test: {:.3f}"
for idx, (clf, train_f1_prime) in enumerate(zip(clfs, f1_primes)):
    test_f1 = f1_score(test_ys, clf.predict(test_xs))
    template = marked_template if idx == optim_clf_idx else plain_template
    print(template.format(train_f1_prime, test_f1))


f1' for train: 1.214	f1 for test: 0.983
f1' for train: 1.226	f1 for test: 0.989
f1' for train: 1.227	f1 for test: 0.993	best f1' for train
f1' for train: 1.226	f1 for test: 0.994
f1' for train: 1.223	f1 for test: 0.993
f1' for train: 1.226	f1 for test: 0.991
f1' for train: 1.226	f1 for test: 0.989
f1' for train: 1.224	f1 for test: 0.988
f1' for train: 1.216	f1 for test: 0.986
f1' for train: 1.212	f1 for test: 0.985
f1' for train: 1.207	f1 for test: 0.982
f1' for train: 1.208	f1 for test: 0.981
f1' for train: 1.205	f1 for test: 0.980
f1' for train: 1.205	f1 for test: 0.979
f1' for train: 1.205	f1 for test: 0.978
f1' for train: 1.204	f1 for test: 0.978
f1' for train: 1.203	f1 for test: 0.976
f1' for train: 1.201	f1 for test: 0.976
f1' for train: 1.202	f1 for test: 0.975
f1' for train: 1.201	f1 for test: 0.975
f1' for train: 1.201	f1 for test: 0.975


### Visualize the result

In [46]:
# Test points with the labels used as the reference in the F1 evaluation.
plot_x_y(test_xs, test_ys)

In [47]:
# Test points with the labels predicted by the classifier selected via F1'.
plot_x_y(test_xs, test_ys_hat)