# Two-Step Techniques (Spy & NB)

This method learns $P(y|x)$ with a two-step technique (step 1: Spy; step 2: Gaussian naive Bayes).

In [None]:
import numpy as np
import pandas as pd

### Load the dataset

In [None]:
from data import load_scar, load_sar, load_pg

# load_scar() yields four values. train and test are later unpacked into
# (xs, ys, ss, es) arrays; valid and c are not used in the cells shown here.
# NOTE(review): c is presumably the label frequency P(s=1|y=1) -- confirm in data.py.
train, valid, test, c = load_scar()

### Step1: Spy

#### Choose "spy" data from labeled data

In [None]:
# Fix the global NumPy RNG so the spy selection below is reproducible.
np.random.seed(0)

# Unpack the training split; this step only reads train_ss (label indicator).
train_xs, train_ys, train_ss, train_es = train

# Positions of the labeled examples (s == 1).
labeled_data_indexes = np.nonzero(train_ss == 1)[0]

# Fraction of the labeled examples that is held out as "spies".
spy_ratio = 0.1
n_spy = round(len(labeled_data_indexes) * spy_ratio)

# Draw the spies without replacement, then mark them in an indicator array
# that has the same shape and dtype as train_ss (1 = spy, 0 = not a spy).
spy_data_indexes = np.random.choice(labeled_data_indexes, size=n_spy, replace=False)
spy = np.zeros_like(train_ss)
spy[spy_data_indexes] = 1

#### Learn a non-traditional classifier $P(s'|x)$

In [None]:
# s' (written s_ in this notebook): the label indicator obtained by treating
# the spy examples as unlabeled, i.e. s' = s AND (NOT spy).
new_train_ss = np.bitwise_and(train_ss, 1 - spy)

In [None]:
from sklearn.linear_model import LogisticRegression

# Non-traditional classifier: logistic regression trained to predict s'
# (labeled-and-not-spy) from the features.
_clf = LogisticRegression(random_state=0)
# fit() returns the estimator itself; rebinding keeps the cell output empty.
_clf = _clf.fit(X=train_xs, y=new_train_ss)

#### Find $P_{spymin} = \min P(s'|x, spy=1)$

In [None]:
# Score every training example with the second column of predict_proba
# (presumably the s' = 1 class; confirm against _clf.classes_).
train_ss_prob = _clf.predict_proba(train_xs)[:, 1]

# P_spymin: the smallest score assigned to any spy example.
is_spy = spy.astype(bool)
threshold = np.min(train_ss_prob[is_spy])

#### Create a new dataset, regarding data points satisfying $P(s'|x) < P_{spymin}$ as negative

In [None]:
# x_ (new_train_xs): the positive-labeled and negative-labeled examples taken from train_xs
# y_ (new_train_ys): their labels (1 = positive, 0 = negative)

# Positives: every labeled example (the spies are labeled, so they are included).
data_indexes_pos = labeled_data_indexes
xs_pos = train_xs[data_indexes_pos]
ys_pos = np.ones(len(xs_pos), dtype=np.int32)

# Negatives: unlabeled examples (s == 0) that score below the spy threshold.
# The s == 0 condition matters: without it, a labeled (non-spy) positive whose
# score happens to fall under the threshold would be added a second time with
# the opposite label, giving the step-2 classifier contradictory targets.
is_reliable_negative = (train_ss == 0) & (train_ss_prob < threshold)
data_indexes_neg = np.where(is_reliable_negative)[0]
xs_neg = train_xs[data_indexes_neg]
ys_neg = np.zeros(len(xs_neg), dtype=np.int32)

# Stack negatives first, then positives; labels follow the same order.
new_train_xs = np.concatenate([xs_neg, xs_pos])
new_train_ys = np.concatenate([ys_neg, ys_pos])

### Visualize the newly created dataset

In [None]:
from utils import plot_x_y, plot_x_s

In [None]:
# Plot the training points together with the spy indicator.
# NOTE(review): plot_x_s comes from utils; presumably it colors the points by
# the second argument -- confirm against utils.py.
plot_x_s(train_xs, spy)

In [None]:
# Plot the step-1 dataset (reliable negatives + labeled positives) with its labels.
# NOTE(review): plot_x_y comes from utils; its exact rendering is not shown here.
plot_x_y(new_train_xs, new_train_ys)

### Step2: Learn a Gaussian NB classifier P(y_|x_)
Now we have positive-labeled data ($s=1$), negative-labeled data ($s=0$ and $P(s'|x) < P_{spymin}$), and unlabeled data (everything else).
We learn a classifier using only the positive-labeled and negative-labeled data.

In [None]:
from sklearn.naive_bayes import GaussianNB

# Step-2 classifier: Gaussian naive Bayes trained on the dataset built in step 1.
clf = GaussianNB()
# fit() returns the estimator itself; rebinding keeps the cell output empty.
clf = clf.fit(X=new_train_xs, y=new_train_ys)

### Predict

In [None]:
# Unpack the test split (four fields, like the training split).
test_xs, test_ys, test_ss, test_es = test

# Second column of predict_proba for each test example
# (presumably the y = 1 class; confirm against clf.classes_).
test_ys_prob = clf.predict_proba(test_xs)[:, 1]

# Hard class predictions for the same examples.
test_ys_hat = clf.predict(test_xs)

### Evaluate the performance

In [None]:
from sklearn.metrics import f1_score

# F1 score of the predicted labels (test_ys_hat) against test_ys.
# Left as the cell's last expression so the notebook displays the value.
f1_score(test_ys, test_ys_hat)

### Visualize the result

In [None]:
from utils import plot_x_y_proba

In [None]:
# Plot the test points with their predicted labels.
# NOTE(review): the preceding cell imports plot_x_y_proba and test_ys_prob is
# computed earlier, but neither is used here -- confirm whether a probability
# plot was intended in addition to (or instead of) this one.
plot_x_y(test_xs, test_ys_hat)