# Empirical risk minimization

This method builds a new, weighted dataset from positive-unlabeled (PU) data such that a classifier trained on it is, in expectation, equal to a classifier trained on a fully labeled dataset.

In [None]:
import numpy as np
import pandas as pd

### Load the dataset

In [None]:
from data import load_scar, load_sar, load_pg

# load_scar() is unpacked into four values. Of these, only `train` and `test`
# are consumed by the cells shown below; `valid` and `c` are unpacked but not
# referenced again here. `load_sar` and `load_pg` are imported but not called
# in this cell.
train, valid, test, c = load_scar()

### Create a new dataset

- Build a new, weighted dataset from the PU data such that a classifier trained on it is, in expectation, equal to a classifier trained on a fully labeled dataset.
- In the newly created dataset, with $e(x)$ denoting the propensity score (the probability that a positive example $x$ is labeled):
  - Each labeled example is used as a positive with weight $\frac{1}{e(x)}$.
  - Each labeled example is also used as a negative with weight $1 - \frac{1}{e(x)}$ (which is non-positive, since $e(x) \le 1$).
  - Each unlabeled example is used as a negative with weight $1$.

In [None]:
# Unpack the training split. Only the inputs, the labeling indicator
# (train_ss) and the per-sample e(x) values (train_es) are used in this cell.
train_xs, train_ys, train_ss, train_es = train

# Partition on the labeling indicator: 1 selects labeled rows, 0 unlabeled rows.
train_xs_labeled = train_xs[train_ss == 1]
train_es_labeled = train_es[train_ss == 1]
train_xs_unlabeled = train_xs[train_ss == 0]

# Group 1 -- every labeled sample as a positive ($y=1$) with weight 1 / e(x).
new_train_xs = [x for x, _ in zip(train_xs_labeled, train_es_labeled)]
new_train_ys = [1 for _ in zip(train_xs_labeled, train_es_labeled)]
sample_weight = [1 / e for _, e in zip(train_xs_labeled, train_es_labeled)]

# Group 2 -- the same labeled samples again as negatives ($y=0$) with weight
# 1 - 1 / e(x); together with group 1 this corrects for the labeling mechanism.
new_train_xs.extend(x for x, _ in zip(train_xs_labeled, train_es_labeled))
new_train_ys.extend(0 for _ in zip(train_xs_labeled, train_es_labeled))
sample_weight.extend(
    1 - 1 / e for _, e in zip(train_xs_labeled, train_es_labeled)
)

# Group 3 -- every unlabeled sample as a negative ($y=0$) with unit weight.
new_train_xs.extend(train_xs_unlabeled)
new_train_ys.extend(0 for _ in train_xs_unlabeled)
sample_weight.extend(1 for _ in train_xs_unlabeled)

### Learn a classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Construct the estimator, then fit it on the re-weighted dataset built above.
# fit() returns the estimator itself, so `clf` ends up as the fitted model.
clf = LogisticRegression(random_state=0)
clf = clf.fit(new_train_xs, new_train_ys, sample_weight=sample_weight)

### Predict

In [None]:
# Unpack the test split; only test_xs is needed to make predictions here.
test_xs, test_ys, test_ss, test_es = test

# Hard class predictions from the fitted classifier.
test_ys_hat = clf.predict(test_xs)
# Second column of predict_proba. The classifier was trained on labels 0 and 1,
# so this column corresponds to the estimated probability of y=1.
test_ys_prob = clf.predict_proba(test_xs)[:, 1]

### Evaluate the performance

In [None]:
from sklearn.metrics import f1_score

# F1 score of the predicted labels against the test labels
# (argument order is y_true, y_pred). As the last expression in the cell,
# the value is displayed as the cell output.
f1_score(test_ys, test_ys_hat)

### Visualize the result

In [None]:
from utils import plot_x_y, plot_x_y_proba

In [None]:
# Pass the test inputs and their predicted labels to the project's plot_x_y
# helper (presumably a plot of points by class -- confirm in utils).
plot_x_y(test_xs, test_ys_hat)

In [None]:
# Pass the test inputs and their predicted positive-class probabilities to the
# project's plot_x_y_proba helper (presumably a probability plot -- confirm in utils).
plot_x_y_proba(test_xs, test_ys_prob)