# Incorporating labeling probability

This method learns $P(y|x)$ from PU data by incorporating the labeling mechanism.

In [None]:
import numpy as np
import pandas as pd

### Load the dataset

In [None]:
from data import load_scar, load_sar, load_pg

# Load the train / valid / test splits together with the constant c that the
# example weights are computed from further down. train and test are each
# unpacked later as a 4-tuple (xs, ys, ss, es).
# NOTE(review): c is presumably the label frequency P(s=1|y=1) under the SCAR
# assumption — confirm against data.load_scar.
train, valid, test, c = load_scar()

### Learn a non-traditional classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Unpack the training split. Only the features (xs) and the label
# indicator (ss, 1 = labeled) are needed to fit this first model.
# NOTE(review): es is presumably the per-example propensity score — confirm.
(train_xs, train_ys, train_ss, train_es) = train

# "Non-traditional" classifier: it is trained to predict the label indicator s
# (not the true class y), so its predict_proba estimates P(s=1|x).
_clf = LogisticRegression(random_state=0)
_clf.fit(train_xs, train_ss)

### Create a new dataset

- Create a new dataset by regarding labeled data as positive data and unlabeled data as both positive and negative data.
- Assign a weight to each example as follows:
  - Assign $1$ to positive data derived from labeled data.
  - Assign $w^+ = P(y=1|x, s=0) = \frac{1 - c}{c}\frac{P(s=1|x)}{1 - P(s=1|x)}$ to positive data derived from unlabeled data, where $P(s=1|x)$ is estimated by the non-traditional classifier learned above.
  - Assign $w^- = 1 - w^+$ to negative data derived from unlabeled data.

In [None]:
# Split the training features into labeled (s == 1) and unlabeled (s == 0) rows.
# Assumes train_xs and train_ss are NumPy arrays aligned on their first axis
# (the boolean-mask indexing below requires it) — TODO confirm against the loader.
train_xs_labeled = train_xs[train_ss == 1]
train_xs_unlabeled = train_xs[train_ss == 0]

# g(x) = P(s=1|x) for every unlabeled example, from the non-traditional classifier.
train_ss_prob_unlabeled = _clf.predict_proba(train_xs_unlabeled)[:, 1]

# w+ = ((1 - c) / c) * g(x) / (1 - g(x)) is the weight of the "positive" copy
# of an unlabeled example and w- = 1 - w+ the weight of its "negative" copy.
# The raw plug-in value is not guaranteed to stay within [0, 1]: it exceeds 1
# whenever g(x) > c (making w- negative) and is infinite when g(x) == 1.
# Flooring the denominator and clipping keeps both weights valid, and leaves
# every weight that was already inside [0, 1] unchanged.
odds_unlabeled = train_ss_prob_unlabeled / np.maximum(
    1 - train_ss_prob_unlabeled, np.finfo(float).eps
)
weight_pos_unlabeled = np.clip(((1 - c) / c) * odds_unlabeled, 0.0, 1.0)
weight_neg_unlabeled = 1.0 - weight_pos_unlabeled

n_labeled = len(train_xs_labeled)
n_unlabeled = len(train_xs_unlabeled)

# Row layout (same order as a per-example append loop would produce):
#   1. every labeled example once, as positive, with weight 1;
#   2. every unlabeled example twice in a row: first as positive (weight w+),
#      then as negative (weight w-).
new_train_xs = np.concatenate(
    [train_xs_labeled, np.repeat(train_xs_unlabeled, 2, axis=0)]
)
new_train_ys = np.concatenate(
    [np.ones(n_labeled, dtype=int), np.tile([1, 0], n_unlabeled)]
)
sample_weight = np.concatenate(
    [
        np.ones(n_labeled),
        # (n_unlabeled, 2) -> flat [w0+, w0-, w1+, w1-, ...]
        np.column_stack([weight_pos_unlabeled, weight_neg_unlabeled]).ravel(),
    ]
)

### Learn a classifier

In [None]:
# Final classifier: trained on the re-labeled dataset, with each row's
# contribution scaled by its sample weight.
clf = LogisticRegression(random_state=0)
clf.fit(new_train_xs, new_train_ys, sample_weight=sample_weight)

### Predict

In [None]:
# Unpack the test split; only the features are needed for prediction here.
(test_xs, test_ys, test_ss, test_es) = test

# Probability assigned to class 1 (second column of predict_proba), followed
# by the corresponding hard 0/1 predictions.
test_ys_prob = clf.predict_proba(test_xs)[:, 1]
test_ys_hat = clf.predict(test_xs)

### Evaluate the performance

In [None]:
from sklearn.metrics import f1_score

# F1 score of the hard predictions against the true test labels. Left as a
# bare expression so the notebook shows the value as the cell output.
f1_score(test_ys, test_ys_hat)

### Visualize the result

In [None]:
from utils import plot_x_y, plot_x_y_proba

In [None]:
# Test features plotted with their true labels (reference picture).
plot_x_y(test_xs, test_ys)

In [None]:
# Same plot, but with the labels predicted by the final classifier.
plot_x_y(test_xs, test_ys_hat)

In [None]:
# Test features plotted with the predicted probability of class 1.
plot_x_y_proba(test_xs, test_ys_prob)