### Import packages and load dataset ###

In [29]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression

# Load data:
data = load_breast_cancer()
Xall = data.data
yall = data.target

# Train/test split:
X, Xtest, y, ytest = train_test_split(Xall, yall, test_size=0.25, random_state=123)

### Create PU training data ###

1. y occurence of disease (y=1 present, y=0 absent)
2. s information of diagnosed disease (s=1 diagnosed, s=0 undiagnosed)
3. We do not observe y in our training data, we only have an access to the s variable.

In [30]:
c = 0.5 #true label frequency P(S=1|Y=1): probabiliy of being labelled for positive example
n = X.shape[0]
s = np.zeros(n)

for i in np.arange(0,n,1):
    if y[i]==1:
        s[i]=np.random.binomial(1, c, size=1)



### Methods ###

In [31]:
#Oracle method (assumes the knowledge of true class variable Y)
model = LogisticRegression()
model.fit(X,y)
prob_y_test = model.predict_proba(Xtest)[:,1] 
acc = accuracy_score(ytest, np.where(prob_y_test>0.5,1,0))
print("Accuracy for oracle method:",np.round(acc,4),"\n")

#Naive method (treats unlabelled examples as negative)
model = LogisticRegression()
model.fit(X,s)
prob_y_test = model.predict_proba(Xtest)[:,1] 
acc = accuracy_score(ytest, np.where(prob_y_test>0.5,1,0))
print("Accuracy for naive method:",np.round(acc,4),"\n")

#Method assuming the knowledge of class prior P(Y=1) (prevalence of a disease)
alpha = np.mean(y)
hat_c = np.mean(s)/alpha
model = LogisticRegression()
model.fit(X,s)
prob_y_test_naive = model.predict_proba(Xtest)[:,1]
prob_y_test = prob_y_test_naive/hat_c
acc = accuracy_score(ytest, np.where(prob_y_test>0.5,1,0))
print("Accuracy for calibrated method based on known class prior:",np.round(acc,4),"\n")


#Method based on simple estimator of c:
model = LogisticRegression()
model.fit(X,s)
prob_y_train = model.predict_proba(X)[:,1]
hat_c = np.max(prob_y_train)
prob_y_test_naive = model.predict_proba(Xtest)[:,1]
prob_y_test = prob_y_test_naive/hat_c
acc = accuracy_score(ytest, np.where(prob_y_test>0.5,1,0))
print("Accuracy for calibrated method with estimated c:",np.round(acc,4),"\n")

Accuracy for oracle method: 0.986 

Accuracy for naive method: 0.6783 

Accuracy for calibrated method: 0.9441 

Accuracy for calibrated method wit estimated c: 0.8531 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### Aditional tasks ###

1. Change c =0.1, 0.2, 0.8, 0.9 and repeat the above steps
2. Repeat the experiments for other datasets
3. [Optional] Create PU dataset in which probability of labelling positive example depend on feature vector