1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. Обучить любой классификатор (какой вам нравится)
3. Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть
4. Применить random negative sampling для построения классификатора в новых условиях
5. Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import (f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix)

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("heart.csv")
df.head(5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
df['output'].value_counts()

1    165
0    138
Name: output, dtype: int64

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['output']), df['output'], random_state=0)

In [5]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [6]:
metrics = pd.DataFrame(columns=['thresholds', 'f-score', 'precision', 'recall', 'ROC AUC'])
metrics

Unnamed: 0,thresholds,f-score,precision,recall,ROC AUC


In [7]:
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([0.04418177, 0.84491847, 0.84801313, 0.02159199, 0.1443193 ,
       0.38381786, 0.04059292, 0.11447757, 0.00503596, 0.00435948])

In [8]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
roc_auc_score(y_test, preds)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.469055, F-Score=0.882, Precision=0.820, Recall=0.953


In [9]:
metrics = metrics.append({
    'model': 'first',
    'thresholds': thresholds[ix],
    'f-score': fscore[ix],
    'precision': precision[ix],
    'recall': recall[ix],
    'ROC AUC': roc_auc_score(y_test, preds)
}, ignore_index=True)

metrics

  metrics = metrics.append({


Unnamed: 0,thresholds,f-score,precision,recall,ROC AUC,model
0,0.469055,0.88172,0.82,0.953488,0.887949,first


In [10]:
mod_data = X_train.copy()
mod_data['output'] = y_train
mod_data.head()

pos_ind = mod_data[mod_data['output'] == 1].sample(frac=1, random_state=42).index

perc = 0.25
pos_sample_len = int(np.ceil(perc * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 31/122 as positives and unlabeling the rest


In [11]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    196
 1     31
Name: class_test, dtype: int64


In [12]:
mod_data.head(5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,class_test
173,58,1,2,132,224,0,0,173,0,3.2,2,2,3,0,-1
261,52,1,0,112,230,0,1,160,0,0.0,2,1,2,0,-1
37,54,1,2,150,232,0,0,165,0,1.6,2,0,3,1,1
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3,1,-1
166,67,1,0,120,229,0,0,129,1,2.6,1,2,3,0,-1


In [13]:
mod_data = mod_data.sample(frac=1)


data_N = mod_data[mod_data['class_test'] == -1]
data_P = mod_data[mod_data['class_test'] == 1]

neg_sample = data_N[:data_P.shape[0]]
sample_test = data_N[data_P.shape[0]:]
pos_sample = data_P.copy()

print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(31, 15) (31, 15)


In [14]:
sample_train

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,class_test
193,60,1,0,145,282,0,0,142,1,2.8,1,2,3,0,-1
30,41,0,1,105,198,0,1,168,0,0.0,2,1,2,1,-1
206,59,1,0,110,239,0,0,142,1,1.2,1,1,3,0,-1
39,65,0,2,160,360,0,0,151,0,0.8,2,0,2,1,1
67,45,0,1,130,234,0,0,175,0,0.6,1,0,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,51,0,2,140,308,0,0,142,0,1.5,2,1,2,1,-1
170,56,1,2,130,256,1,0,142,1,0.6,1,1,1,0,-1
18,43,1,0,150,247,0,1,171,0,1.5,2,0,2,1,1
93,54,0,1,132,288,1,0,159,1,0.0,2,1,2,1,1


In [15]:
logreg = LogisticRegression()
sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0
logreg.fit(sample_train.drop(columns=['class_test', 'output']), 
          sample_train['class_test'])

preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.27640824, 0.55763875, 0.3877974 , 0.11995442, 0.08290531,
       0.47629007, 0.17206064, 0.28556153, 0.01240449, 0.01055714])

In [16]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
roc_auc_score(y_test, preds)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.413135, F-Score=0.864, Precision=0.844, Recall=0.884


In [17]:
metrics = metrics.append({
    'model': 'pu',
    'thresholds': thresholds[ix],
    'f-score': fscore[ix],
    'precision': precision[ix],
    'recall': recall[ix],
    'ROC AUC': roc_auc_score(y_test, preds)
}, ignore_index=True)

metrics

  metrics = metrics.append({


Unnamed: 0,thresholds,f-score,precision,recall,ROC AUC,model
0,0.469055,0.88172,0.82,0.953488,0.887949,first
1,0.413135,0.863636,0.844444,0.883721,0.854123,pu
