Домашнее задание
1. взять любой набор данных для бинарной классификации (можно скачать один с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 6 (как будет меняться качество модели при уменьшении/увеличении размера P)

Данные взяты отсюда: <br>
https://archive-beta.ics.uci.edu/dataset/45/heart+disease

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import catboost as catb
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [2]:
data = pd.read_csv("processed.cleveland.data", na_values=['?'], header=None)
data.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
                'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


**id:** patient identification number <br>
**age:** age in years <br>
**sex:** sex (1 = male; 0 = female) <br>
**cp:** chest pain type <br>
-- Value 1: typical angina <br>
-- Value 2: atypical angina <br>
-- Value 3: non-anginal pain <br>
-- Value 4: asymptomatic <br>
**trestbps:** resting blood pressure (in mm Hg on admission to the hospital) <br>
**chol:** serum cholestoral in mg/dl<br>
**fbs:** (fasting blood sugar > 120 mg/dl)  (1 = true; 0 = false)<br>
**restecg:** resting electrocardiographic results<br>
-- Value 0: normal<br>
-- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)<br>
-- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria<br>
**thalach:** maximum heart rate achieved<br>
**exang:** exercise induced angina (1 = yes; 0 = no)<br>
**oldpeak** = ST depression induced by exercise relative to rest<br>
**slope:** the slope of the peak exercise ST segment<br>
-- Value 1: upsloping<br>
-- Value 2: flat<br>
-- Value 3: downsloping<br>
**ca:** number of major vessels (0-3) colored by flourosopy<br>
**thal:** 3 = normal; 6 = fixed defect; 7 = reversable defect<br>
**num:** diagnosis of heart disease (angiographic disease status)<br>
Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0)

То есть тут таргетный признак - num, где 0 - отсутствие болезни, 1-4 - наличие(?)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB


In [4]:
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [5]:
data['ca'].fillna(0, inplace=True)
data['thal'].fillna(3, inplace=True)

data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [6]:
int_columns = ['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'ca', 'cp', 'restecg', 'slope', 'thal']
data[int_columns] = data[int_columns].astype(int)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int32  
 1   sex       303 non-null    int32  
 2   cp        303 non-null    int32  
 3   trestbps  303 non-null    int32  
 4   chol      303 non-null    int32  
 5   fbs       303 non-null    int32  
 6   restecg   303 non-null    int32  
 7   thalach   303 non-null    int32  
 8   exang     303 non-null    int32  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int32  
 11  ca        303 non-null    int32  
 12  thal      303 non-null    int32  
 13  num       303 non-null    int64  
dtypes: float64(1), int32(12), int64(1)
memory usage: 19.1 KB


In [7]:
# Feature engineering (ну один feature добавим))))))

data['chol_outlier'] = 0
data.loc[data['chol'] >= 200, 'chol_outlier'] = 1

data.loc[data['num'] != 0, 'num'] = 1

data.drop(['chol'], axis=1, inplace=True)

data.head()

Unnamed: 0,age,sex,cp,trestbps,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,chol_outlier
0,63,1,1,145,1,2,150,0,2.3,3,0,6,0,1
1,67,1,4,160,0,2,108,1,1.5,2,3,3,1,1
2,67,1,4,120,0,2,129,1,2.6,2,2,7,1,1
3,37,1,3,130,0,0,187,0,3.5,3,0,3,0,1
4,41,0,2,130,0,2,172,0,1.4,1,0,3,0,1


In [8]:
target_name = 'num'

X = data.drop(columns=target_name)
y = data[target_name]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [9]:
cat_features_names = ['cp', 'restecg', 'slope', 'thal']

model = catb.CatBoostClassifier(silent=True, random_state=21, cat_features=cat_features_names)

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [10]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 81.36%
roc: 82.33%
recall: 75.00%
precision: 88.89%


In [11]:
mod_data = data.copy()

# получаем индексы положительных образцов
pos_ind = np.where(mod_data.iloc[:,-2] == 1)[0]

# перемешиваем
np.random.shuffle(pos_ind)

# оставляем 25% от числа позитивчиков
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 35/139 as positives and unlabeling the rest


In [12]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    268
 1     35
Name: class_test, dtype: int64


In [13]:
mod_data.head(10)

Unnamed: 0,age,sex,cp,trestbps,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,chol_outlier,class_test
0,63,1,1,145,1,2,150,0,2.3,3,0,6,0,1,-1
1,67,1,4,160,0,2,108,1,1.5,2,3,3,1,1,-1
2,67,1,4,120,0,2,129,1,2.6,2,2,7,1,1,1
3,37,1,3,130,0,0,187,0,3.5,3,0,3,0,1,-1
4,41,0,2,130,0,2,172,0,1.4,1,0,3,0,1,-1
5,56,1,2,120,0,0,178,0,0.8,1,0,3,0,1,-1
6,62,0,4,140,0,2,160,0,3.6,3,2,3,1,1,-1
7,57,0,4,120,0,0,163,1,0.6,1,0,3,0,1,-1
8,63,1,4,130,0,2,147,0,1.4,2,1,7,1,1,1
9,53,1,4,140,1,2,155,1,3.1,3,0,7,1,1,-1


In [16]:
x_data = mod_data.drop(columns=['class_test', 'num'])
y_labeled = mod_data.iloc[:,-1].values # новый класс (P & U)
y_positive = mod_data.iloc[:,-3].values # оригинальный класс

In [18]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(35, 15) (35, 15)


In [19]:
model = catb.CatBoostClassifier(silent=True)

model.fit(sample_train.iloc[:,:-3].values, 
          sample_train.iloc[:,-3].values)
y_predict = model.predict(sample_test.iloc[:,:-3].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 65.19%
roc: 52.19%
recall: 53.09%
precision: 84.43%
