# Практическое задание к уроку "Задача look-alike"

Домашнее задание
взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)


In [66]:
import pandas as pd
import numpy as np
from my_pipeline_selectors import FeatureSelector, OHEEncoder, NumberSelector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler

In [67]:
passengers_test = pd.read_csv('./test.csv')
passengers_train = pd.read_csv('./train.csv')
passengers_train_copy = passengers_train.copy()
passengers_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [68]:
passengers_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

**сделать feature engineering**

In [69]:
categorical_columns = ['Embarked', 'Sex']
continuous_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
        ('selector', FeatureSelector(column=cat_col)),
        ('ohe', OHEEncoder(key=cat_col))
    ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
        ('selector', NumberSelector(key=cont_col)),
        ('standardizer', StandardScaler())
    ])
    final_transformers.append((cont_col, cont_transformer))

In [70]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])

In [71]:
from xgboost import XGBClassifier

xgboost_pipeline = Pipeline([
    ('features', feats),
    ('classifier', XGBClassifier(objective="binary:logistic", use_label_encoder=False, random_state=42)),
])

**обучить любой классификатор (какой вам нравится) и посчитать метрики качества (roc auc, pr/rec/f1, logloss)**

In [72]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

X = passengers_train.drop(columns=['Survived', 'Name', 'PassengerId', 'Ticket', 'Cabin'])
y = passengers_train['Survived']

xgboost_precision_scores = cross_val_score(xgboost_pipeline, X, y, cv=cv, scoring="precision_macro").mean()
xgboost_recall_scores = cross_val_score(xgboost_pipeline, X, y, cv=cv, scoring="recall_macro").mean()
xgboost_roc_auc_scores = cross_val_score(xgboost_pipeline, X, y, cv=cv, scoring="roc_auc").mean()
xgboost_f1_score_scores = cross_val_score(xgboost_pipeline, X, y, cv=cv, scoring="f1_macro").mean()



In [73]:
data = {
    'xgboost': [
        np.mean(xgboost_precision_scores),
        np.mean(xgboost_recall_scores),
        np.mean(xgboost_roc_auc_scores),
        np.mean(xgboost_f1_score_scores)
    ]
}
before = pd.DataFrame.from_dict(data, orient='index', columns=['precision', 'recall', 'roc_auc', 'f1_score'])
before

Unnamed: 0,precision,recall,roc_auc,f1_score
xgboost,0.798489,0.792155,0.861579,0.794659


**далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). 
Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть**

Тут полностью размеченный dataset. Представим, что нам неизвестны негативы и часть позитивов. 

In [74]:
mod_data = passengers_train_copy
# get the indices of the positives samples
pos_ind = np.where(mod_data['Survived'].values == 1)[0]
# shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')

pos_sample = pos_ind[:pos_sample_len]

Using 86/342 as positives and unlabeling the rest


In [75]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1

print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    805
 1     86
Name: class_test, dtype: int64


In [76]:
mod_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,class_test
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,-1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,-1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,-1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,-1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,-1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,-1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,-1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,-1


In [77]:
x_data = mod_data.iloc[:,:-1].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-1].values # original class

применить random negative sampling для построения классификатора в новых условиях


In [78]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]

print(neg_sample.shape, pos_sample.shape)

sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(86, 13) (86, 13)


In [79]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0))
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

In [80]:
columns_to_drop = ['Survived']
target = 'class_test'

X = sample_train.drop(columns=['Survived', 'Name', 'PassengerId', 'Ticket', 'Cabin'])
y = sample_train['Survived']

xgboost_precision_scores_after = cross_val_score(xgboost_pipeline, X, y, cv=cv, scoring="precision_macro").mean()
xgboost_recall_scores_after = cross_val_score(xgboost_pipeline, X, y, cv=cv, scoring="recall_macro").mean()
xgboost_roc_auc_scores_after = cross_val_score(xgboost_pipeline, X, y, cv=cv, scoring="roc_auc").mean()
xgboost_f1_score_scores_after = cross_val_score(xgboost_pipeline, X, y, cv=cv, scoring="f1_macro").mean()



In [81]:
data = {
    'xgboost': [
        np.mean(xgboost_precision_scores_after),
        np.mean(xgboost_recall_scores_after),
        np.mean(xgboost_roc_auc_scores_after),
        np.mean(xgboost_f1_score_scores_after)
    ]
}
after = pd.DataFrame.from_dict(data, orient='index', columns=['precision', 'recall', 'roc_auc', 'f1_score'])
after

Unnamed: 0,precision,recall,roc_auc,f1_score
xgboost,0.741373,0.740218,0.8345,0.739487


сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [82]:
before.append(after) 

Unnamed: 0,precision,recall,roc_auc,f1_score
xgboost,0.798489,0.792155,0.861579,0.794659
xgboost,0.741373,0.740218,0.8345,0.739487


поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

при увеличении доли позитивов качество модели растет