## Урок 6. Домашнее задание

In [684]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_score,\
                            classification_report, precision_recall_curve, confusion_matrix

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

Воспользуемся для анализа датасетом "Online Shoppers Purchasing Intention Dataset Data Set" (https://archive.ics.uci.edu/ml/datasets/Online+Shoppers+Purchasing+Intention+Dataset), который содержит информацию о онлайн поведении пользователей и о том, произошла ли покупка по итогам посещения онлайн-магазина.

In [685]:
df = pd.read_csv('online_shoppers_intention.csv')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


Отметим, что присутствуют как вещественные, так и категориальные признаки. Переведем целевую переменную, которая показывает произошла ли покупка, в числовой формат.

In [686]:
df['Revenue'] = np.where(df['Revenue'] == False, 0, 1)
df['Revenue'].value_counts()

0    10422
1     1908
Name: Revenue, dtype: int64

2. сделать feature engineering

In [687]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df, df['Revenue'], random_state=42)

In [688]:
continuous_columns = ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 
                       'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 
                       'SpecialDay']
categorical_columns = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']

Воспользуемся пайплайном с энкодингом категориальных признаков, который был представлен на лекциях ранее. Вещественные признаки оставляем неизменными.

In [689]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [690]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))
    
    
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

3. обучить любой классификатор (какой вам нравится)

Построим пайплайн на базе градиентного бустинга.

In [691]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(random_state = 42)),
])

In [692]:
#обучаем пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Month',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Month')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Month'))])),
                                                ('OperatingSystems',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='OperatingSystems')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='OperatingSystems'))])),
                                                ('Browser',
                                                 Pipeline(steps=[('sel

In [693]:
#построим предсказания для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.12024288, 0.3125914 , 0.64039381, 0.61526673, 0.31942637,
       0.01281751, 0.00934008, 0.01087183, 0.52131734, 0.02020328])

Найдем оптимальный порог, при котором достигается максимальное значение метрики `F1-score`.

In [694]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.309464, F-Score=0.682, Precision=0.638, Recall=0.732


In [695]:
print(f'ROC-AUC score={roc_auc_score(y_test, preds):.3f}')

ROC-AUC score=0.931


4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть.
5. применить random negative sampling для построения классификатора в новых условиях

Реализуем функцию, которая будет выбирать фиксированную долю _k_ наблюдений из положительного класса и переразмечать оставшуюся выборку. Затем будет производиться семплирование из нее, таким образом на выходе получаем обучающий и тестовый датасеты для обучения lookalike модели.

In [696]:
def create_datasets(df, k):
    Z = df.copy()
    # Индексы наблюдений с положительным классом
    pos_ind = list(Z[Z['Revenue'] == 1].index)
    # Перемешаем их для случайного семплирования
    np.random.shuffle(pos_ind)
    pos_sample_len = int(np.ceil(k * len(pos_ind)))
    pos_sample = pos_ind[:pos_sample_len]

    print(f'Используем {pos_sample_len}/{len(pos_ind)} положительных наблюдений,'
          f' оставшуюся часть выборки делаем неразмеченной.')
    
    Z['class'] = -1
    Z.loc[pos_sample,'class'] = 1
    
    # Засемплируем выборку перед созданием обучающего и тестового датасетов
    Z = Z.sample(frac=1)
    new_pos_sample_len = len(Z[Z['class']==1])
    # Семплируем неразмеченные наблюдения для создания обучающей выборки
    neg_sample = Z[Z['class']==-1][:new_pos_sample_len]
    pos_sample = Z[Z['class']==1]
    # Обучающая выборка из положительных и неразмеченных наблюдений
    Z_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    # Тестовая выборка из оставшихся наблюдений
    Z_test = Z[Z['class']==-1][new_pos_sample_len:]
    
    print(f'Объем обучающей выборки - {Z_train.shape[0]} наблюдений. '
          f'Объем тестовой выборки - {Z_test.shape[0]} наблюдений.')
    
    return Z_train, Z_test

In [697]:
Z_train, Z_test = create_datasets(df, k=0.25)

Используем 477/1908 положительных наблюдений, оставшуюся часть выборки делаем неразмеченной.
Объем обучающей выборки - 954 наблюдений. Объем тестовой выборки - 11376 наблюдений.


In [698]:
yy_train = Z_train['Revenue']
yy_test = Z_test['Revenue']

In [699]:
Z_train.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,class
11215,2,17.333333,0,0.0,89,3401.3,0.002222,0.014836,5.133912,0.0,Nov,2,2,1,2,Returning_Visitor,True,0,-1
6625,7,168.7,0,0.0,34,1062.704762,0.005405,0.01372,0.0,0.0,Sep,2,2,3,2,New_Visitor,False,0,-1
12081,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Nov,4,2,1,8,New_Visitor,False,0,-1
10757,0,0.0,0,0.0,9,87.916667,0.0,0.003175,0.0,0.0,Nov,8,2,5,1,Returning_Visitor,True,0,-1
8727,1,5.0,0,0.0,47,1480.233333,0.010417,0.027824,15.156438,0.0,Nov,2,2,1,3,Returning_Visitor,True,1,1


In [700]:
pipeline.fit(Z_train, yy_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Month',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Month')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Month'))])),
                                                ('OperatingSystems',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='OperatingSystems')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='OperatingSystems'))])),
                                                ('Browser',
                                                 Pipeline(steps=[('sel

In [701]:
#построим предсказания для тестовой выборки
new_preds = pipeline.predict_proba(Z_test)[:, 1]
new_preds[:10]

array([0.1352714 , 0.01467536, 0.08250655, 0.54775675, 0.0437458 ,
       0.03967769, 0.97264429, 0.2875926 , 0.30139959, 0.10084147])

In [702]:
new_precision, new_recall, new_thresholds = precision_recall_curve(yy_test, new_preds)

new_fscore = (2 * new_precision * new_recall) / (new_precision + new_recall)
# locate the index of the largest f score
new_ix = np.argmax(new_fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (new_thresholds[new_ix], 
                                                                        new_fscore[new_ix],
                                                                        new_precision[new_ix],
                                                                        new_recall[new_ix]))

Best Threshold=0.804652, F-Score=0.631, Precision=0.563, Recall=0.718


In [703]:
print(f'ROC-AUC score={roc_auc_score(yy_test, new_preds):.3f}')

ROC-AUC score=0.922


6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [704]:
metrics = pd.DataFrame([['GradientBoosting', precision[ix], recall[ix], fscore[ix], 
                         roc_auc_score(y_test, preds)], 
                        ['RandomNegativeSampling', new_precision[new_ix], new_recall[new_ix], 
                         new_fscore[new_ix], roc_auc_score(yy_test, new_preds)]], 
                       columns=['method', 'precision', 'recall', 'fscore', 'roc_auc'])
metrics

Unnamed: 0,method,precision,recall,fscore,roc_auc
0,GradientBoosting,0.638146,0.732106,0.681905,0.931355
1,RandomNegativeSampling,0.563492,0.71769,0.631312,0.922342


Все метрики оказались хуже, что вполне ожидаемо для данного подхода, так как обучающая выборка значительно меньше.

7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

##### доля положительных наблюдений - 10%

In [705]:
Z_train_10, Z_test_10 = create_datasets(df, k=0.1)
yy_train_10 = Z_train_10['Revenue']
yy_test_10 = Z_test_10['Revenue']
# обучаем пайплайн
pipeline.fit(Z_train_10, yy_train_10)

Используем 191/1908 положительных наблюдений, оставшуюся часть выборки делаем неразмеченной.
Объем обучающей выборки - 382 наблюдений. Объем тестовой выборки - 11948 наблюдений.


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Month',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Month')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Month'))])),
                                                ('OperatingSystems',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='OperatingSystems')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='OperatingSystems'))])),
                                                ('Browser',
                                                 Pipeline(steps=[('sel

In [706]:
Z_train_10.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,class
9740,2,136.5,0,0.0,8,1261.5,0.0,0.01,0.0,0.0,Dec,8,1,3,2,Returning_Visitor,True,0,-1
3037,3,40.0,0,0.0,19,1051.5,0.008333,0.030556,12.18,0.6,May,1,1,4,3,Returning_Visitor,False,1,1
8833,15,543.354907,7,243.283333,169,6920.973243,0.005269,0.01677,0.742943,0.0,Nov,1,1,3,2,Returning_Visitor,True,0,-1
2765,6,86.5,1,2.0,39,1050.708333,0.021163,0.018042,13.402983,0.0,May,3,2,6,3,Returning_Visitor,False,1,1
8293,0,0.0,0,0.0,53,2052.125,0.0,0.003922,52.49862,0.0,Nov,2,4,2,2,New_Visitor,True,1,1


In [707]:
new_preds_10 = pipeline.predict_proba(Z_test_10)[:, 1]
new_precision_10, new_recall_10, new_thresholds_10 = precision_recall_curve(yy_test_10, new_preds_10)

new_fscore_10 = (2 * new_precision_10 * new_recall_10) / (new_precision_10 + new_recall_10)
# locate the index of the largest f score
new_ix_10 = np.argmax(new_fscore_10)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (new_thresholds_10[new_ix_10], 
                                                                        new_fscore_10[new_ix_10],
                                                                        new_precision_10[new_ix_10],
                                                                        new_recall_10[new_ix_10]))
print(f'ROC-AUC score={roc_auc_score(yy_test_10, new_preds_10):.3f}')

Best Threshold=0.779968, F-Score=0.618, Precision=0.537, Recall=0.728
ROC-AUC score=0.898


##### доля положительных наблюдений - 40%

In [708]:
Z_train_40, Z_test_40 = create_datasets(df, k=0.4)
yy_train_40 = Z_train_40['Revenue']
yy_test_40 = Z_test_40['Revenue']
# обучаем пайплайн
pipeline.fit(Z_train_40, yy_train_40)

Используем 764/1908 положительных наблюдений, оставшуюся часть выборки делаем неразмеченной.
Объем обучающей выборки - 1528 наблюдений. Объем тестовой выборки - 10802 наблюдений.


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Month',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Month')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Month'))])),
                                                ('OperatingSystems',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='OperatingSystems')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='OperatingSystems'))])),
                                                ('Browser',
                                                 Pipeline(steps=[('sel

In [709]:
Z_train_40.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,class
601,6,99.605882,0,0.0,83,2585.61835,0.0,0.007619,6.871309,0.0,Mar,2,2,2,2,Returning_Visitor,False,1,1
8739,3,135.5,0,0.0,30,2040.166667,0.0,0.022727,4.350514,0.0,Dec,2,2,8,2,Returning_Visitor,False,1,1
5821,0,0.0,0,0.0,29,1970.0,0.0,0.0,100.725605,0.0,Oct,4,1,5,2,Returning_Visitor,False,1,1
11127,6,145.0,0,0.0,85,1704.580556,0.00293,0.025399,0.0,0.0,Nov,1,2,3,2,Returning_Visitor,False,1,1
11935,0,0.0,0,0.0,18,353.083333,0.022222,0.05,0.0,0.0,Nov,1,1,1,2,Returning_Visitor,True,1,1


In [710]:
new_preds_40 = pipeline.predict_proba(Z_test_40)[:, 1]
new_precision_40, new_recall_40, new_thresholds_40 = precision_recall_curve(yy_test_40, new_preds_40)

new_fscore_40 = (2 * new_precision_40 * new_recall_40) / (new_precision_40 + new_recall_40)
# locate the index of the largest f score
new_ix_40 = np.argmax(new_fscore_40)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (new_thresholds_40[new_ix_40], 
                                                                        new_fscore_40[new_ix_40],
                                                                        new_precision_40[new_ix_40],
                                                                        new_recall_40[new_ix_40]))
print(f'ROC-AUC score={roc_auc_score(yy_test_40, new_preds_40):.3f}')

Best Threshold=0.883352, F-Score=0.608, Precision=0.582, Recall=0.636
ROC-AUC score=0.928


In [711]:
metrics = metrics.append(pd.DataFrame([['RandomNegativeSampling-10%', new_precision_10[new_ix_10], new_recall_10[new_ix_10], 
                              new_fscore_10[new_ix_10], roc_auc_score(yy_test_10, new_preds_10)],
                             ['RandomNegativeSampling-40%', new_precision_40[new_ix_40], new_recall_40[new_ix_40], 
                              new_fscore_40[new_ix_40], roc_auc_score(yy_test_40, new_preds_40)]], 
                            columns=['method', 'precision', 'recall', 'fscore', 'roc_auc']), ignore_index=True)
metrics

Unnamed: 0,method,precision,recall,fscore,roc_auc
0,GradientBoosting,0.638146,0.732106,0.681905,0.931355
1,RandomNegativeSampling,0.563492,0.71769,0.631312,0.922342
2,RandomNegativeSampling-10%,0.537255,0.728293,0.618355,0.898129
3,RandomNegativeSampling-40%,0.582333,0.636364,0.60815,0.927576


Видим, что увеличение доли положительных наблюдений, которая семплируется в начале построения модели RandomNegativeSampling, ведет к увеличению обучающей выборки и к более высоким значениям `ROC-AUC score`. Для других метрик динамика не столь однозначна. Например, как видим при увеличении доли положительных наблюдений вырос показатель точности, но снизился показатель полноты.