In [1]:
#Importing libraries
from typing import List, Tuple
import numpy as np
import pandas as pd
from typing import List, Optional
from tqdm import tqdm
from sklearn.utils.validation import check_is_fitted
import xgboost as xgb
import catboost as catb
from catboost.utils import get_roc_curve
from catboost import Pool
import lightgbm as lgb

import os
import json
import shap
import dill
import matplotlib.pyplot as plt
from collections import Counter
from PIL import  Image
import missingno as msno
%matplotlib inline

import seaborn as sns #visualization
import featuretools as ft
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization

from featexp import get_univariate_plots

from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold, StratifiedKFold, GroupKFold
from scipy.stats import shapiro, probplot, ttest_ind, mannwhitneyu, chi2_contingency, ks_2samp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, scorer, log_loss
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, f1_score, cohen_kappa_score
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder as le
from pylab import rcParams

rcParams['figure.figsize'] = 8, 5
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 400)
plt.style.use('seaborn-whitegrid')
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
def show_feature_importances(feature_names, feature_importances, get_top=None):
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    feature_importances = feature_importances.sort_values('importance', ascending=False)
       
    plt.figure(figsize = (20, len(feature_importances) * 0.5))
    
    sns.barplot(feature_importances['importance'], feature_importances['feature'])
    
    plt.xlabel('Importance')
    plt.title('Importance of features')
    plt.show()
    
    if get_top is not None:
        return feature_importances['feature'][:get_top].tolist()

In [3]:
def plot_roc_curve(**kwargs):
    res_lst = []
    
    for k, v in kwargs.items():  
        fpr, tpr, _ = roc_curve(v[0], v[1])
        res_lst.append((fpr, tpr, k))
    
    plt.figure(figsize=(20,10))
    for res in res_lst:
        plt.plot(res[0], res[1], label=res[2])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend()
    plt.show()
    
    return res_lst

In [4]:
#Загружаю частично обработанный мною от пропусков в ДЗ2 датасет:
# train = pd.read_csv("assignment2_data/assignment_2_train.csv")
train = pd.read_csv("assignment2_data/train2.csv")
lb = pd.read_csv("assignment2_data/assignment_2_test.csv")

print("train.shape = {} rows, {} cols".format(*train.shape))
print("train.shape = {} rows, {} cols".format(*lb.shape))

train.shape = 179977 rows, 398 cols
train.shape = 100001 rows, 394 cols


In [5]:
#Разбиваем признаки по типам: числовые, нечисловые, цель
num_features = train.select_dtypes("number").columns.to_list()
dum_features = train.select_dtypes("object").columns.to_list()
target = num_features.pop(1)
len(num_features), len(dum_features), target

(379, 18, 'isFraud')

In [6]:
def make_cross_validation(X: pd.DataFrame,
                          y: pd.Series,
                          estimator: object,
                          metric: callable,
                          cv_strategy,
                          params,
                          groups: pd.Series = pd.Series()):
    """
    Кросс-валидация.

    Parameters
    ----------
    X: pd.DataFrame
        Матрица признаков.

    y: pd.Series
        Вектор целевой переменной.

    estimator: callable
        Объект модели для обучения.

    metric: callable
        Метрика для оценки качества решения.
        Ожидается, что на вход будет передана функция,
        которая принимает 2 аргумента: y_true, y_pred.

    cv_strategy: cross-validation generator
        Объект для описания стратегии кросс-валидации.
        Ожидается, что на вход будет передан объект типа
        KFold или StratifiedKFold.
    groups:
        Если в cv_strategy передаем GroupKFold, то нужно передать groups,
        чтобы разделять по этим данным датасет на группы

    Returns
    -------
    oof_score: float
        Значение метрики качества на OOF-прогнозах.

    fold_train_scores: List[float]
        Значение метрики качества на каждом обучающем датасете кросс-валидации.

    fold_valid_scores: List[float]
        Значение метрики качества на каждом валидационном датасете кросс-валидации.

    oof_predictions: np.array
        Прогнозы на OOF.

    """
    estimators, fold_train_scores, fold_valid_scores = [], [], []
    oof_predictions = np.zeros(X.shape[0])
    if len(groups) > 0:
        cv_generator = cv_strategy.split(X, y, groups)
    else:
        cv_generator = cv_strategy.split(X, y)

    for fold_number, (train_idx, valid_idx) in enumerate(cv_generator):
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]
        
        dtrain = xgb.DMatrix(data=x_train, label=y_train)
        dvalid = xgb.DMatrix(data=x_valid, label=y_valid)
        
        model_train = estimator.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=1000,
            early_stopping_rounds=50,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            verbose_eval=False,
            maximize=True,
        )
        y_train_pred = model_train.predict(dtrain)
        y_valid_pred = model_train.predict(dvalid)

        fold_train_scores.append(metric(y_train, y_train_pred))
        fold_valid_scores.append(metric(y_valid, y_valid_pred))
        oof_predictions[valid_idx] = y_valid_pred

        msg = (
            f"Fold: {fold_number+1}, train-observations = {len(train_idx)}, "
            f"valid-observations = {len(valid_idx)}\n"
            f"train-score = {round(fold_train_scores[fold_number], 4)}, "
            f"valid-score = {round(fold_valid_scores[fold_number], 4)}" 
        )
        print(msg)
        print("="*69)
        estimators.append(model_train)

    oof_score = metric(y, oof_predictions)
    print(f"CV-results train: {round(np.mean(fold_train_scores), 4)} +/- {round(np.std(fold_train_scores), 3)}")
    print(f"CV-results valid: {round(np.mean(fold_valid_scores), 4)} +/- {round(np.std(fold_valid_scores), 3)}")
    print(f"OOF-score = {round(oof_score, 4)}")

    return estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions

__Задание 0:__ выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.

- Сделаю валидацию GroupKFold, которая показала самые надежные метрики качества в 3-м ДЗ: 

In [7]:
card_cols = ['card1', 'card4', 'card6']
all_groupby = train.groupby(card_cols, as_index=False).agg({'TransactionID': 'count'})
all_groupby['user_id'] = all_groupby.index
all_groupby.drop('TransactionID', axis=1, inplace=True)
all_groupby.head(2)

Unnamed: 0,card1,card4,card6,user_id
0,1001,visa,debit,0
1,1004,visa,credit,1


In [8]:
train2 = pd.merge(train, all_groupby, on=card_cols, how='left')

#### Подготовка лидерборда для получения оценок метрики качества на нем:

Добавляю новые признаки в датасет лидерборда для получения результатов на нем:

In [9]:
lb.loc[lb['P_emaildomain']=='gmail', 'P_emaildomain'] = 'gmail.com'
lb.loc[lb['R_emaildomain']=='gmail', 'R_emaildomain'] = 'gmail.com'
lb['P_emaildomain'].fillna('unknown', inplace=True)
lb['R_emaildomain'].fillna('unknown', inplace=True)

lb['P_emaildomain_zone'] = lb['P_emaildomain'].apply(
    lambda x: '.'.join(x.split('.')[1:]) if x!='unknown' else 'unknown'
)
lb['R_emaildomain_zone'] = lb['R_emaildomain'].apply(
    lambda x: '.'.join(x.split('.')[1:]) if x!='unknown' else 'unknown'
)
lb['P_emaildomain_domain'] = lb['P_emaildomain'].apply(lambda x: x.split('.')[0])
lb['R_emaildomain_domain'] = lb['R_emaildomain'].apply(lambda x: x.split('.')[0])
print("train.shape = {} rows, {} cols".format(*lb.shape))

train.shape = 100001 rows, 398 cols


In [10]:
dum_na = lb[dum_features].isnull().sum()
col_na_lst = dum_na.loc[dum_na > 0].index.tolist()
lb[col_na_lst] = lb[col_na_lst].fillna('unknown')
lb[dum_features].isnull().sum()

ProductCD               0
card4                   0
card6                   0
P_emaildomain           0
R_emaildomain           0
M1                      0
M2                      0
M3                      0
M4                      0
M5                      0
M6                      0
M7                      0
M8                      0
M9                      0
P_emaildomain_zone      0
R_emaildomain_zone      0
P_emaildomain_domain    0
R_emaildomain_domain    0
dtype: int64

In [11]:
all_groupby = lb.groupby(card_cols, as_index=False).agg({'TransactionID': 'count'})
all_groupby['user_id'] = all_groupby.index
all_groupby.drop('TransactionID', axis=1, inplace=True)
lb_train_df = pd.merge(lb, all_groupby, on=card_cols, how='left')

Датасет лидерборда для моделей:

In [12]:
y_lb = lb_train_df[target]
lb_train = lb_train_df[num_features + dum_features + ['user_id']].apply(le().fit_transform)
dleaderboard = xgb.DMatrix(
    data=lb_train, label=y_lb
)

- Сделаем кроссвалидацию:

In [13]:
data = train2[num_features + dum_features + ['user_id']].apply(le().fit_transform)
y = train2[target]
groups = train2['user_id']

In [14]:
cv_strategy = GroupKFold(n_splits=5)

In [15]:
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.1,
#     "n_estimators": 1000,
    "reg_lambda": 100,
    "max_depth": 4,
    "gamma": 10,
    "nthread": 6,
    "seed": 27
}

In [16]:
%%time
estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions = make_cross_validation(
    data, y, xgb, metric=roc_auc_score, cv_strategy=cv_strategy, params=params, groups=groups
)

Fold: 1, train-observations = 143981, valid-observations = 35996
train-score = 0.9227, valid-score = 0.8886
Fold: 2, train-observations = 143981, valid-observations = 35996
train-score = 0.9258, valid-score = 0.88
Fold: 3, train-observations = 143982, valid-observations = 35995
train-score = 0.9198, valid-score = 0.8916
Fold: 4, train-observations = 143982, valid-observations = 35995
train-score = 0.9189, valid-score = 0.8908
Fold: 5, train-observations = 143982, valid-observations = 35995
train-score = 0.923, valid-score = 0.8857
CV-results train: 0.922 +/- 0.002
CV-results valid: 0.8873 +/- 0.004
OOF-score = 0.8876
Wall time: 18min 53s


In [17]:
lb_roc_auc_lst2 = []
for est in estimators:
    y_lb_pred = est.predict(dleaderboard)
    lb_roc_auc_lst2.append(roc_auc_score(y_lb, y_lb_pred))
print(lb_roc_auc_lst2)
print('Leaderboard score', round(np.mean(lb_roc_auc_lst2), 4),  '+/-', round(np.std(lb_roc_auc_lst2), 3))

[0.8581529734586558, 0.8599733635128151, 0.8593279332029686, 0.855988854179111, 0.8585331642380771]
Leaderboard score 0.8584 +/- 0.001


- Результат по лидерборду ниже, чем дает кросс-валидация по valid фолдам и по OOF
- Но результат очень стабильный - разброс всего 0.001

__Задание 1:__ признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [18]:
from datetime import datetime

In [19]:
train2.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,P_emaildomain_zone,R_emaildomain_zone,P_emaildomain_domain,R_emaildomain_domain,user_id
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,unknown,unknown,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,unknown,unknown,unknown,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,unknown,unknown,unknown,unknown,7077
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,unknown,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,unknown,unknown,unknown,M0,T,T,unknown,unknown,unknown,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,com,unknown,gmail,unknown,957


In [20]:
basesec = datetime(2017,12,1).timestamp()
train2['TransactionDT_date'] = train2['TransactionDT'].apply(lambda x: datetime.fromtimestamp(basesec+x))

In [21]:
train2['TransactionDT_year'] = train2['TransactionDT_date'].dt.year
train2['TransactionDT_month'] = train2['TransactionDT_date'].dt.month
train2['TransactionDT_day'] = train2['TransactionDT_date'].dt.day
train2['TransactionDT_hour'] = train2['TransactionDT_date'].dt.hour
train2['TransactionDT_dayofweek'] = train2['TransactionDT_date'].dt.dayofweek

In [22]:
#Разбиваем признаки по типам: числовые, нечисловые, цель
train2_num_features = train2.select_dtypes("number").columns.to_list()
train2_dum_features = train2.select_dtypes("object").columns.to_list()
train2_target = train2_num_features.pop(1)
len(train2_num_features), len(train2_dum_features), train2_target

(385, 18, 'isFraud')

In [23]:
data2 = train2[train2_num_features + train2_dum_features].apply(le().fit_transform)
y = train2[target]
groups = train2['user_id']

- Добавим новые признаки в датасет лидерборда:

In [24]:
lb_train_df2 = lb_train_df.copy()

In [25]:
lb_train_df2['TransactionDT_date'] = lb_train_df2['TransactionDT'].apply(lambda x: datetime.fromtimestamp(basesec+x))

In [26]:
lb_train_df2['TransactionDT_year'] = lb_train_df2['TransactionDT_date'].dt.year
lb_train_df2['TransactionDT_month'] = lb_train_df2['TransactionDT_date'].dt.month
lb_train_df2['TransactionDT_day'] = lb_train_df2['TransactionDT_date'].dt.day
lb_train_df2['TransactionDT_hour'] = lb_train_df2['TransactionDT_date'].dt.hour
lb_train_df2['TransactionDT_dayofweek'] = lb_train_df2['TransactionDT_date'].dt.dayofweek

In [27]:
lb_train2 = lb_train_df2[train2_num_features + train2_dum_features].apply(le().fit_transform)
dleaderboard2 = xgb.DMatrix(
    data=lb_train2, label=y_lb
)

- Сделаем кросс-валидацию:

In [28]:
%%time
estimators2, oof_score2, fold_train_scores2, fold_valid_scores2, oof_predictions2 = make_cross_validation(
    data2, y, xgb, metric=roc_auc_score, cv_strategy=cv_strategy, params=params, groups=groups
)

Fold: 1, train-observations = 143981, valid-observations = 35996
train-score = 0.9216, valid-score = 0.8867
Fold: 2, train-observations = 143981, valid-observations = 35996
train-score = 0.9248, valid-score = 0.8796
Fold: 3, train-observations = 143982, valid-observations = 35995
train-score = 0.9222, valid-score = 0.8905
Fold: 4, train-observations = 143982, valid-observations = 35995
train-score = 0.9193, valid-score = 0.8903
Fold: 5, train-observations = 143982, valid-observations = 35995
train-score = 0.9226, valid-score = 0.8862
CV-results train: 0.9221 +/- 0.002
CV-results valid: 0.8867 +/- 0.004
OOF-score = 0.8869
Wall time: 18min 56s


In [29]:
lb_roc_auc_lst2 = []
for est in estimators2:
    y_lb_pred = est.predict(dleaderboard2)
    lb_roc_auc_lst2.append(roc_auc_score(y_lb, y_lb_pred))
print(lb_roc_auc_lst2)
print('Leaderboard score', round(np.mean(lb_roc_auc_lst2), 4),  '+/-', round(np.std(lb_roc_auc_lst2), 3))

[0.8583522911552415, 0.8594135676408954, 0.8574208060511822, 0.8563526594627107, 0.8594577368818831]
Leaderboard score 0.8582 +/- 0.001


- Результат по лидерборду незначительно ухудшился - на 0.002.

__Задание 2:__ сделать конкатенацию признаков
* card1 + card2;
* card1 + card2 + card3 + card5;
* card1 + card2 + card3 + card5 + addr1 + addr2

Рассматривать их как категориальных признаки.

In [30]:
#Кол-во пропусков в данных:
col_lst = ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']
train2[col_lst].isna().sum()

card1        0
card2     2603
card3        0
card5      938
addr1    19429
addr2    19429
dtype: int64

In [31]:
#Кол-во уникальных значений в данных
train2[col_lst].nunique()

card1    9488
card2     499
card3      88
card5      94
addr1     269
addr2      54
dtype: int64

- Заполним пропуски данных значением -999

In [32]:
train3 = train2.copy()
train3[col_lst] = train3[col_lst].fillna(-999)
train3[col_lst].isna().sum()

card1    0
card2    0
card3    0
card5    0
addr1    0
addr2    0
dtype: int64

In [33]:
train3['cards1'] = train3['card1'].astype(str) + train3['card2'].astype(str)
train3['cards2'] = train3['card1'].astype(str) + train3['card2'].astype(str) + train3[
    'card3'
].astype(str) + train3['card5'].astype(str)
train3['cards3'] = train3['card1'].astype(str) + train3['card2'].astype(str) + train3[
    'card3'
].astype(str) + train3['card5'].astype(str) + train3['addr1'].astype(str) + train3['addr2'].astype(str)

In [34]:
#Разбиваем признаки по типам: числовые, нечисловые, цель
train3_num_features = train3.select_dtypes("number").columns.to_list()
train3_dum_features = train3.select_dtypes("object").columns.to_list()
train3_target = train3_num_features.pop(1)
len(train3_num_features), len(train3_dum_features), train3_target

(385, 21, 'isFraud')

In [35]:
data3 = train3[train3_num_features + train3_dum_features].apply(le().fit_transform)
y = train3[target]
groups = train3['user_id']

- Добавим новые признаки в датасет лидерборда:

In [36]:
lb_train_df3 = lb_train_df2.copy()

In [37]:
lb_train_df3[col_lst] = lb_train_df3[col_lst].fillna(-999)
lb_train_df3['cards1'] = lb_train_df3['card1'].astype(str) + lb_train_df3['card2'].astype(str)
lb_train_df3['cards2'] = lb_train_df3['card1'].astype(str) + lb_train_df3['card2'].astype(str) + lb_train_df3[
    'card3'
].astype(str) + lb_train_df3['card5'].astype(str)
lb_train_df3['cards3'] = lb_train_df3['card1'].astype(str) + lb_train_df3['card2'].astype(str) + lb_train_df3[
    'card3'
].astype(str) + lb_train_df3['card5'].astype(str) + lb_train_df3['addr1'].astype(str) + lb_train_df3['addr2'].astype(str)

In [38]:
lb_train_df3[col_lst].isna().sum()

card1    0
card2    0
card3    0
card5    0
addr1    0
addr2    0
dtype: int64

In [39]:
lb_train3 = lb_train_df3[train3_num_features + train3_dum_features].apply(le().fit_transform)
dleaderboard3 = xgb.DMatrix(
    data=lb_train3, label=y_lb
)

- Сделаем кросс-валидацию:

In [40]:
%%time
estimators3, oof_score3, fold_train_scores3, fold_valid_scores3, oof_predictions3 = make_cross_validation(
    data3, y, xgb, metric=roc_auc_score, cv_strategy=cv_strategy, params=params, groups=groups
)

Fold: 1, train-observations = 143981, valid-observations = 35996
train-score = 0.9218, valid-score = 0.8886
Fold: 2, train-observations = 143981, valid-observations = 35996
train-score = 0.9282, valid-score = 0.8821
Fold: 3, train-observations = 143982, valid-observations = 35995
train-score = 0.9247, valid-score = 0.8931
Fold: 4, train-observations = 143982, valid-observations = 35995
train-score = 0.9241, valid-score = 0.8937
Fold: 5, train-observations = 143982, valid-observations = 35995
train-score = 0.9236, valid-score = 0.8848
CV-results train: 0.9245 +/- 0.002
CV-results valid: 0.8885 +/- 0.005
OOF-score = 0.8886
Wall time: 20min 52s


In [41]:
lb_roc_auc_lst3 = []
for est in estimators3:
    y_lb_pred = est.predict(dleaderboard3)
    lb_roc_auc_lst3.append(roc_auc_score(y_lb, y_lb_pred))
print(lb_roc_auc_lst3)
print('Leaderboard score', round(np.mean(lb_roc_auc_lst3), 4),  '+/-', round(np.std(lb_roc_auc_lst3), 3))

[0.8607876278291391, 0.8599207870039998, 0.8593882419933762, 0.8577212733045283, 0.8590504564726956]
Leaderboard score 0.8594 +/- 0.001


- Результат по лидерборду стал лучше +0.0012 с сохранением разброса

__Задание 3:__ Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.

In [42]:
col_lst = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']
train3[col_lst].isna().sum()

card1    0
card2    0
card3    0
card4    0
card5    0
card6    0
addr1    0
addr2    0
dtype: int64

In [43]:
train4 = train3.copy()
for col in col_lst:
    freq_encoder = train4[col].value_counts(normalize=True)
    train4[f"{col}_freq_enc"] = train4[col].map(freq_encoder)

In [44]:
#Разбиваем признаки по типам: числовые, нечисловые, цель
train4_num_features = train4.select_dtypes("number").columns.to_list()
train4_dum_features = train4.select_dtypes("object").columns.to_list()
train4_target = train4_num_features.pop(1)
len(train4_num_features), len(train4_dum_features), train4_target

(393, 21, 'isFraud')

In [45]:
data4 = train4[train4_num_features + train4_dum_features].apply(le().fit_transform)
y = train4[target]
groups = train4['user_id']

- Добавим новые признаки в датасет лидерборда:

In [46]:
lb_train_df4 = lb_train_df3.copy()
lb_train_df4[col_lst].isna().sum()

card1    0
card2    0
card3    0
card4    0
card5    0
card6    0
addr1    0
addr2    0
dtype: int64

In [47]:
for col in col_lst:
    freq_encoder = lb_train_df4[col].value_counts(normalize=True)
    lb_train_df4[f"{col}_freq_enc"] = lb_train_df4[col].map(freq_encoder)

In [48]:
lb_train4 = lb_train_df4[train4_num_features + train4_dum_features].apply(le().fit_transform)
dleaderboard4 = xgb.DMatrix(
    data=lb_train4, label=y_lb
)

- Сделаем кросс-валидацию:

In [49]:
%%time
estimators4, oof_score4, fold_train_scores4, fold_valid_scores4, oof_predictions4 = make_cross_validation(
    data4, y, xgb, metric=roc_auc_score, cv_strategy=cv_strategy, params=params, groups=groups
)

Fold: 1, train-observations = 143981, valid-observations = 35996
train-score = 0.9267, valid-score = 0.8862
Fold: 2, train-observations = 143981, valid-observations = 35996
train-score = 0.9285, valid-score = 0.8813
Fold: 3, train-observations = 143982, valid-observations = 35995
train-score = 0.9234, valid-score = 0.8911
Fold: 4, train-observations = 143982, valid-observations = 35995
train-score = 0.9209, valid-score = 0.8933
Fold: 5, train-observations = 143982, valid-observations = 35995
train-score = 0.9236, valid-score = 0.8903
CV-results train: 0.9246 +/- 0.003
CV-results valid: 0.8884 +/- 0.004
OOF-score = 0.8885
Wall time: 18min 41s


In [50]:
lb_roc_auc_lst4 = []
for est in estimators4:
    y_lb_pred = est.predict(dleaderboard4)
    lb_roc_auc_lst4.append(roc_auc_score(y_lb, y_lb_pred))
print(lb_roc_auc_lst4)
print('Leaderboard score', round(np.mean(lb_roc_auc_lst4), 4),  '+/-', round(np.std(lb_roc_auc_lst4), 3))

[0.8608607258324295, 0.8585465273258968, 0.8562135330488072, 0.8575282532021921, 0.8539724912994274]
Leaderboard score 0.8574 +/- 0.002


- Результат по лидерборду стал хуже, а также увеличился разброс. На валидационной выборке изменений практически нет.

__Задание 4:__ Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [51]:
train5 = train4.copy()
col_lst = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'cards1', 'cards2', 'cards3']

In [52]:
for col in col_lst:
    mean_tamt = train5.groupby(col)['TransactionAmt'].mean()
    std_tamt = train5.groupby(col)['TransactionAmt'].std()
    train5[f"{col}_TransactionAmt_mean"] = train5['TransactionAmt'] / train5[col].map(mean_tamt)
    train5[f"{col}_TransactionAmt_std"] = train5['TransactionAmt'] / train5[col].map(std_tamt)

In [53]:
#Разбиваем признаки по типам: числовые, нечисловые, цель
train5_num_features = train5.select_dtypes("number").columns.to_list()
train5_dum_features = train5.select_dtypes("object").columns.to_list()
train5_target = train5_num_features.pop(1)
len(train5_num_features), len(train5_dum_features), train5_target

(415, 21, 'isFraud')

In [54]:
data5 = train5[train5_num_features + train5_dum_features].apply(le().fit_transform)
y = train5[target]
groups = train5['user_id']

- Добавим новые признаки в датасет лидерборда:

In [55]:
lb_train_df5 = lb_train_df4.copy()

In [56]:
for col in col_lst:
    mean_tamt = lb_train_df5.groupby(col)['TransactionAmt'].mean()
    std_tamt = lb_train_df5.groupby(col)['TransactionAmt'].std()
    lb_train_df5[f"{col}_TransactionAmt_mean"] = lb_train_df5['TransactionAmt'] / lb_train_df5[col].map(mean_tamt)
    lb_train_df5[f"{col}_TransactionAmt_std"] = lb_train_df5['TransactionAmt'] / lb_train_df5[col].map(std_tamt)

In [57]:
lb_train5 = lb_train_df5[train5_num_features + train5_dum_features].apply(le().fit_transform)
dleaderboard5 = xgb.DMatrix(
    data=lb_train5, label=y_lb
)

- Сделаем кросс-валидацию:

In [58]:
%%time
estimators5, oof_score5, fold_train_scores5, fold_valid_scores5, oof_predictions5 = make_cross_validation(
    data5, y, xgb, metric=roc_auc_score, cv_strategy=cv_strategy, params=params, groups=groups
)

Fold: 1, train-observations = 143981, valid-observations = 35996
train-score = 0.9258, valid-score = 0.8862
Fold: 2, train-observations = 143981, valid-observations = 35996
train-score = 0.9281, valid-score = 0.8809
Fold: 3, train-observations = 143982, valid-observations = 35995
train-score = 0.9237, valid-score = 0.8917
Fold: 4, train-observations = 143982, valid-observations = 35995
train-score = 0.9247, valid-score = 0.8949
Fold: 5, train-observations = 143982, valid-observations = 35995
train-score = 0.9257, valid-score = 0.89
CV-results train: 0.9256 +/- 0.001
CV-results valid: 0.8887 +/- 0.005
OOF-score = 0.8889
Wall time: 20min 39s


In [59]:
lb_roc_auc_lst5 = []
for est in estimators5:
    y_lb_pred = est.predict(dleaderboard5)
    lb_roc_auc_lst5.append(roc_auc_score(y_lb, y_lb_pred))
print(lb_roc_auc_lst5)
print('Leaderboard score', round(np.mean(lb_roc_auc_lst5), 4),  '+/-', round(np.std(lb_roc_auc_lst5), 3))

[0.8621245446452674, 0.8595120377482472, 0.8567764620046094, 0.8564390194425222, 0.8571764022800783]
Leaderboard score 0.8584 +/- 0.002


- Результат по лидерборду улучшился до уровня, который был получен в задании 1

__Задание 5:__ Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [60]:
train6 = train5.copy()

In [61]:
#В признаке много пропусков, но обрабатывать пока не буду.
train6.D15.isna().sum()

48809

In [62]:
for col in col_lst:
    mean_tamt = train6.groupby(col)['D15'].mean()
    std_tamt = train6.groupby(col)['D15'].std()
    train6[f"{col}_D15_mean"] = train6['D15'] / train6[col].map(mean_tamt)
    train6[f"{col}_D15_std"] = train6['D15'] / train6[col].map(std_tamt)

In [63]:
#Разбиваем признаки по типам: числовые, нечисловые, цель
train6_num_features = train6.select_dtypes("number").columns.to_list()
train6_dum_features = train6.select_dtypes("object").columns.to_list()
train6_target = train6_num_features.pop(1)
len(train6_num_features), len(train6_dum_features), train6_target

(437, 21, 'isFraud')

In [64]:
data6 = train6[train6_num_features + train6_dum_features].apply(le().fit_transform)
y = train6[target]
groups = train6['user_id']

- Добавим новые признаки в датасет лидерборда:

In [65]:
col_lst = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'cards1', 'cards2', 'cards3']
lb_train_df6 = lb_train_df5.copy()

In [66]:
for col in col_lst:
    mean_tamt = lb_train_df6.groupby(col)['D15'].mean()
    std_tamt = lb_train_df6.groupby(col)['D15'].std()
    lb_train_df6[f"{col}_D15_mean"] = lb_train_df6['D15'] / lb_train_df6[col].map(mean_tamt)
    lb_train_df6[f"{col}_D15_std"] = lb_train_df6['D15'] / lb_train_df6[col].map(std_tamt)

In [67]:
lb_train6 = lb_train_df6[train6_num_features + train6_dum_features].apply(le().fit_transform)
dleaderboard6 = xgb.DMatrix(
    data=lb_train6, label=y_lb
)

- Сделаем кросс-валидацию:

In [68]:
%%time
estimators6, oof_score6, fold_train_scores6, fold_valid_scores6, oof_predictions6 = make_cross_validation(
    data6, y, xgb, metric=roc_auc_score, cv_strategy=cv_strategy, params=params, groups=groups
)

Fold: 1, train-observations = 143981, valid-observations = 35996
train-score = 0.9256, valid-score = 0.8841
Fold: 2, train-observations = 143981, valid-observations = 35996
train-score = 0.931, valid-score = 0.8811
Fold: 3, train-observations = 143982, valid-observations = 35995
train-score = 0.9248, valid-score = 0.8893
Fold: 4, train-observations = 143982, valid-observations = 35995
train-score = 0.926, valid-score = 0.8954
Fold: 5, train-observations = 143982, valid-observations = 35995
train-score = 0.9253, valid-score = 0.8891
CV-results train: 0.9265 +/- 0.002
CV-results valid: 0.8878 +/- 0.005
OOF-score = 0.8877
Wall time: 25min 30s


In [69]:
lb_roc_auc_lst6 = []
for est in estimators6:
    y_lb_pred = est.predict(dleaderboard6)
    lb_roc_auc_lst6.append(roc_auc_score(y_lb, y_lb_pred))
print(lb_roc_auc_lst6)
print('Leaderboard score', round(np.mean(lb_roc_auc_lst6), 4),  '+/-', round(np.std(lb_roc_auc_lst6), 3))

[0.8618038059429548, 0.859794365088107, 0.8589981669013488, 0.8583505435694214, 0.858302258825314]
Leaderboard score 0.8594 +/- 0.001


- Результат по лидерборду улучшился до уровня, который был получен в задании 2

__Задание 6:__ выделить дробную часть и целую часть признака TransactionAmt в два отдельных признака. После создать отдельных признак - логарифм от TransactionAmt

In [70]:
train7 = train6.copy()

In [71]:
train7['TransactionAmt_int'] = train7['TransactionAmt'].astype(int)
train7['TransactionAmt_frac'] = train7['TransactionAmt'] - train7['TransactionAmt_int']
train7['TransactionAmt_log'] = np.log(train7['TransactionAmt_int'])

In [72]:
#Разбиваем признаки по типам: числовые, нечисловые, цель
train7_num_features = train7.select_dtypes("number").columns.to_list()
train7_dum_features = train7.select_dtypes("object").columns.to_list()
train7_target = train7_num_features.pop(1)
len(train7_num_features), len(train7_dum_features), train7_target

(440, 21, 'isFraud')

In [73]:
data7 = train7[train7_num_features + train7_dum_features].apply(le().fit_transform)
y = train7[target]
groups = train7['user_id']

- Добавим новые признаки в датасет лидерборда:

In [74]:
lb_train_df7 = lb_train_df6.copy()

In [75]:
lb_train_df7['TransactionAmt_int'] = lb_train_df7['TransactionAmt'].astype(int)
lb_train_df7['TransactionAmt_frac'] = lb_train_df7['TransactionAmt'] - lb_train_df7['TransactionAmt_int']
lb_train_df7['TransactionAmt_log'] = np.log(lb_train_df7['TransactionAmt_int'])

In [76]:
lb_train7 = lb_train_df7[train7_num_features + train7_dum_features].apply(le().fit_transform)
dleaderboard7 = xgb.DMatrix(
    data=lb_train7, label=y_lb
)

- Сделаем кросс-валидацию:

In [77]:
%%time
estimators7, oof_score7, fold_train_scores7, fold_valid_scores7, oof_predictions7 = make_cross_validation(
    data7, y, xgb, metric=roc_auc_score, cv_strategy=cv_strategy, params=params, groups=groups
)

Fold: 1, train-observations = 143981, valid-observations = 35996
train-score = 0.9276, valid-score = 0.8859
Fold: 2, train-observations = 143981, valid-observations = 35996
train-score = 0.93, valid-score = 0.8814
Fold: 3, train-observations = 143982, valid-observations = 35995
train-score = 0.9298, valid-score = 0.8902
Fold: 4, train-observations = 143982, valid-observations = 35995
train-score = 0.9248, valid-score = 0.8963
Fold: 5, train-observations = 143982, valid-observations = 35995
train-score = 0.9264, valid-score = 0.8895
CV-results train: 0.9277 +/- 0.002
CV-results valid: 0.8887 +/- 0.005
OOF-score = 0.8886
Wall time: 23min 46s


In [78]:
lb_roc_auc_lst7 = []
for est in estimators7:
    y_lb_pred = est.predict(dleaderboard7)
    lb_roc_auc_lst7.append(roc_auc_score(y_lb, y_lb_pred))
print(lb_roc_auc_lst7)
print('Leaderboard score', round(np.mean(lb_roc_auc_lst7), 4),  '+/-', round(np.std(lb_roc_auc_lst7), 3))

[0.8596013654813044, 0.8616407844697681, 0.8569071060003959, 0.8577171905942618, 0.857110774215552]
Leaderboard score 0.8586 +/- 0.002


- Результат по лидерборду стал хуже. На валидационной выборке почти ничего не изменилось

__Задание 7 (опция):__ выполнить предварительную подготовку / очистку признаков P_emaildomain и R_emaildomain (что и как делать - остается на ваше усмотрение) и сделать Frequency Encoding для очищенных признаков.

- Предварительную обработку этих признаков я уже сделал в ДЗ2, очистив от пустых значений, некорректных и выделив в отдельные признаки домен 1-го и 2-го уровня. Сюда сразу подтянул тот подготовленный датасет. Поэтому дальше сделаю Frequency Encoding

In [79]:
train8 = train7.copy()

In [80]:
col_lst = ['P_emaildomain', 'R_emaildomain', 'P_emaildomain_zone', 
           'R_emaildomain_zone', 'P_emaildomain_domain', 'R_emaildomain_domain']
for col in col_lst:
    freq_encoder = train8[col].value_counts(normalize=True)
    train8[f"{col}_freq_enc"] = train8[col].map(freq_encoder)

In [81]:
#Разбиваем признаки по типам: числовые, нечисловые, цель
train8_num_features = train8.select_dtypes("number").columns.to_list()
train8_dum_features = train8.select_dtypes("object").columns.to_list()
train8_target = train8_num_features.pop(1)
len(train8_num_features), len(train8_dum_features), train8_target

(446, 21, 'isFraud')

In [82]:
data8 = train8[train8_num_features + train8_dum_features].apply(le().fit_transform)
y = train8[target]
groups = train8['user_id']

- Добавим новые признаки в датасет лидерборда:

In [83]:
lb_train_df8 = lb_train_df7.copy()

In [84]:
col_lst = ['P_emaildomain', 'R_emaildomain', 'P_emaildomain_zone', 
           'R_emaildomain_zone', 'P_emaildomain_domain', 'R_emaildomain_domain']
for col in col_lst:
    freq_encoder = lb_train_df8[col].value_counts(normalize=True)
    lb_train_df8[f"{col}_freq_enc"] = lb_train_df8[col].map(freq_encoder)

In [85]:
lb_train8 = lb_train_df8[train8_num_features + train8_dum_features].apply(le().fit_transform)
dleaderboard8 = xgb.DMatrix(
    data=lb_train8, label=y_lb
)

- Сделаем кросс-валидацию:

In [86]:
%%time
estimators8, oof_score8, fold_train_scores8, fold_valid_scores8, oof_predictions8 = make_cross_validation(
    data8, y, xgb, metric=roc_auc_score, cv_strategy=cv_strategy, params=params, groups=groups
)

Fold: 1, train-observations = 143981, valid-observations = 35996
train-score = 0.9278, valid-score = 0.886
Fold: 2, train-observations = 143981, valid-observations = 35996
train-score = 0.9321, valid-score = 0.8819
Fold: 3, train-observations = 143982, valid-observations = 35995
train-score = 0.9258, valid-score = 0.8924
Fold: 4, train-observations = 143982, valid-observations = 35995
train-score = 0.9268, valid-score = 0.8963
Fold: 5, train-observations = 143982, valid-observations = 35995
train-score = 0.9289, valid-score = 0.8895
CV-results train: 0.9283 +/- 0.002
CV-results valid: 0.8892 +/- 0.005
OOF-score = 0.8895
Wall time: 40min 3s


In [87]:
lb_roc_auc_lst8 = []
for est in estimators8:
    y_lb_pred = est.predict(dleaderboard8)
    lb_roc_auc_lst8.append(roc_auc_score(y_lb, y_lb_pred))
print(lb_roc_auc_lst8)
print('Leaderboard score', round(np.mean(lb_roc_auc_lst8), 4),  '+/-', round(np.std(lb_roc_auc_lst8), 3))

[0.8597559441610723, 0.8551579693515149, 0.8564882797740226, 0.8552636825803936, 0.8527500775632968]
Leaderboard score 0.8559 +/- 0.002


- Результат по лидерборду стал хуже. На валидационной выборке почти ничего не изменилось

In [88]:
#Сохраняем датасеты для следующего ДЗ
train = train8.to_csv("assignment2_data/train5.csv", index=False)
lb = lb_train_df8.to_csv("assignment2_data/lb5.csv", index=False)