# Итоговый проект по курсу от Megafon

## Часть 2. Препроцессинг. Построение моделей данных

In [179]:
DATA_TRAIN_WITH_FEATURES_PATH = './data/data_train_with_features.csv'
DATA_TEST_WITH_FEATURES_PATH = './data/data_test_with_features.csv'

## Импорт библиотек

In [180]:
import pandas as pd
import numpy as np
import luigi
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from datetime import datetime, date, time, timedelta

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif

# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix
from sklearn.metrics import precision_recall_curve, roc_curve, auc

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import time

In [181]:
RANDOM_STATE = 42
TRAIN_PATH = './data/data_train_with_features.csv'
TEST_PATH = './data/data_test_with_features.csv'

In [182]:
df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

In [183]:
df_train = df_train.drop(columns=['Unnamed: 0'])
df_test = df_test.drop(columns=['Unnamed: 0'])

In [184]:
df_train['buy_time_y'] = df_train['buy_time_y'].apply(lambda time: datetime.fromtimestamp(time))
df_train['buy_time_x'] = df_train['buy_time_x'].apply(lambda time: datetime.fromtimestamp(time))

TypeError: an integer is required (got type str)

In [None]:
df_test['buy_time_x'] = df_test['buy_time_x'].apply(lambda time: datetime.fromtimestamp(time))
df_test['buy_time_y'] = df_test['buy_time_y'].apply(lambda time: datetime.fromtimestamp(time))

In [None]:
X = df_train.drop('target', axis='columns')
y = df_train.target
X_valid = df_test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
features = [f for f in df_train.columns if f not in ['target', 'id']]

In [None]:
boolean_features = []
categorical_features = []
numeric_features = []
for col in X[features].fillna(0):
    val_count = len(X[col].unique())
    if val_count == 2:
        boolean_features.append(col)
    elif val_count <= 10:
        categorical_features.append(col)
    else:
        numeric_features.append(col)


In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error)

In [None]:
new_features_list = ['interval']

In [None]:
class FeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        try:
            if 'interval' in self.features_list:
                X['interval'] = X['buy_time_y'] - X['buy_time_x']

            return X
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error)

In [None]:
num_pipe = Pipeline([
    ('ncs', ColumnSelector(columns=numeric_features)),
    ('nsi', SimpleImputer(strategy="mean")),
    ('nss', StandardScaler()),
])

cat_pipe = Pipeline([
    ('ccs', ColumnSelector(columns=categorical_features)),
    ('csi', SimpleImputer(strategy="most_frequent")),
    ('coe', OneHotEncoder(handle_unknown='ignore')),
])

bool_pipe = Pipeline([
    ('bcs', ColumnSelector(columns=boolean_features)),
    ('bsi', SimpleImputer(strategy="most_frequent")),
])

transformer_list = [('num_pipe', num_pipe), ('cat_pipe', cat_pipe), ('bool_pipe', bool_pipe)]

transform_pipe = Pipeline([
    ('cs', ColumnSelector(columns=features)),
    ('fg', FeaturesGenerator(features_list=['interval'])),
    ('fu', FeatureUnion(transformer_list=transformer_list)),
])


In [None]:
fs_pipe = make_pipeline(
    transform_pipe,
    SelectKBest(k=50, score_func=f_classif),
    SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE), threshold=1e-3),
)

In [None]:
from sklearn import set_config

set_config(display='diagram')

fs_pipe

In [None]:
X_train

Unnamed: 0,id,vas_id,buy_time_x,buy_time_y,0,1,2,3,4,5,...,243,244,245,246,247,248,249,250,251,252
790629,3172210,1.0,2018-08-06,2018-12-17 00:00:00,-63.599971,-353.379112,-77.540786,-405.986798,-82.958246,-427.09179,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
460965,1200409,1.0,2018-08-27,2018-09-10 00:00:00,-90.439971,13.430888,-101.840786,-36.636798,-107.258246,-57.74179,...,4373.626154,4381.229208,-25.996269,532.369552,616.252276,53.167111,-0.694428,66.824067,-0.45614,0.0
335093,2096826,2.0,2018-12-17,2018-07-23 00:00:00,254.890029,-3.949112,283.319214,-14.186798,286.371754,-26.82179,...,-655.373846,-291.770792,-25.996269,-37.630448,-252.747724,-9.832889,-0.694428,3.824067,-0.45614,0.0
681898,4020404,1.0,2018-08-13,2018-07-09 00:00:00,-96.799971,-111.569112,-110.740786,-164.176798,-116.158246,-185.28179,...,-910.373846,-613.770792,-2.996269,-37.630448,-179.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
181624,3816642,1.0,2018-08-13,2018-08-06 00:00:00,-96.799971,34.630888,-104.800786,5.753202,-110.218246,-15.35179,...,-776.373846,-596.770792,-24.996269,14.369552,-118.747724,-21.832889,-0.694428,-10.175933,1.54386,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,3733267,1.0,2018-11-05,2018-11-05 00:00:00,-95.269971,301.680888,-109.210786,604.993202,-114.628246,583.88821,...,-956.373846,-592.770792,-25.996269,-37.630448,-267.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0
365838,2075873,1.0,2018-08-27,2018-12-17 00:00:00,-96.799971,-321.909112,-98.030786,-310.966798,-103.448246,-332.07179,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0
131932,2983125,2.0,2018-10-01,2018-08-06 00:00:00,-96.799971,439.280888,96.879214,623.953202,91.461754,602.84821,...,-379.373846,-181.770792,-8.996269,-37.630448,-117.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0
671155,983481,1.0,2018-12-24,2018-12-10 00:00:00,-87.119971,-241.569112,-101.060786,-294.176798,-106.478246,-315.28179,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0


In [None]:
y_train.head(5)

790629    0.0
460965    0.0
335093    0.0
681898    0.0
181624    0.0
Name: target, dtype: float64

In [None]:
fs_pipe.fit(X_train, y_train)

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [None]:
X_valid_transform = fs_pipe.transform(X_valid)

NotFittedError: This SimpleImputer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.