# Итоговый проект по курсу от Megafon

## Часть 2. Препроцессинг. Построение моделей данных

In [189]:
# Locations of the feature-enriched train and test CSV files.
# TRAIN_PATH / TEST_PATH in a later cell refer to the same two files.
DATA_TRAIN_WITH_FEATURES_PATH = './data/data_train_with_features.csv'
DATA_TEST_WITH_FEATURES_PATH = './data/data_test_with_features.csv'

## Импорт библиотек

In [220]:
import pandas as pd
from datetime import datetime

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler

def convert_timestamp_to_datetime(dataframe, colname):
    """Replace a column of Unix timestamps with ``datetime`` values.

    Each value of ``dataframe[colname]`` is passed to
    ``datetime.fromtimestamp``, so the result is naive local time of the
    machine running the code.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame object is returned.
    """
    converted = dataframe[colname].apply(datetime.fromtimestamp)
    dataframe[colname] = converted
    return dataframe

In [191]:
# Seed passed as random_state to the randomised steps of the notebook.
RANDOM_STATE = 42
# Reuse the path constants declared in the first cell instead of repeating
# the literals, so each file location is defined in exactly one place.
TRAIN_PATH = DATA_TRAIN_WITH_FEATURES_PATH
TEST_PATH = DATA_TEST_WITH_FEATURES_PATH

In [192]:
# Load the train and test tables from the CSV files at TRAIN_PATH / TEST_PATH.
df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

In [193]:
# Remove the 'Unnamed: 0' column from both frames
# (presumably an index column saved by an earlier to_csv call — TODO confirm).
df_train = df_train.drop(columns=['Unnamed: 0'])
df_test = df_test.drop(columns=['Unnamed: 0'])

In [195]:
# Split the labelled table into the label column and the predictors.
y = df_train['target']
X = df_train.drop(columns=['target'])
# X_valid is just another name for the test frame (no copy is made).
X_valid = df_test

In [196]:
# Hold out 20% of the labelled rows for evaluation. The split is seeded with
# the shared RANDOM_STATE constant (same value as the former literal 42) so
# every randomised step in the notebook is controlled from one place.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [197]:
# Every column of the training table except the label and the 'id' column,
# in the table's own column order.
features = [name for name in df_train.columns if name not in ('target', 'id')]

In [198]:
# Bucket every feature column by the number of distinct values it takes in X:
#   exactly 2        -> boolean
#   1 or 3..10       -> categorical (a constant column lands here too)
#   more than 10     -> numeric
# NaN counts as a distinct value because Series.unique() includes it.
#
# The loop walks `features` directly. The earlier `X[features].fillna(0)` was
# used only as a source of column labels — the counts were always taken from
# the unfilled X — so it built a full copy of the frame for no effect.
boolean_features = []
categorical_features = []
numeric_features = []
for col in features:
    val_count = len(X[col].unique())
    if val_count == 2:
        boolean_features.append(col)
    elif val_count <= 10:
        categorical_features.append(col)
    else:
        numeric_features.append(col)


In [199]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured DataFrame columns.

    Parameters
    ----------
    columns : list
        Column labels to select; the output keeps them in this order.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame. An explicit check is used
            instead of ``assert`` because assertions are removed when Python
            runs with ``-O``.
        KeyError
            If any configured column is absent from ``X``. The message lists
            the missing labels in the order they were configured.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "ColumnSelector expects a pandas DataFrame, got %s"
                % type(X).__name__
            )

        try:
            return X[self.columns]
        except KeyError as err:
            present = set(X.columns)
            # A list comprehension (not a set difference) keeps the report
            # in a deterministic order.
            cols_error = [col for col in self.columns if col not in present]
            raise KeyError(
                "DataFrame не содердит следующие колонки: %s" % cols_error
            ) from err

In [200]:
# Names of the derived features to generate. 'interval' is the only name
# that FeaturesGenerator.transform acts on.
new_features_list = ['interval']

In [201]:
class FeaturesGenerator(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived feature columns.

    Parameters
    ----------
    features_list : list of str
        Names of the derived features to add. Only ``'interval'`` is
        implemented (``buy_time_y - buy_time_x``); any other name is ignored.
    """

    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the requested derived columns added.

        When ``'interval'`` is requested the result is a new DataFrame and
        the caller's frame is left untouched; this also avoids pandas'
        SettingWithCopyWarning when ``X`` is itself a column selection.
        When nothing is requested ``X`` is returned as is.

        Raises
        ------
        KeyError
            If ``'interval'`` is requested and ``buy_time_y`` or
            ``buy_time_x`` is missing from ``X``. The message lists the
            missing columns.
        """
        if 'interval' not in self.features_list:
            return X

        try:
            return X.assign(interval=X['buy_time_y'] - X['buy_time_x'])
        except KeyError as err:
            # The handler used to read `self.columns`, an attribute this
            # class never defines, so a missing column surfaced as an
            # AttributeError instead of the intended KeyError.
            required = ('buy_time_y', 'buy_time_x')
            cols_error = [col for col in required if col not in X.columns]
            raise KeyError(
                "DataFrame не содердит следующие колонки: %s" % cols_error
            ) from err

In [202]:
# Numeric branch: select, mean-impute, standardise.
# The engineered columns named in new_features_list are selected here as
# well. Previously the 'interval' column added by FeaturesGenerator appeared
# in none of the three branch column lists, so the FeatureUnion silently
# dropped it. The membership guard avoids selecting a column twice.
num_pipe = Pipeline([
    ('ncs', ColumnSelector(
        columns=numeric_features
        + [name for name in new_features_list if name not in numeric_features]
    )),
    ('nsi', SimpleImputer(strategy="mean")),
    ('nss', StandardScaler()),
])

# Categorical branch: select, fill gaps with the most frequent value,
# one-hot encode (categories unseen during fit are ignored at transform).
cat_pipe = Pipeline([
    ('ccs', ColumnSelector(columns=categorical_features)),
    ('csi', SimpleImputer(strategy="most_frequent")),
    ('coe', OneHotEncoder(handle_unknown='ignore')),
])

# Boolean branch: select and fill gaps with the most frequent value.
bool_pipe = Pipeline([
    ('bcs', ColumnSelector(columns=boolean_features)),
    ('bsi', SimpleImputer(strategy="most_frequent")),
])

transformer_list = [('num_pipe', num_pipe), ('cat_pipe', cat_pipe), ('bool_pipe', bool_pipe)]

# Full preprocessing: keep the model features, add the engineered ones,
# then run the three branches side by side and concatenate their outputs.
transform_pipe = Pipeline([
    ('cs', ColumnSelector(columns=features)),
    ('fg', FeaturesGenerator(features_list=new_features_list)),
    ('fu', FeatureUnion(transformer_list=transformer_list)),
])


In [203]:
# LogisticRegression is used below but was not imported anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Feature-selection pipeline:
#   1. preprocess with transform_pipe;
#   2. keep the 50 features with the highest ANOVA F-score (f_classif);
#   3. keep the features that SelectFromModel retains for an L1-penalised
#      logistic regression at threshold=1e-3.
fs_pipe = make_pipeline(
    transform_pipe,
    SelectKBest(k=50, score_func=f_classif),
    SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE), threshold=1e-3),
)

In [204]:
from sklearn import set_config

# Ask scikit-learn to render estimators as a diagram in notebook output.
set_config(display='diagram')

# Show the feature-selection pipeline as the cell output.
fs_pipe

In [205]:
# Show the training split as the cell output.
X_train

Unnamed: 0,id,vas_id,buy_time_x,buy_time_y,0,1,2,3,4,5,...,243,244,245,246,247,248,249,250,251,252
62864,1544130,6.0,1542574800,1538946000,166.590029,-47.239112,152.649214,-99.846798,147.231754,-120.95179,...,-950.373846,-613.770792,-25.996269,-37.630448,-279.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
412059,2751512,1.0,1539550800,1544994000,94.730029,-202.239112,89.269214,-246.366798,83.851754,-267.47179,...,-976.373846,-613.770792,-25.996269,-37.630448,-300.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
291060,345990,8.0,1536526800,1532293200,-96.799971,142.660888,-50.150786,429.853202,-55.568246,408.74821,...,-566.373846,-409.770792,-2.996269,-37.630448,-230.747724,75.167111,-0.694428,88.824067,-0.45614,1.0
205651,3135452,2.0,1532898000,1546203600,-96.799971,-208.989112,-106.500786,-202.276798,-111.918246,-223.38179,...,-885.373846,-612.770792,-25.996269,-37.630448,-290.747724,-10.832889,-0.694428,-12.175933,-0.45614,0.0
715184,2629066,6.0,1544389200,1534712400,-96.799971,-408.179112,-110.740786,-460.786798,-116.158246,-481.89179,...,-771.373846,-436.770792,-25.996269,-37.630448,1045.252276,-6.832889,-0.694428,4.824067,1.54386,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,1261192,1.0,1533502800,1535317200,589.640029,278.260888,575.699214,235.493202,570.281754,214.38821,...,-966.373846,-613.770792,-25.996269,-18.630448,-185.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0
365838,2315815,6.0,1544994000,1535317200,-96.799971,-1.399112,-102.260786,-15.106798,-107.678246,-36.21179,...,-976.373846,-612.770792,-24.996269,39.369552,-210.747724,4.167111,-0.694428,17.824067,-0.45614,0.0
131932,3538115,5.0,1544994000,1544389200,-96.799971,-408.179112,-110.740786,-412.886798,-116.158246,-433.99179,...,-820.373846,-613.770792,-22.996269,-33.630448,-191.747724,-22.832889,-0.694428,-12.175933,-0.45614,1.0
671155,1471231,1.0,1545598800,1538341200,-94.759971,-406.139112,-108.700786,-458.746798,-114.118246,-479.85179,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0


In [206]:
# Show the first five training labels as the cell output.
y_train.head(5)

62864     1.0
412059    0.0
291060    0.0
205651    0.0
715184    0.0
Name: target, dtype: float64

In [207]:
# Fit preprocessing and both feature selectors on the training split only.
fs_pipe.fit(X_train, y_train)

In [208]:
# Apply the fitted feature-selection pipeline to the hold-out, training and
# test frames.
# NOTE(review): none of these three results is referenced again in the
# visible cells — the model below is fitted and scored on the untransformed
# frames. Confirm whether that is intended.
X_test_transform = fs_pipe.transform(X_test)
X_train_transform = fs_pipe.transform(X_train)
X_valid_transform = fs_pipe.transform(X_valid)

In [211]:
# Random over-sampling of the training split, seeded with the shared
# RANDOM_STATE constant (same value as the former literal 42).
ros = RandomOverSampler(random_state=RANDOM_STATE)

# NOTE(review): the raw X_train is resampled here, not X_train_transform —
# confirm that training on the unprocessed columns is intended.
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [215]:
# CatBoost classifier with library defaults apart from the fixed seed.
model = CatBoostClassifier(random_state=RANDOM_STATE)

In [216]:
# Train on the over-sampled training data.
model.fit(X_ros, y_ros)

Learning rate set to 0.218384
0:	learn: 0.5255035	total: 333ms	remaining: 5m 33s
1:	learn: 0.4374940	total: 578ms	remaining: 4m 48s
2:	learn: 0.3810208	total: 810ms	remaining: 4m 29s
3:	learn: 0.3567558	total: 1.02s	remaining: 4m 13s
4:	learn: 0.3439363	total: 1.23s	remaining: 4m 4s
5:	learn: 0.3354808	total: 1.46s	remaining: 4m 1s
6:	learn: 0.3306209	total: 1.68s	remaining: 3m 58s
7:	learn: 0.3274782	total: 1.9s	remaining: 3m 55s
8:	learn: 0.3257247	total: 2.12s	remaining: 3m 53s
9:	learn: 0.3239108	total: 2.31s	remaining: 3m 49s
10:	learn: 0.3203684	total: 2.53s	remaining: 3m 47s
11:	learn: 0.3190910	total: 2.73s	remaining: 3m 45s
12:	learn: 0.3185365	total: 2.95s	remaining: 3m 43s
13:	learn: 0.3179819	total: 3.15s	remaining: 3m 41s
14:	learn: 0.3175594	total: 3.38s	remaining: 3m 41s
15:	learn: 0.3172565	total: 3.56s	remaining: 3m 38s
16:	learn: 0.3169750	total: 3.77s	remaining: 3m 37s
17:	learn: 0.3168193	total: 3.96s	remaining: 3m 36s
18:	learn: 0.3166389	total: 4.2s	remaining: 3m 

<catboost.core.CatBoostClassifier at 0x7fb93f072a00>

In [221]:
# Predict labels for the hold-out split.
y_test_pred = model.predict(X_test)

In [222]:
# Macro-averaged F1 of the hold-out predictions, shown as the cell output.
f1_score(y_test, y_test_pred, average='macro')

0.7208534165295255

In [217]:
# Predict labels for the test frame (X_valid is the frame read from TEST_PATH).
y_valid = model.predict(X_valid)

# Сохраняем результат

In [218]:
# New frame holding the test rows plus the predicted 'target' column;
# X_valid itself is not modified.
answers_test = X_valid.assign(target=y_valid)

In [219]:
# Write the predictions to disk. With to_csv's defaults the DataFrame index
# is written as well, as a leading column with an empty header.
answers_test.to_csv('answers_test.csv')

In [231]:
import pickle

# Serialise the fitted model to model.pkl in the working directory.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)
  