In [5]:
# Работа с признаками. Практика

import os

import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)
from dotenv import load_dotenv

TABLE_NAME = "users_churn" # таблица с данными

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_kruglikovAlex' # название эксперимента
RUN_NAME = "preprocessing" 
REGISTRY_MODEL_NAME = 'churn_model_kruglikovAlex_b2c' # название зарегистрированной модели 

In [6]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

In [7]:
# подгружаем .env
load_dotenv()

# загрузка данных для обучения модели
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

In [6]:
pip install psycopg2-binary

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [78]:
connection.update(postgres_credentials)

import psycopg

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

In [79]:
df.head(5)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,28173,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,,0
1,28174,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
2,28175,3668-QPYBK,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1
3,28176,7795-CFOCW,2016-05-01,NaT,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,,0
4,28177,9237-HQITU,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1


In [80]:
obj_df = df.select_dtypes(include="object")
obj_df[:5]

Unnamed: 0,customer_id,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,partner,dependents,multiple_lines
0,7590-VHVEG,Month-to-month,Yes,Electronic check,DSL,No,Yes,No,No,No,No,Female,Yes,No,
1,5575-GNVDE,One year,No,Mailed check,DSL,Yes,No,Yes,No,No,No,Male,No,No,No
2,3668-QPYBK,Month-to-month,Yes,Mailed check,DSL,Yes,Yes,No,No,No,No,Male,No,No,No
3,7795-CFOCW,One year,No,Bank transfer (automatic),DSL,Yes,No,Yes,Yes,No,No,Male,No,No,
4,9237-HQITU,Month-to-month,Yes,Electronic check,Fiber optic,No,No,No,No,No,No,Female,No,No,No


In [81]:
# Задание 1
# Преобразуйте категориальные колонки с помощью OneHotEncoding. Используйте автоопределение 
# категорий. Поставьте ограничение на максимальное количество категорий — 10, игнорируйте неизвестные 
# категории во время преобразования и удалите первую категорию, чтобы исключить проблему коллинеарности.

# определение категориальных колонок, которые будут преобразованы
cat_columns = ["type", "payment_method", "internet_service", "gender"]

In [82]:
# создание объекта OneHotEncoder для преобразования категориальных переменных
# auto - автоматическое определение категорий
# ignore - игнорировать ошибки, если встречается неизвестная категория
# max_categories - максимальное количество уникальных категорий
# sparse_output - вывод в виде разреженной матрицы, если False, то в виде обычного массива
# drop="first" - удаляет первую категорию, чтобы избежать ловушки мультиколлинеарности
encoder_oh = OneHotEncoder(categories='auto', handle_unknown='ignore', max_categories=10, sparse_output=False, drop='first')


In [83]:
encoder_oh

In [84]:
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

In [85]:
encoded_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 1., 0., ..., 1., 0., 1.]])

In [86]:
# преобразование полученных признаков в DataFrame и установка названий колонок
# get_feature_names_out() - получение имён признаков после преобразования
encoded_df = pd.DataFrame(encoded_features, columns=encoder_oh.get_feature_names_out())
encoder_oh.get_feature_names_out()

array(['x0_One year', 'x0_Two year', 'x1_Credit card (automatic)',
       'x1_Electronic check', 'x1_Mailed check', 'x2_Fiber optic',
       'x2_None', 'x3_Male'], dtype=object)

In [87]:
encoded_df.head()

Unnamed: 0,x0_One year,x0_Two year,x1_Credit card (automatic),x1_Electronic check,x1_Mailed check,x2_Fiber optic,x2_None,x3_Male
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [88]:
encoder_oh.get_feature_names_out().reshape(1,-1)

array([['x0_One year', 'x0_Two year', 'x1_Credit card (automatic)',
        'x1_Electronic check', 'x1_Mailed check', 'x2_Fiber optic',
        'x2_None', 'x3_Male']], dtype=object)

In [89]:
pd.DataFrame(encoded_df)

Unnamed: 0,x0_One year,x0_Two year,x1_Credit card (automatic),x1_Electronic check,x1_Mailed check,x2_Fiber optic,x2_None,x3_Male
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
7038,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
7039,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
7040,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7041,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


In [90]:
# конкатенация исходного DataFrame с новым DataFrame, содержащим закодированные категориальные признаки
# axis=1 означает конкатенацию по колонкам
obj_df = pd.concat([obj_df, encoded_df], axis=1)

obj_df.head(2)

Unnamed: 0,customer_id,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,partner,dependents,multiple_lines,x0_One year,x0_Two year,x1_Credit card (automatic),x1_Electronic check,x1_Mailed check,x2_Fiber optic,x2_None,x3_Male
0,7590-VHVEG,Month-to-month,Yes,Electronic check,DSL,No,Yes,No,No,No,No,Female,Yes,No,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,5575-GNVDE,One year,No,Mailed check,DSL,Yes,No,Yes,No,No,No,Male,No,No,No,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [91]:
# Задание 2
# Напишите код преобразования числовых признаков в списке num_columns, используя следующие энкодеры:
# SplineTransformer,
# QuantileTransformer,
# RobustScaler,
# PolynomialFeatures,
# KBinsDiscretizer.

In [8]:
data = pd.read_csv('/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv')

In [9]:
data.head()
df = data

In [10]:
num_columns = ["monthly_charges", "total_charges"]

In [95]:
# параметры преобразователей
n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

In [96]:
# SplineTransformer
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline) # ваш код здесь #
encoder_spl

In [97]:
encoded_features = encoder_spl.fit_transform(df[num_columns].to_numpy()) # ваш код здесь #

In [98]:
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_spl.get_feature_names_out(num_columns)
)
num_df = pd.concat([df, encoded_df], axis=1)

In [99]:
num_df.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0,0.014583,0.335266,0.554993,0.09504,0.000118,0.0,0.041243,0.457057,0.459607,0.042093,1.762313e-12,0.0
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001447607,0.0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1,0.000301,0.114432,0.572271,0.302499,0.010496,0.0,0.038335,0.447921,0.468533,0.045211,7.533768e-09,0.0
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0,0.003079,0.207835,0.598672,0.188228,0.002186,0.0,0.0047,0.235853,0.595016,0.163129,0.001302506,0.0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1,0.0,0.034835,0.436005,0.479704,0.049456,1.530859e-07,0.036787,0.442783,0.473414,0.047016,3.68197e-08,0.0


In [100]:
# QuantileTransformer
encoder_q = QuantileTransformer(n_quantiles=n_quantiles) # ваш код здесь #
encoder_q

In [101]:
encoded_features = encoder_q.fit_transform(df[num_columns].to_numpy()) # ваш код здесь #
encoded_features


array([[0.23213724, 0.02483202],
       [0.38989899, 0.5789275 ],
       [0.35353535, 0.11649739],
       ...,
       [0.2314934 , 0.22884664],
       [0.5390525 , 0.21302932],
       [0.93497475, 0.94618654]])

In [102]:
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_q.get_feature_names_out(num_columns)
)# ваш код здесь #
encoded_df.head()

Unnamed: 0,monthly_charges,total_charges
0,0.232137,0.024832
1,0.389899,0.578927
2,0.353535,0.116497
3,0.267536,0.571996
4,0.505612,0.138627


In [103]:
encoded_df.columns = [col + f"_q_{n_quantiles}" for col in num_columns]
encoded_df.columns

Index(['monthly_charges_q_100', 'total_charges_q_100'], dtype='object')

In [104]:
num_df = pd.concat([num_df, encoded_df], axis=1)

In [105]:
num_df.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5,monthly_charges_q_100,total_charges_q_100
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0,0.014583,0.335266,0.554993,0.09504,0.000118,0.0,0.041243,0.457057,0.459607,0.042093,1.762313e-12,0.0,0.232137,0.024832
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001447607,0.0,0.389899,0.578927
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1,0.000301,0.114432,0.572271,0.302499,0.010496,0.0,0.038335,0.447921,0.468533,0.045211,7.533768e-09,0.0,0.353535,0.116497
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0,0.003079,0.207835,0.598672,0.188228,0.002186,0.0,0.0047,0.235853,0.595016,0.163129,0.001302506,0.0,0.267536,0.571996
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1,0.0,0.034835,0.436005,0.479704,0.049456,1.530859e-07,0.036787,0.442783,0.473414,0.047016,3.68197e-08,0.0,0.505612,0.138627


In [106]:
# RobustScaler
encoder_rb = RobustScaler() # ваш код здесь #
encoded_features = encoder_rb.fit_transform(df[num_columns].to_numpy()) # ваш код здесь #
encoded_features

array([[-0.74919169, -0.40699354],
       [-0.24849885,  0.14130082],
       [-0.30577367, -0.38390777],
       ...,
       [-0.75381062, -0.31364802],
       [ 0.073903  , -0.32539729],
       [ 0.65127021,  1.60222013]])

In [107]:
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_rb.get_feature_names_out(num_columns)
)# ваш код здесь #
encoded_df.head()

Unnamed: 0,monthly_charges,total_charges
0,-0.749192,-0.406994
1,-0.248499,0.141301
2,-0.305774,-0.383908
3,-0.519169,0.126927
4,0.005543,-0.371082


In [108]:
encoded_df.columns = [col + f"_robust" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

In [109]:
num_df.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5,monthly_charges_q_100,total_charges_q_100,monthly_charges_robust,total_charges_robust
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0,0.014583,0.335266,0.554993,0.09504,0.000118,0.0,0.041243,0.457057,0.459607,0.042093,1.762313e-12,0.0,0.232137,0.024832,-0.749192,-0.406994
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001447607,0.0,0.389899,0.578927,-0.248499,0.141301
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1,0.000301,0.114432,0.572271,0.302499,0.010496,0.0,0.038335,0.447921,0.468533,0.045211,7.533768e-09,0.0,0.353535,0.116497,-0.305774,-0.383908
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0,0.003079,0.207835,0.598672,0.188228,0.002186,0.0,0.0047,0.235853,0.595016,0.163129,0.001302506,0.0,0.267536,0.571996,-0.519169,0.126927
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1,0.0,0.034835,0.436005,0.479704,0.049456,1.530859e-07,0.036787,0.442783,0.473414,0.047016,3.68197e-08,0.0,0.505612,0.138627,0.005543,-0.371082


In [110]:
# PolynomialFeatures
encoder_pol = PolynomialFeatures(degree=degree)# ваш код здесь #
encoded_features = encoder_pol.fit_transform(df[num_columns].to_numpy()) # ваш код здесь #
encoder_pol

In [111]:
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_pol.get_feature_names_out(num_columns)
)# ваш код здесь #


In [112]:
encoded_df.head()

Unnamed: 0,1,monthly_charges,total_charges,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3
0,1.0,29.85,29.85,891.0225,891.0225,891.0225,26597.021625,26597.02,26597.02,26597.02
1,1.0,56.95,1889.5,3243.3025,107607.025,3570210.0,184706.077375,6128220.0,203323500.0,6745912000.0
2,1.0,53.85,108.15,2899.8225,5823.8775,11696.42,156155.441625,313615.8,629852.4,1264968.0
3,1.0,42.3,1840.75,1789.29,77863.725,3388361.0,75686.967,3293636.0,143327700.0,6237125000.0
4,1.0,70.7,151.65,4998.49,10721.655,22997.72,353393.243,758021.0,1625939.0,3487605.0


In [113]:
encoded_df = encoded_df.iloc[:, 1 + len(num_columns):]
encoded_df.head()

Unnamed: 0,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3
0,891.0225,891.0225,891.0225,26597.021625,26597.02,26597.02,26597.02
1,3243.3025,107607.025,3570210.0,184706.077375,6128220.0,203323500.0,6745912000.0
2,2899.8225,5823.8775,11696.42,156155.441625,313615.8,629852.4,1264968.0
3,1789.29,77863.725,3388361.0,75686.967,3293636.0,143327700.0,6237125000.0
4,4998.49,10721.655,22997.72,353393.243,758021.0,1625939.0,3487605.0


In [114]:
encoded_df.columns

Index(['monthly_charges^2', 'monthly_charges total_charges', 'total_charges^2',
       'monthly_charges^3', 'monthly_charges^2 total_charges',
       'monthly_charges total_charges^2', 'total_charges^3'],
      dtype='object')

In [115]:
encoded_df.columns = [col + f"_poly" for col in encoded_df.columns]
encoded_df.head()

Unnamed: 0,monthly_charges^2_poly,monthly_charges total_charges_poly,total_charges^2_poly,monthly_charges^3_poly,monthly_charges^2 total_charges_poly,monthly_charges total_charges^2_poly,total_charges^3_poly
0,891.0225,891.0225,891.0225,26597.021625,26597.02,26597.02,26597.02
1,3243.3025,107607.025,3570210.0,184706.077375,6128220.0,203323500.0,6745912000.0
2,2899.8225,5823.8775,11696.42,156155.441625,313615.8,629852.4,1264968.0
3,1789.29,77863.725,3388361.0,75686.967,3293636.0,143327700.0,6237125000.0
4,4998.49,10721.655,22997.72,353393.243,758021.0,1625939.0,3487605.0


In [116]:
num_df = pd.concat([num_df, encoded_df], axis=1)

In [117]:
num_df.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5,monthly_charges_q_100,total_charges_q_100,monthly_charges_robust,total_charges_robust,monthly_charges^2_poly,monthly_charges total_charges_poly,total_charges^2_poly,monthly_charges^3_poly,monthly_charges^2 total_charges_poly,monthly_charges total_charges^2_poly,total_charges^3_poly
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0,0.014583,0.335266,0.554993,0.09504,0.000118,0.0,0.041243,0.457057,0.459607,0.042093,1.762313e-12,0.0,0.232137,0.024832,-0.749192,-0.406994,891.0225,891.0225,891.0225,26597.021625,26597.02,26597.02,26597.02
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001447607,0.0,0.389899,0.578927,-0.248499,0.141301,3243.3025,107607.025,3570210.0,184706.077375,6128220.0,203323500.0,6745912000.0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1,0.000301,0.114432,0.572271,0.302499,0.010496,0.0,0.038335,0.447921,0.468533,0.045211,7.533768e-09,0.0,0.353535,0.116497,-0.305774,-0.383908,2899.8225,5823.8775,11696.42,156155.441625,313615.8,629852.4,1264968.0
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0,0.003079,0.207835,0.598672,0.188228,0.002186,0.0,0.0047,0.235853,0.595016,0.163129,0.001302506,0.0,0.267536,0.571996,-0.519169,0.126927,1789.29,77863.725,3388361.0,75686.967,3293636.0,143327700.0,6237125000.0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1,0.0,0.034835,0.436005,0.479704,0.049456,1.530859e-07,0.036787,0.442783,0.473414,0.047016,3.68197e-08,0.0,0.505612,0.138627,0.005543,-0.371082,4998.49,10721.655,22997.72,353393.243,758021.0,1625939.0,3487605.0


In [118]:
# KBinsDiscretizer
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode,strategy=strategy, subsample=subsample) # ваш код здесь #
encoded_features = encoder_kbd.fit_transform(df[num_columns].to_numpy()) # ваш код здесь #
encoder_kbd 

In [119]:
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_kbd.get_feature_names_out(num_columns)
) # ваш код здесь #
encoded_df.head()

Unnamed: 0,monthly_charges,total_charges
0,0.0,0.0
1,1.0,1.0
2,1.0,0.0
3,1.0,1.0
4,2.0,0.0


In [120]:
encoded_df.columns = [col + f"_bin" for col in num_columns]
encoded_df.head()

Unnamed: 0,monthly_charges_bin,total_charges_bin
0,0.0,0.0
1,1.0,1.0
2,1.0,0.0
3,1.0,1.0
4,2.0,0.0


In [121]:
num_df = pd.concat([num_df, encoded_df], axis=1)

In [122]:
num_df.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5,monthly_charges_q_100,total_charges_q_100,monthly_charges_robust,total_charges_robust,monthly_charges^2_poly,monthly_charges total_charges_poly,total_charges^2_poly,monthly_charges^3_poly,monthly_charges^2 total_charges_poly,monthly_charges total_charges^2_poly,total_charges^3_poly,monthly_charges_bin,total_charges_bin
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0,0.014583,0.335266,0.554993,0.09504,0.000118,0.0,0.041243,0.457057,0.459607,0.042093,1.762313e-12,0.0,0.232137,0.024832,-0.749192,-0.406994,891.0225,891.0225,891.0225,26597.021625,26597.02,26597.02,26597.02,0.0,0.0
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001447607,0.0,0.389899,0.578927,-0.248499,0.141301,3243.3025,107607.025,3570210.0,184706.077375,6128220.0,203323500.0,6745912000.0,1.0,1.0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1,0.000301,0.114432,0.572271,0.302499,0.010496,0.0,0.038335,0.447921,0.468533,0.045211,7.533768e-09,0.0,0.353535,0.116497,-0.305774,-0.383908,2899.8225,5823.8775,11696.42,156155.441625,313615.8,629852.4,1264968.0,1.0,0.0
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0,0.003079,0.207835,0.598672,0.188228,0.002186,0.0,0.0047,0.235853,0.595016,0.163129,0.001302506,0.0,0.267536,0.571996,-0.519169,0.126927,1789.29,77863.725,3388361.0,75686.967,3293636.0,143327700.0,6237125000.0,1.0,1.0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1,0.0,0.034835,0.436005,0.479704,0.049456,1.530859e-07,0.036787,0.442783,0.473414,0.047016,3.68197e-08,0.0,0.505612,0.138627,0.005543,-0.371082,4998.49,10721.655,22997.72,353393.243,758021.0,1625939.0,3487605.0,2.0,0.0


In [123]:
# Задание 3
# Напишите код, который объединит преобразования над числовыми колонками в ColumnTransformer, 
# а над категориальными — в Pipeline, используя энкодеры из предыдущих заданий. 
# Затем объедините два получившихся объекта класса одним колоночным преобразованием. 
# После чего объедините ваш преобразованный набор данных с изначальным, а результат сохраните в переменную df. 

In [11]:
# загрузка данных
data = pd.read_csv('/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv')
data.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1


In [12]:
df = data

In [13]:
# параметры преобразователей
n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

In [14]:
num_columns = df.select_dtypes(include=["float"]).columns.to_numpy()
num_columns

array(['monthly_charges', 'total_charges'], dtype=object)

In [15]:
numeric_transformer = ColumnTransformer(
    transformers=[
        ('Spline', SplineTransformer(n_knots=n_knots, degree=degree_spline), num_columns),
        ('Quantile', QuantileTransformer(n_quantiles=n_quantiles), num_columns),
        ('RobustScaler', RobustScaler(), num_columns),
        ('Polynomial', PolynomialFeatures(degree=degree), num_columns),
        ('KBinsDiscretizer', KBinsDiscretizer(n_bins=n_bins, encode=encode,strategy=strategy, subsample=subsample), num_columns),
        ]
)

In [16]:
numeric_transformer

In [17]:
obj_df = df.select_dtypes(include="object")
obj_df.head(5)

Unnamed: 0,begin_date,end_date,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,partner,dependents,multiple_lines
0,2020-01-01,,Month-to-month,Yes,Electronic check,DSL,No,Yes,No,No,No,No,Female,Yes,No,No
1,2017-04-01,,One year,No,Mailed check,DSL,Yes,No,Yes,No,No,No,Male,No,No,No
2,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,DSL,Yes,Yes,No,No,No,No,Male,No,No,No
3,2016-05-01,,One year,No,Bank transfer (automatic),DSL,Yes,No,Yes,Yes,No,No,Male,No,No,No
4,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,Fiber optic,No,No,No,No,No,No,Female,No,No,No


In [18]:
# определение категориальных колонок, которые будут преобразованы
cat_columns = ["type", "payment_method", "internet_service", "gender"]

In [19]:
categorical_transformer = Pipeline(steps=[('OneHot', OneHotEncoder(categories='auto', handle_unknown='ignore', max_categories=10, sparse_output=False, drop='first'))])

In [20]:
categorical_transformer

In [21]:
preprocessor = ColumnTransformer(
	# ваш код здесь #
	transformers=[
		('num_fg', numeric_transformer, num_columns),
        ('cat_fg', categorical_transformer, cat_columns)
	],n_jobs=-1
)

In [22]:
preprocessor

In [23]:
encoded_features = preprocessor.fit_transform(df) # ваш код здесь #

In [24]:
preprocessor.get_feature_names_out()

array(['num_fg__Spline__monthly_charges_sp_0',
       'num_fg__Spline__monthly_charges_sp_1',
       'num_fg__Spline__monthly_charges_sp_2',
       'num_fg__Spline__monthly_charges_sp_3',
       'num_fg__Spline__monthly_charges_sp_4',
       'num_fg__Spline__monthly_charges_sp_5',
       'num_fg__Spline__total_charges_sp_0',
       'num_fg__Spline__total_charges_sp_1',
       'num_fg__Spline__total_charges_sp_2',
       'num_fg__Spline__total_charges_sp_3',
       'num_fg__Spline__total_charges_sp_4',
       'num_fg__Spline__total_charges_sp_5',
       'num_fg__Quantile__monthly_charges',
       'num_fg__Quantile__total_charges',
       'num_fg__RobustScaler__monthly_charges',
       'num_fg__RobustScaler__total_charges', 'num_fg__Polynomial__1',
       'num_fg__Polynomial__monthly_charges',
       'num_fg__Polynomial__total_charges',
       'num_fg__Polynomial__monthly_charges^2',
       'num_fg__Polynomial__monthly_charges total_charges',
       'num_fg__Polynomial__total_charges^2',

In [25]:
transformed_df = pd.DataFrame(
    encoded_features, 
    columns=preprocessor.get_feature_names_out()
) # ваш код здесь #
transformed_df.head()

Unnamed: 0,num_fg__Spline__monthly_charges_sp_0,num_fg__Spline__monthly_charges_sp_1,num_fg__Spline__monthly_charges_sp_2,num_fg__Spline__monthly_charges_sp_3,num_fg__Spline__monthly_charges_sp_4,num_fg__Spline__monthly_charges_sp_5,num_fg__Spline__total_charges_sp_0,num_fg__Spline__total_charges_sp_1,num_fg__Spline__total_charges_sp_2,num_fg__Spline__total_charges_sp_3,num_fg__Spline__total_charges_sp_4,num_fg__Spline__total_charges_sp_5,num_fg__Quantile__monthly_charges,num_fg__Quantile__total_charges,num_fg__RobustScaler__monthly_charges,num_fg__RobustScaler__total_charges,num_fg__Polynomial__1,num_fg__Polynomial__monthly_charges,num_fg__Polynomial__total_charges,num_fg__Polynomial__monthly_charges^2,num_fg__Polynomial__monthly_charges total_charges,num_fg__Polynomial__total_charges^2,num_fg__Polynomial__monthly_charges^3,num_fg__Polynomial__monthly_charges^2 total_charges,num_fg__Polynomial__monthly_charges total_charges^2,num_fg__Polynomial__total_charges^3,num_fg__KBinsDiscretizer__monthly_charges,num_fg__KBinsDiscretizer__total_charges,cat_fg__type_One year,cat_fg__type_Two year,cat_fg__payment_method_Credit card (automatic),cat_fg__payment_method_Electronic check,cat_fg__payment_method_Mailed check,cat_fg__internet_service_Fiber optic,cat_fg__gender_Male
0,0.014583,0.335266,0.554993,0.09504,0.000118,0.0,0.041243,0.457057,0.459607,0.042093,1.762313e-12,0.0,0.232137,0.024832,-0.749192,-0.406994,1.0,29.85,29.85,891.0225,891.0225,891.0225,26597.021625,26597.02,26597.02,26597.02,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001447607,0.0,0.389899,0.578927,-0.248499,0.141301,1.0,56.95,1889.5,3243.3025,107607.025,3570210.0,184706.077375,6128220.0,203323500.0,6745912000.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.000301,0.114432,0.572271,0.302499,0.010496,0.0,0.038335,0.447921,0.468533,0.045211,7.533768e-09,0.0,0.353535,0.116497,-0.305774,-0.383908,1.0,53.85,108.15,2899.8225,5823.8775,11696.42,156155.441625,313615.8,629852.4,1264968.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.003079,0.207835,0.598672,0.188228,0.002186,0.0,0.0047,0.235853,0.595016,0.163129,0.001302506,0.0,0.267536,0.571996,-0.519169,0.126927,1.0,42.3,1840.75,1789.29,77863.725,3388361.0,75686.967,3293636.0,143327700.0,6237125000.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.034835,0.436005,0.479704,0.049456,1.530859e-07,0.036787,0.442783,0.473414,0.047016,3.68197e-08,0.0,0.505612,0.138627,0.005543,-0.371082,1.0,70.7,151.65,4998.49,10721.655,22997.72,353393.243,758021.0,1625939.0,3487605.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [26]:
num_df = pd.concat([df, transformed_df], axis=1)

In [27]:
num_df.head(5)

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,num_fg__Spline__monthly_charges_sp_0,num_fg__Spline__monthly_charges_sp_1,num_fg__Spline__monthly_charges_sp_2,num_fg__Spline__monthly_charges_sp_3,num_fg__Spline__monthly_charges_sp_4,num_fg__Spline__monthly_charges_sp_5,num_fg__Spline__total_charges_sp_0,num_fg__Spline__total_charges_sp_1,num_fg__Spline__total_charges_sp_2,num_fg__Spline__total_charges_sp_3,num_fg__Spline__total_charges_sp_4,num_fg__Spline__total_charges_sp_5,num_fg__Quantile__monthly_charges,num_fg__Quantile__total_charges,num_fg__RobustScaler__monthly_charges,num_fg__RobustScaler__total_charges,num_fg__Polynomial__1,num_fg__Polynomial__monthly_charges,num_fg__Polynomial__total_charges,num_fg__Polynomial__monthly_charges^2,num_fg__Polynomial__monthly_charges total_charges,num_fg__Polynomial__total_charges^2,num_fg__Polynomial__monthly_charges^3,num_fg__Polynomial__monthly_charges^2 total_charges,num_fg__Polynomial__monthly_charges total_charges^2,num_fg__Polynomial__total_charges^3,num_fg__KBinsDiscretizer__monthly_charges,num_fg__KBinsDiscretizer__total_charges,cat_fg__type_One year,cat_fg__type_Two year,cat_fg__payment_method_Credit card (automatic),cat_fg__payment_method_Electronic check,cat_fg__payment_method_Mailed check,cat_fg__internet_service_Fiber optic,cat_fg__gender_Male
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0,0.014583,0.335266,0.554993,0.09504,0.000118,0.0,0.041243,0.457057,0.459607,0.042093,1.762313e-12,0.0,0.232137,0.024832,-0.749192,-0.406994,1.0,29.85,29.85,891.0225,891.0225,891.0225,26597.021625,26597.02,26597.02,26597.02,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001447607,0.0,0.389899,0.578927,-0.248499,0.141301,1.0,56.95,1889.5,3243.3025,107607.025,3570210.0,184706.077375,6128220.0,203323500.0,6745912000.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1,0.000301,0.114432,0.572271,0.302499,0.010496,0.0,0.038335,0.447921,0.468533,0.045211,7.533768e-09,0.0,0.353535,0.116497,-0.305774,-0.383908,1.0,53.85,108.15,2899.8225,5823.8775,11696.42,156155.441625,313615.8,629852.4,1264968.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0,0.003079,0.207835,0.598672,0.188228,0.002186,0.0,0.0047,0.235853,0.595016,0.163129,0.001302506,0.0,0.267536,0.571996,-0.519169,0.126927,1.0,42.3,1840.75,1789.29,77863.725,3388361.0,75686.967,3293636.0,143327700.0,6237125000.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1,0.0,0.034835,0.436005,0.479704,0.049456,1.530859e-07,0.036787,0.442783,0.473414,0.047016,3.68197e-08,0.0,0.505612,0.138627,0.005543,-0.371082,1.0,70.7,151.65,4998.49,10721.655,22997.72,353393.243,758021.0,1625939.0,3487605.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [None]:
numeric_transformer = ColumnTransformer(
    transformers=[
        ('spl', encoder_spl, num_columns), 
        ('q', encoder_q, num_columns), 
        ('rb', encoder_rb, num_columns), 
        ('pol', encoder_pol, num_columns), 
        ('kbd', encoder_kbd, num_columns)
        ]
    )

In [166]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" # ваш код здесь
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
experiment_id

'1'

In [167]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.sklearn.log_model(preprocessor, "column_transformer") 



In [168]:
run_id

'9272f80250ee4e9c85e0002b3eb517d6'

In [28]:
import joblib

In [39]:
from catboost import CatBoostClassifier
# загрузка модели
model = CatBoostClassifier(auto_class_weights='Balanced')

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    num_df,
    num_df['target'],
    stratify=data['target']
)

In [42]:
pipeline_new = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

In [43]:
pipeline_new 

In [45]:
# обучите пайплайн
pipeline_new.fit(X_train, y_train)
y_pred = pipeline_new.predict(X_test)

Learning rate set to 0.020938
0:	learn: 0.6827412	total: 67.2ms	remaining: 1m 7s
1:	learn: 0.6736929	total: 74.5ms	remaining: 37.2s
2:	learn: 0.6644810	total: 81.5ms	remaining: 27.1s
3:	learn: 0.6554321	total: 88.2ms	remaining: 22s
4:	learn: 0.6470166	total: 94.7ms	remaining: 18.8s
5:	learn: 0.6394136	total: 101ms	remaining: 16.8s
6:	learn: 0.6324129	total: 108ms	remaining: 15.3s
7:	learn: 0.6261737	total: 115ms	remaining: 14.2s
8:	learn: 0.6188582	total: 122ms	remaining: 13.4s
9:	learn: 0.6122623	total: 128ms	remaining: 12.7s
10:	learn: 0.6061060	total: 137ms	remaining: 12.3s
11:	learn: 0.6006796	total: 143ms	remaining: 11.8s
12:	learn: 0.5956228	total: 150ms	remaining: 11.4s
13:	learn: 0.5905433	total: 157ms	remaining: 11.1s
14:	learn: 0.5862676	total: 164ms	remaining: 10.8s
15:	learn: 0.5814304	total: 171ms	remaining: 10.5s
16:	learn: 0.5769771	total: 177ms	remaining: 10.3s
17:	learn: 0.5723741	total: 184ms	remaining: 10s
18:	learn: 0.5682998	total: 191ms	remaining: 9.86s
19:	learn:

In [46]:
from sklearn.metrics import roc_auc_score, f1_score, log_loss, recall_score, precision_score, confusion_matrix
# получите предсказания для тестовой выборки
y_pred_proba = pipeline_new.predict_proba(X_test)[:, 1] 

print('f1:', f1_score(y_test, y_pred))
print('roc_auc:', roc_auc_score(y_test, y_pred_proba))

f1: 0.6094570928196147
roc_auc: 0.8215304807599463


In [48]:
# сохранение результата шага
os.makedirs('../models', exist_ok=True) # создание директории, если её ещё нет
with open('../models/fitted_model_new.pkl', 'wb') as fd:
    joblib.dump(pipeline_new, fd)

In [49]:
prediction = y_pred
proba = y_pred_proba

In [50]:
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
err2

0.19829059829059828

In [52]:
proba

array([0.29895732, 0.81845942, 0.00755334, ..., 0.03614992, 0.01746457,
       0.08880947])

In [54]:
# заведите словарь со всеми метриками
metrics = {}

# посчитайте метрики из модуля sklearn.metrics
# err_1 — ошибка первого рода
# err_2 — ошибка второго рода
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel() # ваш код здесь #
auc = roc_auc_score(y_test, proba) # ваш код здесь #
precision = precision_score(y_test, prediction) # ваш код здесь #
recall = recall_score(y_test, prediction) # ваш код здесь #
f1 = f1_score(y_test, prediction) # ваш код здесь #
logloss = log_loss(y_test, prediction) # ваш код здесь #

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [55]:
metrics

{'err1': 0.1868945868945869,
 'err2': 0.19829059829059828,
 'auc': 0.8215304807599463,
 'precision': 0.514792899408284,
 'recall': 0.7467811158798283,
 'f1': 0.6094570928196147,
 'logloss': 9.159811630510683}

In [57]:
EXPERIMENT_NAME = 'churn_kruglikovAlex' # название эксперимента
RUN_NAME = "preprocessing_new" 
REGISTRY_MODEL_NAME = 'churn_model_kruglikovAlex_b2c' # название зарегистрированной модели 

In [59]:
X_train.head(5)

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,num_fg__Spline__monthly_charges_sp_0,num_fg__Spline__monthly_charges_sp_1,num_fg__Spline__monthly_charges_sp_2,num_fg__Spline__monthly_charges_sp_3,num_fg__Spline__monthly_charges_sp_4,num_fg__Spline__monthly_charges_sp_5,num_fg__Spline__total_charges_sp_0,num_fg__Spline__total_charges_sp_1,num_fg__Spline__total_charges_sp_2,num_fg__Spline__total_charges_sp_3,num_fg__Spline__total_charges_sp_4,num_fg__Spline__total_charges_sp_5,num_fg__Quantile__monthly_charges,num_fg__Quantile__total_charges,num_fg__RobustScaler__monthly_charges,num_fg__RobustScaler__total_charges,num_fg__Polynomial__1,num_fg__Polynomial__monthly_charges,num_fg__Polynomial__total_charges,num_fg__Polynomial__monthly_charges^2,num_fg__Polynomial__monthly_charges total_charges,num_fg__Polynomial__total_charges^2,num_fg__Polynomial__monthly_charges^3,num_fg__Polynomial__monthly_charges^2 total_charges,num_fg__Polynomial__monthly_charges total_charges^2,num_fg__Polynomial__total_charges^3,num_fg__KBinsDiscretizer__monthly_charges,num_fg__KBinsDiscretizer__total_charges,cat_fg__type_One year,cat_fg__type_Two year,cat_fg__payment_method_Credit card (automatic),cat_fg__payment_method_Electronic check,cat_fg__payment_method_Mailed check,cat_fg__internet_service_Fiber optic,cat_fg__gender_Male
3973,10992,2014-08-01,,Two year,No,Mailed check,96.6,6424.25,Fiber optic,Yes,Yes,Yes,Yes,No,No,Male,0,Yes,No,Yes,0,0.0,0.001573,0.17169,0.596771,0.225891,0.004074457,0.0,0.003087,0.207981,0.598664,0.1880878,0.002181,0.836547,0.924918,0.484065,1.478315,1.0,96.6,6424.25,9331.56,620582.55,41270990.0,901428.696,59948270.0,3986777000.0,265135100000.0,3.0,3.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
1967,8987,2019-02-01,2020-01-01,Month-to-month,Yes,Electronic check,74.95,825.7,Fiber optic,No,No,No,No,No,No,Male,0,Yes,No,Yes,1,0.0,0.024051,0.391047,0.517404,0.067486,1.131054e-05,0.018273,0.359581,0.539846,0.082249,5.010867e-05,0.0,0.553137,0.367227,0.084065,-0.172347,1.0,74.95,825.7,5617.5025,61886.215,681780.5,421031.812375,4638372.0,51099450.0,562946200.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
4610,11630,2019-06-01,,Month-to-month,Yes,Bank transfer (automatic),74.05,600.15,Fiber optic,No,No,Yes,No,No,No,Female,1,No,No,No,0,0.0,0.02609,0.400709,0.509871,0.063324,6.200353e-06,0.023417,0.387903,0.519791,0.068877,1.350158e-05,0.0,0.532411,0.310808,0.067436,-0.238848,1.0,74.05,600.15,5483.4025,44441.1075,360180.0,406045.955125,3290864.0,26671330.0,216162000.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3021,10041,2015-09-01,2019-11-01,Month-to-month,Yes,Electronic check,71.05,3444.85,Fiber optic,No,No,No,No,No,No,Female,1,Yes,No,No,1,0.0,0.033831,0.432381,0.482999,0.050789,2.763155e-07,8e-05,0.088713,0.547931,0.34699,0.01628579,0.0,0.50954,0.723707,0.012009,0.599876,1.0,71.05,3444.85,5048.1025,244756.5925,11866990.0,358667.682625,17389960.0,843149700.0,40880010000.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3586,10606,2020-01-01,,Month-to-month,No,Electronic check,46.3,46.3,DSL,No,No,No,No,No,No,Male,1,No,No,No,0,0.001587,0.172116,0.596844,0.225408,0.004046,0.0,0.040619,0.45515,0.461496,0.042735,6.760281e-11,0.0,0.297191,0.042365,-0.445266,-0.402143,1.0,46.3,46.3,2143.69,2143.69,2143.69,99252.847,99252.85,99252.85,99252.85,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [60]:
X_tr_transformed = preprocessor.fit_transform(X_train, y_train)

In [61]:
X_tr = pd.DataFrame(X_tr_transformed, columns=preprocessor.get_feature_names_out())
X_tr.head()

Unnamed: 0,num_fg__Spline__monthly_charges_sp_0,num_fg__Spline__monthly_charges_sp_1,num_fg__Spline__monthly_charges_sp_2,num_fg__Spline__monthly_charges_sp_3,num_fg__Spline__monthly_charges_sp_4,num_fg__Spline__monthly_charges_sp_5,num_fg__Spline__total_charges_sp_0,num_fg__Spline__total_charges_sp_1,num_fg__Spline__total_charges_sp_2,num_fg__Spline__total_charges_sp_3,num_fg__Spline__total_charges_sp_4,num_fg__Spline__total_charges_sp_5,num_fg__Quantile__monthly_charges,num_fg__Quantile__total_charges,num_fg__RobustScaler__monthly_charges,num_fg__RobustScaler__total_charges,num_fg__Polynomial__1,num_fg__Polynomial__monthly_charges,num_fg__Polynomial__total_charges,num_fg__Polynomial__monthly_charges^2,num_fg__Polynomial__monthly_charges total_charges,num_fg__Polynomial__total_charges^2,num_fg__Polynomial__monthly_charges^3,num_fg__Polynomial__monthly_charges^2 total_charges,num_fg__Polynomial__monthly_charges total_charges^2,num_fg__Polynomial__total_charges^3,num_fg__KBinsDiscretizer__monthly_charges,num_fg__KBinsDiscretizer__total_charges,cat_fg__type_One year,cat_fg__type_Two year,cat_fg__payment_method_Credit card (automatic),cat_fg__payment_method_Electronic check,cat_fg__payment_method_Mailed check,cat_fg__internet_service_Fiber optic,cat_fg__gender_Male
0,0.0,0.001582,0.171972,0.596819,0.225571,0.004055288,0.0,0.003087,0.207982,0.598664,0.1880864,0.002181,0.83487,0.923742,0.482472,1.478089,1.0,96.6,6424.25,9331.56,620582.55,41270990.0,901428.696,59948270.0,3986777000.0,265135100000.0,3.0,3.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
1,0.0,0.024196,0.391752,0.516865,0.067177,1.085825e-05,0.018274,0.359587,0.539843,0.082246,5.009741e-05,0.0,0.555556,0.367763,0.083026,-0.17177,1.0,74.95,825.7,5617.5025,61886.215,681780.5,421031.812375,4638372.0,51099450.0,562946200.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.026246,0.401424,0.509302,0.063023,5.907119e-06,0.023418,0.387908,0.519786,0.068874,1.349725e-05,0.0,0.533189,0.30974,0.066421,-0.238238,1.0,74.05,600.15,5483.4025,44441.1075,360180.0,406045.955125,3290864.0,26671330.0,216162000.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.034034,0.43312,0.48233,0.050515,2.466835e-07,8e-05,0.088715,0.547934,0.346986,0.01628521,0.0,0.509181,0.72276,0.01107,0.600078,1.0,71.05,3444.85,5048.1025,244756.5925,11866990.0,358667.682625,17389960.0,843149700.0,40880010000.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.001618,0.173039,0.596997,0.224363,0.003983,0.0,0.040621,0.455156,0.461491,0.042733,6.711405e-11,0.0,0.295367,0.041687,-0.445572,-0.401454,1.0,46.3,46.3,2143.69,2143.69,2143.69,99252.847,99252.85,99252.85,99252.85,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [62]:
# ваш код здесь

pip_requirements = "../requirements.txt" # ваш код здесь
signature = mlflow.models.infer_signature(
    X_tr,
    prediction.astype(int)
)
# ваш код здесь
input_example = X_test[:10] # ваш код здесь
metadata = metadata = {'model_type': 'monthly'} # ваш код здесь

In [63]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [64]:
# создаём новый эксперимент в MLflow с указанным названием 
# если эксперимент с таким именем уже существует, 
# MLflow возвращает идентификатор существующего эксперимента
#experiment_id = mlflow.create_experiment(EXPERIMENT_NAME) # ваш код здесь
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

In [65]:
experiment_id

'1'

In [66]:
class CatboostModelProba(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        super().__init__()
        self._model = model

    def predict(self, context, model_input):
        import numpy as np
        predictions = np.sqrt(self._model.predict(model_input))

        return predictions

In [67]:
custom_model = CatboostModelProba(pipeline_new) 

In [69]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" # ваш код здесь
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [73]:
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id # ваш код здесь
    
    model_info = mlflow.pyfunc.log_model( 
        # ваш код здесь #
        python_model=custom_model,
        #cb_model=model,
        signature=signature,
        pip_requirements=pip_requirements,
        metadata = metadata,
        input_example = input_example,
        artifact_path="models",
        await_registration_for=60,
        registered_model_name=REGISTRY_MODEL_NAME,
        )

    # логируем метрики эксперимента
    # где ключи — это названия метрик, а значения — числовые значения метрик
    mlflow.log_metrics(metrics)

    # логируем файл как артефакт эксперимента — 'users_churn.csv'
    mlflow.log_artifact("/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv", "dataframe")

Registered model 'churn_model_kruglikovAlex_b2c' already exists. Creating a new version of this model...
2025/07/18 10:55:40 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_kruglikovAlex_b2c, version 2
Created version '2' of model 'churn_model_kruglikovAlex_b2c'.


In [74]:
run_id

'65d9918d1456477fa3c0beb1a797c684'

In [75]:
client = mlflow.MlflowClient()
model_metadata = client.get_latest_versions(REGISTRY_MODEL_NAME, stages=["None"])
latest_model_version = model_metadata[0].version
latest_model_version

'2'