In [3]:
import os

import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

from dotenv import load_dotenv, find_dotenv
import psycopg
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# подгружаем .env
load_dotenv()

True

In [5]:
TABLE_NAME = "users_churn" # таблица с данными в postgres 

# Параметры для трекинга эксперимента
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_experiment_imartnv"
RUN_NAME = "preprocessing" 
REGISTRY_MODEL_NAME = "churn_experiment_imartnv"

# Настрофка отображения
pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

sns.set_style("white")
sns.set_theme(style="whitegrid") 

In [6]:
# Подключение к базе и получение данных
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

In [7]:
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,11532,0489-WMEMG,2018-03-01,NaT,One year,Yes,Electronic check,49.45,1119.35,DSL,No,No,No,Yes,No,No,Female,0,No,Yes,No,0
1,11534,7435-ZNUYY,2019-08-01,NaT,One year,No,Mailed check,20.6,116.6,,,,,,,,Male,0,No,No,No,0


In [8]:
obj_df = df.select_dtypes(include="object")
cat_columns = ["type", "payment_method", "internet_service", "gender"]

In [None]:
# определение категориальных колонок, которые будут преобразованы
cat_columns = ["type", "payment_method", "internet_service", "gender"]

# создание объекта OneHotEncoder для преобразования категориальных переменных
# auto - автоматическое определение категорий
# ignore - игнорировать ошибки, если встречается неизвестная категория
# max_categories - максимальное количество уникальных категорий
# sparse_output - вывод в виде разреженной матрицы, если False, то в виде обычного массива
# drop="first" - удаляет первую категорию, чтобы избежать ловушки мультиколлинеарности
encoder_oh = OneHotEncoder(categories='auto', handle_unknown='ignore', max_categories=10, sparse_output=False, drop='first')

# применение OneHotEncoder к данным. Преобразование категориальных данных в массив
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

# преобразование полученных признаков в DataFrame и установка названий колонок
# get_feature_names_out() - получение имён признаков после преобразования
feature_names = encoder_oh.get_feature_names_out()
encoded_df = pd.DataFrame(encoded_features, columns = feature_names)

# конкатенация исходного DataFrame с новым DataFrame, содержащим закодированные категориальные признаки
# axis=1 означает конкатенацию по колонкам
obj_df = pd.concat([obj_df, encoded_df], axis=1)

obj_df.head(2)

In [9]:
num_df = df.select_dtypes(include='float64').dropna()
num_columns = ["monthly_charges", "total_charges"]

n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

In [None]:
# SplineTransformer
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree, knots=strategy)
encoded_features = encoder_spl.fit_transform(num_df)

encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_spl.get_feature_names_out(num_df.columns)
)
num_df = pd.concat([num_df, encoded_df], axis=1)

In [None]:
# QuantileTransformer
encoder_q = QuantileTransformer(n_quantiles=n_quantiles, output_distribution=strategy)
encoded_features = encoder_q.fit_transform(num_df[num_columns])

encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_q.get_feature_names_out(num_df[num_columns].columns)
)
encoded_df.columns = [col + f"_q_{n_quantiles}" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

In [None]:
# RobustScaler
encoder_rb = RobustScaler()
encoded_features = encoder_rb.fit_transform(num_df[num_columns])

encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_rb.get_feature_names_out(num_df[num_columns].columns)
)
encoded_df.columns = [col + f"_robust" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

In [54]:
# PolynomialFeatures
encoder_pol = PolynomialFeatures(degree=degree)
encoded_features = encoder_pol.fit_transform(num_df[num_columns])

encoded_df = pd.DataFrame(
    encoded_features,
    columns = encoder_pol.get_feature_names_out(num_df[num_columns].columns)
)
encoded_df.drop(encoded_df.columns[:1 + len(num_columns)], axis=1, inplace=True)
num_df = pd.concat([num_df, encoded_df], axis=1)

In [63]:
# KBinsDiscretizer
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy="quantile")
encoded_features = encoder_kbd.fit_transform(num_df[num_columns])

encoded_df = pd.DataFrame(
    encoded_features,
    columns = encoder_kbd.get_feature_names_out(num_df[num_columns].columns)
)

encoded_df.columns = [col + f"_bin" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

In [10]:
df.dropna(subset=['monthly_charges','total_charges'],inplace=True)
num_df = df[num_columns]

In [11]:
num_columns = ["monthly_charges", "total_charges"]

n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None


# SplineTransformer
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline)
encoded_features = encoder_spl.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_spl.get_feature_names_out(num_columns))
num_df = pd.concat([num_df, encoded_df], axis=1)


# QuantileTransformer
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoded_features = encoder_q.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_q.get_feature_names_out(num_columns))

encoded_df.columns = [col + f"_q_{n_quantiles}" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)


# RobustScaler
encoder_rb = RobustScaler()
encoded_features = encoder_rb.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_rb.get_feature_names_out(num_columns))
encoded_df.columns = [col + f"_robust" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)


# PolynomialFeatures
encoder_pol = PolynomialFeatures(degree=degree)
encoded_features = encoder_pol.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_pol.get_feature_names_out(num_columns))
encoded_df.drop(encoded_df.columns[:1 + len(num_columns)], axis=1, inplace=True)
num_df = pd.concat([num_df, encoded_df], axis=1)

# KBinsDiscretizer
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample)
encoded_features = encoder_kbd.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_kbd.get_feature_names_out(num_columns))

encoded_df.columns = [col + f"_bin" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)


num_df.head(2)

Unnamed: 0,monthly_charges,total_charges,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5,monthly_charges_q_100,total_charges_q_100,monthly_charges_robust,total_charges_robust,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3,monthly_charges_bin,total_charges_bin
0,49.45,1119.35,0.000861,0.146419,0.589877,0.256651,0.006192455,0.0,0.012905,0.322708,0.562049,0.102164,0.0001734096,0.0,0.31746,0.439023,-0.385076,-0.081963,2445.3025,55351.8575,1252944.0,120920.208625,2737149.0,61958100.0,1402483000.0,1.0,0.0
1,20.6,116.6,0.034402,0.434454,0.48112,0.050024,1.99304e-07,0.0,0.03803,0.446926,0.469486,0.045558,1.081408e-08,0.0,0.148148,0.123818,-0.916628,-0.377473,424.36,2401.96,13595.56,8741.816,49480.38,280068.5,1585242.0,0.0,0.0


In [12]:
num_df

Unnamed: 0,monthly_charges,total_charges,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5,monthly_charges_q_100,total_charges_q_100,monthly_charges_robust,total_charges_robust,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3,monthly_charges_bin,total_charges_bin
0,49.45,1119.35,0.000861,0.146419,0.589877,0.256651,6.192455e-03,0.000000,0.012905,0.322708,0.562049,0.102164,1.734096e-04,0.000000e+00,0.317460,0.439023,-0.385076,-0.081963,2445.3025,55351.8575,1.252944e+06,1.209202e+05,2.737149e+06,6.195810e+07,1.402483e+09,1.0,0.0
1,20.60,116.60,0.034402,0.434454,0.481120,0.050024,1.993040e-07,0.000000,0.038030,0.446926,0.469486,0.045558,1.081408e-08,0.000000e+00,0.148148,0.123818,-0.916628,-0.377473,424.3600,2401.9600,1.359556e+04,8.741816e+03,4.948038e+04,2.800685e+05,1.585242e+06,0.0,0.0
2,19.55,68.80,0.037519,0.445239,0.471093,0.046149,1.866457e-08,0.000000,0.039776,0.452531,0.464069,0.043623,7.387781e-10,0.000000e+00,0.035354,0.070112,-0.935974,-0.391560,382.2025,1345.0400,4.733440e+03,7.472059e+03,2.629553e+04,9.253875e+04,3.256607e+05,0.0,0.0
3,99.00,287.40,0.000000,0.000994,0.151934,0.591840,2.495770e-01,0.005655,0.032257,0.426495,0.488252,0.052996,6.152591e-07,0.000000e+00,0.856242,0.208088,0.527867,-0.327139,9801.0000,28452.6000,8.259876e+04,9.702990e+05,2.816807e+06,8.177277e+06,2.373888e+07,4.0,0.0
4,93.50,2341.55,0.000000,0.002656,0.199058,0.598954,1.967781e-01,0.002553,0.001930,0.181722,0.598146,0.214761,3.440677e-03,0.000000e+00,0.789141,0.628964,0.426532,0.278218,8742.2500,218934.9250,5.482856e+06,8.174004e+05,2.047042e+07,5.126471e+08,1.283838e+10,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062,,,0.036308,0.441151,0.474944,0.047598,5.458084e-08,0.000000,0.029609,0.415995,0.497325,0.057069,1.871021e-06,0.000000e+00,0.080808,0.242016,-0.928604,-0.301765,398.0025,7451.3250,1.395022e+05,7.940150e+03,1.486539e+05,2.783070e+06,5.210409e+07,0.0,0.0
4056,,,0.000000,0.005843,0.252095,0.591166,1.499510e-01,0.000945,0.000000,0.004789,0.237207,0.594741,1.619941e-01,1.268949e-03,0.723451,0.910796,0.325196,1.404073,7744.0000,542247.2000,3.796901e+07,6.814720e+05,4.771775e+07,3.341273e+09,2.339613e+11,3.0,3.0
4551,,,0.036458,0.441663,0.474464,0.047415,4.843734e-08,0.000000,0.025125,0.396218,0.513411,0.065238,8.295655e-06,0.000000e+00,0.075758,0.294566,-0.929526,-0.254613,396.0100,10616.6500,2.846222e+05,7.880599e+03,2.112713e+05,5.663983e+06,1.518460e+08,0.0,0.0
5120,,,0.000081,0.088897,0.548147,0.346642,1.623329e-02,0.000000,0.010637,0.303765,0.571673,0.113633,2.913656e-04,0.000000e+00,0.397306,0.472613,-0.228466,-0.037036,3358.2025,73700.8100,1.617475e+06,1.946078e+05,4.270962e+06,9.373269e+07,2.057105e+09,1.0,0.0


In [13]:
numeric_transformer = ColumnTransformer(
    transformers=[
        ('spline', SplineTransformer(n_knots=n_knots, degree=degree_spline), num_columns),
        ('quantile', QuantileTransformer(n_quantiles=n_quantiles), num_columns),
        ('robust', RobustScaler(), num_columns),
        ('polynomial', PolynomialFeatures(degree=degree), num_columns),
        ('kbins', KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample), num_columns),
    ]
)

categorical_transformer = Pipeline(
	steps=[
        ('onehot',OneHotEncoder(categories='auto', handle_unknown='ignore', max_categories=10, sparse_output=False, drop='first'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
	('num',numeric_transformer,num_columns),
    ('cat',categorical_transformer,cat_columns)
    ]
)

encoded_features = preprocessor.fit_transform(df)

transformed_df = pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out())

df = pd.concat([df.reset_index(drop=True), transformed_df.reset_index(drop=True)], axis=1)
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,num__spline__monthly_charges_sp_0,num__spline__monthly_charges_sp_1,num__spline__monthly_charges_sp_2,num__spline__monthly_charges_sp_3,num__spline__monthly_charges_sp_4,num__spline__monthly_charges_sp_5,num__spline__total_charges_sp_0,num__spline__total_charges_sp_1,num__spline__total_charges_sp_2,num__spline__total_charges_sp_3,num__spline__total_charges_sp_4,num__spline__total_charges_sp_5,num__quantile__monthly_charges,num__quantile__total_charges,num__robust__monthly_charges,num__robust__total_charges,num__polynomial__1,num__polynomial__monthly_charges,num__polynomial__total_charges,num__polynomial__monthly_charges^2,num__polynomial__monthly_charges total_charges,num__polynomial__total_charges^2,num__polynomial__monthly_charges^3,num__polynomial__monthly_charges^2 total_charges,num__polynomial__monthly_charges total_charges^2,num__polynomial__total_charges^3,num__kbins__monthly_charges,num__kbins__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_None,cat__gender_Male
0,11532,0489-WMEMG,2018-03-01,NaT,One year,Yes,Electronic check,49.45,1119.35,DSL,No,No,No,Yes,No,No,Female,0,No,Yes,No,0,0.000861,0.146419,0.589877,0.256651,0.006192455,0.0,0.012905,0.322708,0.562049,0.102164,0.0001734096,0.0,0.31746,0.439023,-0.385076,-0.081963,1.0,49.45,1119.35,2445.3025,55351.8575,1252944.0,120920.208625,2737149.0,61958100.0,1402483000.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,11534,7435-ZNUYY,2019-08-01,NaT,One year,No,Mailed check,20.6,116.6,,,,,,,,Male,0,No,No,No,0,0.034402,0.434454,0.48112,0.050024,1.99304e-07,0.0,0.03803,0.446926,0.469486,0.045558,1.081408e-08,0.0,0.148148,0.123818,-0.916628,-0.377473,1.0,20.6,116.6,424.36,2401.96,13595.56,8741.816,49480.38,280068.5,1585242.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [31]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = "YCAJE2PE_Dej-cO6JiS4EkKFf"
os.environ["AWS_SECRET_ACCESS_KEY"] = "YCNa-0SANmXmZFFtTZ96Wn8xa1nBidlx3M7qXMIA"
os.environ["AWS_DEFAULT_REGION"] = "ru-central1"
os.environ["AWS_S3_SIGNATURE_VERSION"] = "s3v4"
os.environ["S3_USE_PATH_STYLE_ENDPOINT"] = "true"

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.sklearn.log_model(preprocessor, "column_transformer") 
    
    print(f"Run ID: {run_id}")



S3UploadFailedError: Failed to upload /tmp/tmpnhqo_fky/model/python_env.yaml to s3-student-mle-20241125-59b9e9f709/4/ae7ea5df21e74129a37de5902802709f/artifacts/column_transformer/python_env.yaml: An error occurred (SignatureDoesNotMatch) when calling the PutObject operation: The request signature we calculated does not match the signature you provided. Check your key and signing method.

In [33]:
import datetime

# Получаем текущие дату и время
current_time = datetime.datetime.now()
print("Текущее время:", current_time)

Текущее время: 2025-02-05 14:11:07.085216


In [27]:
pip install boto3==1.35.0 botocore==1.35.0пше

Collecting boto3==1.35.0
  Downloading boto3-1.35.0-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 KB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting botocore==1.35.0
  Downloading botocore-1.35.0-py3-none-any.whl (12.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.5/12.5 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting s3transfer<0.11.0,>=0.10.0
  Using cached s3transfer-0.10.4-py3-none-any.whl (83 kB)
Installing collected packages: botocore, s3transfer, boto3
  Attempting uninstall: botocore
    Found existing installation: botocore 1.36.13
    Uninstalling botocore-1.36.13:
      Successfully uninstalled botocore-1.36.13
  Attempting uninstall: s3transfer
    Found existing installation: s3transfer 0.11.2
    Uninstalling s3transfer-0.11.2:
      Successfully uninstalled s3transfer-0.11.2
  Attempting uninstall: boto3
    Found existing instal