In [None]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from autofeat import AutoFeatClassifier
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score, 
    log_loss, confusion_matrix, mean_squared_error, r2_score
)
from catboost import CatBoostClassifier
from dotenv import load_dotenv, find_dotenv
import psycopg
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression


from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [None]:
# подгружаем .env
load_dotenv()

In [None]:
TABLE_NAME = "alt_users_churn" # таблица с данными в postgres 
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# Параметры для трекинга эксперимента
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_experiment_imartnv"
RUN_NAME = "feature_selection" 
REGISTRY_MODEL_NAME = "churn_model_martynov_alexey"

FS_ASSETS = "fs_assets" 

# Настрофка отображения
pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

sns.set_style("white")
sns.set_theme(style="whitegrid") 

In [None]:
# Подключение к базе и получение данных
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

In [None]:
#Функция удаления дубликатов
def remove_duplicates(data):
    feature_cols = data.columns.drop('customer_id').tolist()
    is_duplicated_features = data.duplicated(subset=feature_cols, keep=False)
    data = data[~is_duplicated_features].reset_index(drop=True)
    return data

In [None]:
#Функция для заполнения пропусков
def fill_missing_values(data):
    cols_with_nans = data.isnull().sum()
    cols_with_nans = cols_with_nans[cols_with_nans > 0].index.drop('end_date')
    for col in cols_with_nans:
        if data[col].dtype in [float, int]:
            fill_value = data[col].mean()
        elif data[col].dtype == 'object':
            fill_value = data[col].mode().iloc[0]
        data[col] = data[col].fillna(fill_value)
    return data

In [None]:
#Функция удаления выбросов
def remove_outliers(df: pd.DataFrame, threshold: float = 1.5) -> pd.DataFrame:
        num_cols = df.select_dtypes(include=['float']).columns
        potential_outliers = pd.DataFrame(False, index=df.index, columns=num_cols)
        
        for col in num_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            margin = threshold * IQR
            lower = Q1 - margin
            upper = Q3 + margin
            potential_outliers[col] = ~df[col].between(lower, upper)
        
        outliers = potential_outliers.any(axis=1)
        df_cleaned = df[~outliers]
        return df_cleaned

In [None]:
#Почистим датасет
df = fill_missing_values(df)
df = remove_duplicates(df)
df = remove_outliers(df)

In [None]:
df = df.set_index('id')
df = df.drop(columns=['customer_id','begin_date','end_date'])

In [30]:
num_columns = ["monthly_charges", "total_charges"]
num_df = df[num_columns]

n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None


# SplineTransformer
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline)
encoded_features = encoder_spl.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_spl.get_feature_names_out(num_columns))
num_df = pd.concat([num_df, encoded_df], axis=1)


# QuantileTransformer
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoded_features = encoder_q.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_q.get_feature_names_out(num_columns))

encoded_df.columns = [col + f"_q_{n_quantiles}" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)


# RobustScaler
encoder_rb = RobustScaler()
encoded_features = encoder_rb.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_rb.get_feature_names_out(num_columns))
encoded_df.columns = [col + f"_robust" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)


# PolynomialFeatures
encoder_pol = PolynomialFeatures(degree=degree)
encoded_features = encoder_pol.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_pol.get_feature_names_out(num_columns))
encoded_df.drop(encoded_df.columns[:1 + len(num_columns)], axis=1, inplace=True)
num_df = pd.concat([num_df, encoded_df], axis=1)

# KBinsDiscretizer
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample)
encoded_features = encoder_kbd.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_kbd.get_feature_names_out(num_columns))

encoded_df.columns = [col + f"_bin" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

Unnamed: 0,monthly_charges,total_charges,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5,monthly_charges_q_100,total_charges_q_100,monthly_charges_robust,total_charges_robust,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3,monthly_charges_bin,total_charges_bin
17,20.65,1022.95,0.0,0.001993,0.183362,0.598305,0.212994,0.003346,0.0,0.01271,0.321173,0.562876,0.1030608,0.000181,0.820707,0.866039,0.458142,1.200842,9072.5625,520507.9125,29862400.0,864161.578125,49578380.0,2844394000.0,163187600000.0,3.0,3.0
59,24.95,894.3,0.037366,0.44473,0.471576,0.046328,2.170599e-08,0.0,0.031877,0.425038,0.489533,0.053551,7.327987e-07,0.0,0.040404,0.212313,-0.933763,-0.325363,384.16,5868.24,89640.36,7529.536,115017.5,1756951.0,26838320.0,0.0,0.0


In [42]:
cat_columns = df.select_dtypes(include='object').columns.values
numeric_transformer = ColumnTransformer(
    transformers=[
        ('spline', SplineTransformer(n_knots=n_knots, degree=degree_spline), num_columns),
        ('quantile', QuantileTransformer(n_quantiles=n_quantiles), num_columns),
        ('robust', RobustScaler(), num_columns),
        ('polynomial', PolynomialFeatures(degree=degree), num_columns),
        ('kbins', KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample), num_columns),
    ]
)

categorical_transformer = Pipeline(
	steps=[
        ('onehot',OneHotEncoder(categories='auto', handle_unknown='ignore', max_categories=10, sparse_output=False, drop='first'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
	('num',numeric_transformer,num_columns),
    ('cat',categorical_transformer,cat_columns)
    ]
)

encoded_features = preprocessor.fit_transform(df)

transformed_df = pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out())

In [49]:
df_output = pd.concat([df['target'], transformed_df], axis=1).dropna()

In [50]:
X = df_output.drop(columns='target')
y = df_output['target']

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
X_train

Unnamed: 0,num__spline__monthly_charges_sp_0,num__spline__monthly_charges_sp_1,num__spline__monthly_charges_sp_2,num__spline__monthly_charges_sp_3,num__spline__monthly_charges_sp_4,num__spline__monthly_charges_sp_5,num__spline__total_charges_sp_0,num__spline__total_charges_sp_1,num__spline__total_charges_sp_2,num__spline__total_charges_sp_3,num__spline__total_charges_sp_4,num__spline__total_charges_sp_5,num__quantile__monthly_charges,num__quantile__total_charges,num__robust__monthly_charges,num__robust__total_charges,num__polynomial__1,num__polynomial__monthly_charges,num__polynomial__total_charges,num__polynomial__monthly_charges^2,num__polynomial__monthly_charges total_charges,num__polynomial__total_charges^2,num__polynomial__monthly_charges^3,num__polynomial__monthly_charges^2 total_charges,num__polynomial__monthly_charges total_charges^2,num__polynomial__total_charges^3,num__kbins__monthly_charges,num__kbins__total_charges,cat__type_One year,cat__type_Two year,cat__paperless_billing_Yes,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__online_security_Yes,cat__online_backup_Yes,cat__device_protection_Yes,cat__tech_support_Yes,cat__streaming_tv_Yes,cat__streaming_movies_Yes,cat__gender_Male,cat__partner_Yes,cat__dependents_Yes,cat__multiple_lines_Yes
1634,0.000000,6.708982e-04,0.137609,0.586145,2.684180e-01,7.157175e-03,0.018401,3.603553e-01,0.539333,0.081863,4.859838e-05,0.000000,0.885281,0.368087,0.561178,-0.171671,1.0,100.85,819.55,10170.7225,8.265162e+04,6.716622e+05,1.025717e+06,8.335416e+06,6.773713e+07,5.504608e+08,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
1115,0.000000,1.458285e-02,0.335266,0.554993,9.504039e-02,1.183250e-04,0.000000,1.090798e-02,0.306164,0.570524,1.121309e-01,0.000274,0.622475,0.873714,0.179393,1.236521,1.0,80.10,5585.40,6416.0100,4.473905e+05,3.119669e+07,5.139224e+05,3.583598e+07,2.498855e+09,1.742460e+11,3.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
1337,0.000060,8.467467e-02,0.542990,0.354780,1.749519e-02,0.000000e+00,0.036197,4.407690e-01,0.475300,0.047734,5.952251e-08,0.000000,0.401062,0.152066,-0.214351,-0.364011,1.0,58.70,168.60,3445.6900,9.896820e+03,2.842596e+04,2.022620e+05,5.809433e+05,1.668604e+06,4.792617e+06,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2194,0.000000,6.534983e-13,0.041999,0.459327,4.573373e-01,4.133598e-02,0.000000,2.178678e-07,0.050226,0.481618,4.339059e-01,0.034250,0.999751,0.996776,0.888684,2.091095,1.0,118.65,8477.60,14077.8225,1.005867e+06,7.186970e+07,1.670334e+06,1.193461e+08,8.527340e+09,6.092826e+11,4.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
1607,0.038448,4.482880e-01,0.468181,0.045084,6.534983e-09,0.000000e+00,0.009308,2.913340e-01,0.577297,0.121664,3.974083e-04,0.000000,0.013468,0.493122,-0.940202,-0.008170,1.0,19.25,1372.90,370.5625,2.642833e+04,1.884854e+06,7.133328e+03,5.087453e+05,3.628345e+07,2.587717e+09,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3718,0.000000,5.544716e-03,0.248070,0.592230,1.531300e-01,1.024842e-03,0.019054,3.642504e-01,0.536718,0.079936,4.150932e-05,0.000000,0.726038,0.359917,0.332107,-0.180816,1.0,88.40,788.60,7814.5600,6.971224e+04,6.218900e+05,6.908071e+05,6.162562e+06,5.497507e+07,4.904224e+08,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5154,0.000000,3.571334e-02,0.439097,0.476855,4.833458e-02,8.516456e-08,0.039717,4.523444e-01,0.464251,0.043687,8.379784e-10,0.000000,0.501263,0.073048,0.000920,-0.393027,1.0,70.40,70.40,4956.1600,4.956160e+03,4.956160e+03,3.489137e+05,3.489137e+05,3.489137e+05,3.489137e+05,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5189,0.035126,4.370372e-01,0.478757,0.049080,1.270930e-07,0.000000e+00,0.041043,4.564489e-01,0.460211,0.042297,8.344195e-12,0.000000,0.131313,0.031665,-0.919963,-0.403457,1.0,20.35,35.10,414.1225,7.142850e+02,1.232010e+03,8.427393e+03,1.453570e+04,2.507140e+04,4.324355e+04,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5352,0.000000,4.654029e-03,0.235156,0.595154,1.637150e-01,1.320043e-03,0.000000,2.292613e-02,0.385426,0.521648,6.998495e-02,0.000015,0.745690,0.835168,0.356026,1.049647,1.0,89.70,4952.95,8046.0900,4.442796e+05,2.453171e+07,7.217343e+05,3.985188e+07,2.200495e+09,1.215044e+11,3.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0


In [65]:
# Инициализируем оценщик: случайный лес с 300 деревьями
estimator = RandomForestClassifier(n_estimators=10)

In [66]:
sfs = SFS(estimator,
          k_features=5,
          forward=True,       # прямой отбор (SFS)
          floating=False,     # отключаем floating
          scoring='roc_auc',  # метрика оценки
          cv=2,
          n_jobs=-1)

In [67]:
sfs = sfs.fit(X_train_features, y_train)

In [70]:
top_sfs = list(sfs.k_feature_names_) 

In [71]:
top_sfs

['num__polynomial__1',
 'cat__type_One year',
 'cat__type_Two year',
 'cat__tech_support_Yes',
 'cat__dependents_Yes']