In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Switch the working directory to the project folder so that data files can be
# referenced by bare file name instead of a full Drive path.
import os
import warnings

for dirpath, dirnames, _ in os.walk('/content/drive/MyDrive'):
    if 'hackathon' in dirnames:
        os.chdir(os.path.join(dirpath, 'hackathon'))
        break
else:
    # The walk finished without finding the folder: say so here instead of
    # letting a later cell fail with an unrelated-looking "file not found".
    warnings.warn(
        "No 'hackathon' directory found under /content/drive/MyDrive; "
        "working directory left unchanged: " + os.getcwd()
    )

# Импорт библиотек

In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [53]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Импорт данных

In [13]:
# Both files are read with ';' as the field separator and ',' as the decimal mark.
# NOTE(review): the recorded output echoes the two read_csv lines, which looks
# like a pandas warning (possibly DtypeWarning) — confirm and consider explicit dtypes.
CSV_FORMAT = {'sep': ';', 'decimal': ','}
train_df = pd.read_csv('hackathon_income_train.csv', **CSV_FORMAT)
test_df = pd.read_csv('hackathon_income_test.csv', **CSV_FORMAT)
train_df.shape, test_df.shape

  train_df = pd.read_csv('hackathon_income_train.csv', decimal=',', sep=';')
  test_df = pd.read_csv('hackathon_income_test.csv', decimal=',', sep=';')


((76786, 224), (73214, 222))

In [14]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,dt,target,turn_cur_cr_avg_act_v2,salary_6to12m_avg,hdb_bki_total_max_limit,dp_ils_paymentssum_avg_12m,hdb_bki_total_cc_max_limit,incomeValue,gender,...,dp_ils_uniq_companies_1y,avg_6m_travel,avg_6m_government_services,hdb_bki_active_cc_max_overdue,total_rur_amt_cm_avg_period_days_ago_v2,label_Above_1M_share_r1,transaction_category_supermarket_sum_cnt_d15,max_balance_rur_amt_1m_af,w,first_salary_income
0,2,2024-04-30,109324.476325,1465144.96,,52800.0,365346.244634,23213.0,97366.0,Женский,...,1.0,0.0,57.0,0.0,297.0,0.027027,9.0,,0.301217,
1,4,2024-02-29,25558.028662,303593.66,,260200.0,,10000.0,32580.0,Женский,...,,0.0,707.0,67.0,30245.0,,2.0,,0.6958,
2,5,2024-02-29,40666.753098,490754.01,,2000000.0,,90000.0,96866.0,Женский,...,,422.0,0.0,0.0,210322.0,0.0,20.0,,0.51597,
3,6,2024-04-30,43856.672058,219875.12,,75000.0,,75000.0,43860.0,Мужской,...,,0.0,0.0,0.0,7187.0,0.0,7.0,,0.478003,
4,7,2024-04-30,130420.851992,1750241.845,,1000000.0,,240000.0,83815.0,Женский,...,,0.0,84.0,0.0,690038.0,0.0,9.0,,0.552314,


# Предобработка

In [5]:
class DropHighNaN(BaseEstimator, TransformerMixin):
    """Keep only the columns whose share of missing values is below ``threshold``.

    Parameters
    ----------
    threshold : float, default=0.5
        A column is kept when its fraction of nulls (measured in ``fit``)
        is strictly less than this value.

    Attributes
    ----------
    cols_ : pandas.Index or None
        Labels of the retained columns; ``None`` until ``fit`` is called.
    """

    def __init__(self, threshold=0.5):
        self.threshold = threshold
        self.cols_ = None

    def fit(self, X, y=None):
        """Record which columns of ``X`` pass the missing-share test; ``y`` is ignored."""
        nan_share = X.isnull().mean()
        keep_mask = nan_share < self.threshold
        self.cols_ = X.columns[keep_mask]
        return self

    def transform(self, X):
        """Return a copy of ``X`` restricted to the columns selected in ``fit``."""
        selected = X[self.cols_]
        return selected.copy()

In [7]:
class SplitObjectColumns(BaseEstimator, TransformerMixin):
    """Detect genuinely categorical object columns and normalise them to strings.

    ``fit`` inspects every object-dtype column: if its non-null values cannot be
    parsed by ``pd.to_numeric`` the column is recorded as categorical.
    ``transform`` converts those columns to ``str`` with missing values encoded
    as the literal string ``"nan"``.

    Attributes
    ----------
    categorical_cols_ : list of str or None
        Names of the columns detected as categorical; ``None`` until ``fit``.

    Notes
    -----
    NOTE(review): object columns that *do* parse as numeric are not recorded and
    are returned by ``transform`` unchanged (still object dtype) — confirm this
    is intended.
    """

    def __init__(self):
        self.categorical_cols_ = None

    def fit(self, X, y=None):
        """Collect the object columns of ``X`` that are not numeric-parseable; ``y`` is ignored."""
        categorical = []
        for col in X.select_dtypes(include=['object']).columns:
            try:
                pd.to_numeric(X[col].dropna())
            except (ValueError, TypeError):
                # to_numeric raises ValueError for unparseable strings and
                # TypeError for unsupported value types; both mean "not numeric".
                categorical.append(col)
        self.categorical_cols_ = categorical
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the categorical columns cast to strings."""
        X = X.copy()
        for col in self.categorical_cols_:
            # Fill before casting: once values are stringified there may be no
            # missing values left to fill, and None would come out as "None"
            # rather than the intended "nan" marker.
            X[col] = X[col].fillna("nan").astype(str)
        return X

In [70]:
class FillNumericMedian(BaseEstimator, TransformerMixin):
    """Impute missing values in numeric columns with the median learned in ``fit``.

    Attributes
    ----------
    medians_ : dict
        Maps each numeric column name seen in ``fit`` to its median.
        Empty until ``fit`` is called.
    """

    def __init__(self):
        self.medians_ = {}

    def fit(self, X, y=None):
        """Compute the median of every numeric column of ``X``; ``y`` is ignored."""
        numeric_cols = X.select_dtypes(include=np.number).columns
        # Build a fresh mapping on every fit. Adding to the existing dict would
        # keep columns from an earlier fit, and transform would then raise
        # KeyError on frames that no longer contain them.
        self.medians_ = {col: X[col].median() for col in numeric_cols}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with nulls in the fitted columns replaced by their medians."""
        X = X.copy()
        for col, median in self.medians_.items():
            X[col] = X[col].fillna(median)
        return X

In [71]:
# Preprocessing chain: drop columns with too many nulls (threshold 0.8),
# stringify categorical object columns, then median-fill numeric columns.
preprocess_steps = [
    ("drop_nans", DropHighNaN(threshold=0.8)),
    ("split_object", SplitObjectColumns()),
    ("fill_numeric", FillNumericMedian()),
]
preprocess = Pipeline(steps=preprocess_steps)

In [65]:
# Separate features, target and sample weights; "id" and "dt" are dropped
# from the feature matrix together with the target and weight columns.
NON_FEATURE_COLS = ["target", "id", "dt", "w"]
X = train_df.drop(columns=NON_FEATURE_COLS)
y, w = train_df["target"], train_df["w"]

# Hold out 20% of the rows; features, target and weights are split together.
(X_train, X_test,
 y_train, y_test,
 w_train, w_test) = train_test_split(X, y, w, test_size=0.2, random_state=42)

# Log-transform the target: log(1 + y).
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

In [72]:
# Fit the preprocessing on the training split only, then apply it to both splits.
X_train_p = preprocess.fit(X_train).transform(X_train)
X_test_p = preprocess.transform(X_test)

In [47]:
# Categorical column names detected by the fitted "split_object" step.
split_step = preprocess.named_steps["split_object"]
categorical_cols = split_step.categorical_cols_
categorical_cols

['gender', 'adminarea', 'city_smart_name', 'addrref']

# Обучение для определения важности признаков

In [48]:
# Exploratory model, trained on all preprocessed features and used only to
# rank feature importance.
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.03,
    depth=8,
    loss_function="MAE",
    random_seed=42,
    verbose=100  # log every 100th iteration, as final_model does, instead of every one
)

In [49]:
# Train on the log-transformed target, using the detected categorical columns
# and the per-row sample weights.
fit_options = {
    "cat_features": categorical_cols,
    "sample_weight": w_train,
}
model.fit(X_train_p, y_train_log, **fit_options)

0:	learn: 0.9489619	total: 252ms	remaining: 2m 5s
1:	learn: 0.9390355	total: 444ms	remaining: 1m 50s
2:	learn: 0.9289406	total: 669ms	remaining: 1m 50s
3:	learn: 0.9196147	total: 904ms	remaining: 1m 52s
4:	learn: 0.9104103	total: 1.12s	remaining: 1m 51s
5:	learn: 0.9016064	total: 1.33s	remaining: 1m 49s
6:	learn: 0.8930488	total: 1.58s	remaining: 1m 51s
7:	learn: 0.8845230	total: 1.77s	remaining: 1m 49s
8:	learn: 0.8762727	total: 1.98s	remaining: 1m 47s
9:	learn: 0.8680945	total: 2.22s	remaining: 1m 48s
10:	learn: 0.8605567	total: 2.44s	remaining: 1m 48s
11:	learn: 0.8532781	total: 2.65s	remaining: 1m 47s
12:	learn: 0.8458398	total: 2.89s	remaining: 1m 48s
13:	learn: 0.8389908	total: 3.1s	remaining: 1m 47s
14:	learn: 0.8320021	total: 3.3s	remaining: 1m 46s
15:	learn: 0.8249606	total: 3.49s	remaining: 1m 45s
16:	learn: 0.8180779	total: 3.7s	remaining: 1m 45s
17:	learn: 0.8117041	total: 3.9s	remaining: 1m 44s
18:	learn: 0.8057278	total: 4.08s	remaining: 1m 43s
19:	learn: 0.7995390	total:

<catboost.core.CatBoostRegressor at 0x7a64b4d04740>

In [92]:
# Pair each training column with the importance reported by the exploratory
# model and sort from most to least important.
feature_importance = model.get_feature_importance()
columns = X_train_p.columns

df = pd.DataFrame({'feature': columns, 'importance': feature_importance})
df = df.sort_values(by='importance', ascending=False)
# Last expression instead of print(): the notebook renders the frame as a table.
# tail(20) shows the 20 least important features.
df.tail(20)

                               feature  importance
118                       lifetimeComp    0.061105
99             days_after_last_request    0.052424
52                   bki_total_oth_cnt    0.050813
129            cred_dda_rur_amt_3m_avg    0.043739
68            days_to_last_transaction    0.043640
51                      blacklist_flag    0.040637
91                    avg_fdep_cr_turn    0.037517
98          vert_has_app_ru_vtb_invest    0.035845
74                    tz_msk_timedelta    0.023553
79   vert_has_app_ru_tinkoff_investing    0.013512
123        hdb_bki_total_pil_max_del90    0.007672
114            hdb_bki_total_micro_cnt    0.007488
109                    businessTelSubs    0.006040
105     vert_has_app_ru_raiffeisennews    0.004624
94                  client_active_flag    0.000000
97                    nonresident_flag    0.000000
101          vert_has_app_ru_cian_main    0.000000
112                           ovrd_sum    0.000000
126             express_rur_amt

In [93]:
threshold = 0.5  # minimum importance a feature needs to be kept
# Keep the feature names (in descending-importance order) that reach the threshold
# and restrict both preprocessed splits to those columns.
important_mask = df['importance'] >= threshold
keep_features = df.loc[important_mask, 'feature'].tolist()
X_train_final, X_test_final = X_train_p[keep_features], X_test_p[keep_features]

In [94]:
# Shape of the training matrix after restricting it to the selected features.
X_train_final.shape

(61428, 42)

# Финальное обучение

In [95]:
# Final model: same hyper-parameters as the exploratory one but with more
# iterations, trained only on the features that passed the importance threshold.
final_model = CatBoostRegressor(
    iterations=800,
    learning_rate=0.03,
    depth=8,
    loss_function="MAE",
    random_seed=42,
    verbose=100
)

# `w_train_norm` was not defined in any visible cell, so this cell failed with
# NameError on a fresh kernel.
# NOTE(review): scaling the weights to mean 1 is an assumption about what the
# lost normalisation was — confirm or replace with the intended formula.
w_train_norm = w_train / w_train.mean()

final_model.fit(
    X_train_final,
    y_train_log,
    # Only the categorical columns that survived feature selection.
    cat_features=[c for c in keep_features if c in categorical_cols],
    sample_weight=w_train_norm
)

0:	learn: 0.9486256	total: 211ms	remaining: 2m 48s
100:	learn: 0.5843165	total: 10s	remaining: 1m 9s
200:	learn: 0.5380399	total: 20.9s	remaining: 1m 2s
300:	learn: 0.5184187	total: 31.8s	remaining: 52.7s
400:	learn: 0.5047679	total: 42.8s	remaining: 42.6s
500:	learn: 0.4942343	total: 52.3s	remaining: 31.2s
600:	learn: 0.4853826	total: 1m 2s	remaining: 20.8s
700:	learn: 0.4791573	total: 1m 13s	remaining: 10.4s
799:	learn: 0.4739918	total: 1m 24s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7a64b464c1a0>

# Предсказание

In [50]:
def weighted_mean_absolute_error(y_true, y_pred, weights):
    """Return the mean of the per-sample absolute errors, each scaled by its weight.

    The weighted errors are averaged with ``.mean()``, i.e. divided by the
    number of samples, not by the sum of the weights.
    """
    abs_err = np.abs(y_true - y_pred)
    weighted_err = weights * abs_err
    return weighted_err.mean()

In [103]:
# Predict in log space, map back with expm1 (the inverse of log1p), then score
# against the untransformed hold-out target with the hold-out weights.
preds_log = final_model.predict(X_test_final)
preds = np.expm1(preds_log)
wmae = weighted_mean_absolute_error(y_test, preds, w_test)
print("WMAE:", wmae)

WMAE: 48859.13556560739


In [104]:
from sklearn.metrics import r2_score

# Unweighted R^2 of the back-transformed predictions on the hold-out split.
r2_score(y_true=y_test, y_pred=preds)

0.47166954732692945