In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# здесь меняю рабочую папку, чтобы путь не прописывать к ней все время
import os
for dirpath, dirnames, _ in os.walk('/content/drive/MyDrive'):
    if 'hackathon' in dirnames:
        os.chdir(os.path.join(dirpath, 'hackathon'))
        break

# Импорт библиотек

In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from catboost import CatBoostRegressor

# Импорт данных

In [13]:
train_df = pd.read_csv('hackathon_income_train.csv', decimal=',', sep=';')
test_df = pd.read_csv('hackathon_income_test.csv', decimal=',', sep=';')
train_df.shape, test_df.shape

  train_df = pd.read_csv('hackathon_income_train.csv', decimal=',', sep=';')
  test_df = pd.read_csv('hackathon_income_test.csv', decimal=',', sep=';')


((76786, 224), (73214, 222))

In [14]:
train_df.head()

Unnamed: 0,id,dt,target,turn_cur_cr_avg_act_v2,salary_6to12m_avg,hdb_bki_total_max_limit,dp_ils_paymentssum_avg_12m,hdb_bki_total_cc_max_limit,incomeValue,gender,...,dp_ils_uniq_companies_1y,avg_6m_travel,avg_6m_government_services,hdb_bki_active_cc_max_overdue,total_rur_amt_cm_avg_period_days_ago_v2,label_Above_1M_share_r1,transaction_category_supermarket_sum_cnt_d15,max_balance_rur_amt_1m_af,w,first_salary_income
0,2,2024-04-30,109324.476325,1465144.96,,52800.0,365346.244634,23213.0,97366.0,Женский,...,1.0,0.0,57.0,0.0,297.0,0.027027,9.0,,0.301217,
1,4,2024-02-29,25558.028662,303593.66,,260200.0,,10000.0,32580.0,Женский,...,,0.0,707.0,67.0,30245.0,,2.0,,0.6958,
2,5,2024-02-29,40666.753098,490754.01,,2000000.0,,90000.0,96866.0,Женский,...,,422.0,0.0,0.0,210322.0,0.0,20.0,,0.51597,
3,6,2024-04-30,43856.672058,219875.12,,75000.0,,75000.0,43860.0,Мужской,...,,0.0,0.0,0.0,7187.0,0.0,7.0,,0.478003,
4,7,2024-04-30,130420.851992,1750241.845,,1000000.0,,240000.0,83815.0,Женский,...,,0.0,84.0,0.0,690038.0,0.0,9.0,,0.552314,


# Предобрабротка

In [5]:
class DropHighNaN(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.5):
        self.threshold = threshold
        self.cols_ = None

    def fit(self, X, y=None):
        self.cols_ = X.columns[X.isnull().mean() < self.threshold]
        return self

    def transform(self, X):
        return X[self.cols_].copy()

In [7]:
class SplitObjectColumns(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.categorical_cols_ = None

    def fit(self, X, y=None):
        categorical = []
        for col in X.select_dtypes(include=['object']).columns:
            try:
                pd.to_numeric(X[col].dropna())
            except ValueError:
                categorical.append(col)
        self.categorical_cols_ = categorical
        return self


    def transform(self, X):
        X = X.copy()
        # объектные категории -> строки
        for col in self.categorical_cols_:
            X[col] = X[col].astype(str).fillna("nan")
        return X

In [17]:
class FillNumericMedian(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.medians_ = {}

    def fit(self, X, y=None):
        numeric_cols = X.select_dtypes(include=np.number).columns
        for col in numeric_cols:
            self.medians_[col] = X[col].median()
        return self

    def transform(self, X):
        X = X.copy()
        for col, median in self.medians_.items():
            X[col] = X[col].fillna(median)
        return X

In [31]:
preprocess = Pipeline(steps=[
    ("drop_nans", DropHighNaN(threshold=0.5)),
    ("split_object", SplitObjectColumns()),
    ("fill_numeric", FillNumericMedian())
])

In [32]:
X = train_df.drop(columns=["target", "id", "dt", "w"])
y = train_df["target"]
w = train_df["w"]

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, test_size=0.2, random_state=42
)

# логарифм таргета
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

# нормализация весов
w_train_norm = w_train / w_train.mean()

In [33]:
preprocess.fit(X_train)
X_train_p = preprocess.transform(X_train)
X_test_p  = preprocess.transform(X_test)

In [36]:
categorical_cols = preprocess.named_steps["split_object"].categorical_cols_
categorical_cols

['gender', 'adminarea', 'city_smart_name', 'addrref']

# Обучение

In [39]:
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.03,
    depth=8,
    loss_function="MAE",
    random_seed=42
)

In [40]:
model.fit(
    X_train_p,
    y_train_log,
    cat_features=categorical_cols,
    sample_weight=w_train_norm
)

0:	learn: 0.9483996	total: 368ms	remaining: 3m 3s
1:	learn: 0.9386466	total: 556ms	remaining: 2m 18s
2:	learn: 0.9292464	total: 740ms	remaining: 2m 2s
3:	learn: 0.9199551	total: 944ms	remaining: 1m 57s
4:	learn: 0.9106802	total: 1.18s	remaining: 1m 56s
5:	learn: 0.9020810	total: 1.39s	remaining: 1m 54s
6:	learn: 0.8932806	total: 1.57s	remaining: 1m 50s
7:	learn: 0.8852126	total: 1.77s	remaining: 1m 49s
8:	learn: 0.8775665	total: 1.99s	remaining: 1m 48s
9:	learn: 0.8700227	total: 2.21s	remaining: 1m 48s
10:	learn: 0.8624830	total: 2.38s	remaining: 1m 45s
11:	learn: 0.8550629	total: 2.56s	remaining: 1m 44s
12:	learn: 0.8480752	total: 2.81s	remaining: 1m 45s
13:	learn: 0.8407932	total: 3s	remaining: 1m 44s
14:	learn: 0.8341232	total: 3.26s	remaining: 1m 45s
15:	learn: 0.8273586	total: 3.6s	remaining: 1m 48s
16:	learn: 0.8214234	total: 3.98s	remaining: 1m 53s
17:	learn: 0.8153700	total: 4.4s	remaining: 1m 57s
18:	learn: 0.8092786	total: 4.8s	remaining: 2m 1s
19:	learn: 0.8032707	total: 5.1

<catboost.core.CatBoostRegressor at 0x7a64b763d4c0>

# Предсказание

In [41]:
def weighted_mean_absolute_error(y_true, y_pred, weights):
    return (weights * np.abs(y_true - y_pred)).mean()

In [43]:
preds_log = model.predict(X_test_p)
preds = np.expm1(preds_log)
print("WMAE:", weighted_mean_absolute_error(y_test, preds, w_test))

WMAE: 49538.38434522328


In [44]:
from sklearn.metrics import r2_score

r2_score(y_test, preds)

0.45623564848620946