In [1]:
import pandas as pd
import catboost as cb
import numpy as np
import json

pd.options.display.max_columns = 30

In [2]:
df = pd.read_csv("../data/train.csv", parse_dates=['transaction_time'])

df['hour'] = df['transaction_time'].dt.hour
df['day_of_week'] = df['transaction_time'].dt.dayofweek
df['diff_lat'] = df['lat'] - df['merchant_lat']
df['diff_lon'] = df['lon'] - df['merchant_lon']
df['name_join'] = df['name_1'] + ' ' + df['name_2']

In [3]:
def haversine_vec(lat1, lon1, lat2, lon2):
    phi1 = np.radians(lat1); phi2 = np.radians(lat2)
    dphi = np.radians(lat2 - lat1); dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi/2.0)**2 + np.cos(phi1)*np.cos(phi2)*(np.sin(dlambda/2.0)**2)
    return np.arctan2(np.sqrt(a), np.sqrt(1-a))

df['haversine_km'] = haversine_vec(df['lat'], df['lon'], df['merchant_lat'], df['merchant_lon'])

In [4]:
categorical_features = ["name_1", "gender", "us_state", 'cat_id']
numerical_features = ["amount", "haversine_km", 'population_city']
temp_cols = ['merch', 'jobs', 'name_join']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[categorical_features + numerical_features + temp_cols], df['target'], test_size=0.25, random_state=42)

In [5]:
merchant_count_dict = X_train['merch'].value_counts().to_dict()
X_train['merch_count'] = X_train['merch'].map(merchant_count_dict)
X_test['merch_count'] = X_test['merch'].map(merchant_count_dict)

merchant_jobs_count_dict = X_train.groupby('merch')['jobs'].nunique().to_dict()
X_train['merch_jobs_count'] = X_train['merch'].map(merchant_jobs_count_dict)
X_test['merch_jobs_count'] = X_test['merch'].map(merchant_jobs_count_dict)

name_count_dict = X_train['name_join'].value_counts().to_dict()
X_train['name_count'] = X_train['name_join'].map(name_count_dict)
X_test['name_count'] = X_test['name_join'].map(name_count_dict)

name_merchant_count_dict = X_train.groupby(['name_join', 'merch']).size().to_dict()
X_train['name_merchant_count'] = X_train.set_index(['name_join','merch']).index.map(name_merchant_count_dict)
X_test['name_merchant_count'] = X_test.set_index(['name_join','merch']).index.map(name_merchant_count_dict)
X_train.drop(columns=temp_cols, inplace=True)
X_test.drop(columns=temp_cols, inplace=True)

stats = {
    "merchant_count_dict": merchant_count_dict,
    "merchant_jobs_count_dict": merchant_jobs_count_dict,
    "name_count_dict": name_count_dict,
    "name_merchant_count_dict": {str(k): v for k, v in name_merchant_count_dict.items()}
}
with open("../data/models/stats.json", "w", encoding='utf-8') as f:
    json.dump(stats, f, ensure_ascii=False, indent=2)

In [6]:
model = cb.CatBoostClassifier(
    depth=4,
    iterations=400,

    cat_features=categorical_features,
    eval_metric="AUC",
    random_seed=42,
    verbose=100,
    train_dir=None,
    allow_writing_files=False,
    thread_count=6,
)

model.fit(cb.Pool(X_train, y_train, cat_features=categorical_features), eval_set=(X_test, y_test))

from sklearn.metrics import f1_score
import numpy as np

preds = model.predict_proba(X_test)[:, 1]

# Подбор оптимального threshold по f1 меру
thresholds = np.arange(0.0, 1.01, 0.01)
f1_scores = [f1_score(y_test, preds > t) for t in thresholds]
best_threshold = thresholds[np.argmax(f1_scores)]
best_f1 = np.max(f1_scores)

print(f"Лучший threshold: {best_threshold:.2f}")
print(f"Лучший F1: {best_f1:.4f}")

Learning rate set to 0.228234
0:	test: 0.7401557	best: 0.7401557 (0)	total: 191ms	remaining: 1m 16s
100:	test: 0.9838083	best: 0.9838083 (100)	total: 8.6s	remaining: 25.5s
200:	test: 0.9885079	best: 0.9885123 (198)	total: 16.7s	remaining: 16.6s
300:	test: 0.9903853	best: 0.9903956 (295)	total: 24.9s	remaining: 8.2s
399:	test: 0.9932075	best: 0.9932171 (390)	total: 33s	remaining: 0us

bestTest = 0.9932170979
bestIteration = 390

Shrink model to first 391 iterations.
Лучший threshold: 0.42
Лучший F1: 0.7677


In [7]:
pd.DataFrame({
    "name": X_train.columns,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)

Unnamed: 0,name,importance
4,amount,47.441677
3,cat_id,30.704949
7,merch_count,8.595642
9,name_count,3.91026
8,merch_jobs_count,2.70418
6,population_city,2.5809
1,gender,1.187364
0,name_1,0.948541
5,haversine_km,0.873591
2,us_state,0.590908


In [8]:
model.save_model("../data/models/catboost_model.cbm")

In [10]:
df['target'].value_counts()

target
0    781927
1      4504
Name: count, dtype: int64