In [4]:
import catboost as cb
import pandas as pd
import numpy as np

model = cb.CatBoostClassifier()
model.load_model("../data/models/catboost_model.cbm")

def get_feature_importance_dict(model):
    feature_names = model.feature_names_
    importances = model.get_feature_importance()
    feature_importance_dict = dict(zip(feature_names, importances))
    feature_importance_dict = dict(sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True))
    return feature_importance_dict

test_df = pd.read_csv("../data/test.csv", parse_dates=['transaction_time'])

test_df['hour'] = test_df['transaction_time'].dt.hour
test_df['day_of_week'] = test_df['transaction_time'].dt.dayofweek
test_df['diff_lat'] = test_df['lat'] - test_df['merchant_lat']
test_df['diff_lon'] = test_df['lon'] - test_df['merchant_lon']
test_df['name_join'] = test_df['name_1'] + ' ' + test_df['name_2']

def haversine_vec(lat1, lon1, lat2, lon2):
    phi1 = np.radians(lat1); phi2 = np.radians(lat2)
    dphi = np.radians(lat2 - lat1); dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi/2.0)**2 + np.cos(phi1)*np.cos(phi2)*(np.sin(dlambda/2.0)**2)
    return np.arctan2(np.sqrt(a), np.sqrt(1-a))

test_df['haversine_km'] = haversine_vec(test_df['lat'], test_df['lon'], test_df['merchant_lat'], test_df['merchant_lon'])

categorical_features = ["name_1", "gender", "us_state", 'cat_id']
numerical_features = ["amount", "haversine_km", 'population_city']
temp_cols = ['merch', 'jobs', 'name_join']

import json

# Загрузка словарей с фичами из файла, сохранённого на этапе EDA
with open("../data/models/stats.json", "r", encoding='utf-8') as f:
    stats = json.load(f)

merchant_count_dict = stats["merchant_count_dict"]
merchant_jobs_count_dict = stats["merchant_jobs_count_dict"]
name_count_dict = stats["name_count_dict"]
# Ключи в name_merchant_count_dict были сериализованы в строки ("('name', 'merch')"), нужно приводить обратно к tuple
name_merchant_count_dict = {eval(k): v for k, v in stats["name_merchant_count_dict"].items()}

X_test = test_df[categorical_features + numerical_features + temp_cols]
X_test['merch_count'] = X_test['merch'].map(merchant_count_dict)
X_test['merch_jobs_count'] = X_test['merch'].map(merchant_jobs_count_dict)
X_test['name_count'] = X_test['name_join'].map(name_count_dict)
X_test['name_merchant_count'] = X_test.set_index(['name_join', 'merch']).index.map(name_merchant_count_dict)

X_test.drop(columns=temp_cols, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['merch_count'] = X_test['merch'].map(merchant_count_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['merch_jobs_count'] = X_test['merch'].map(merchant_jobs_count_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['name_count'] = X_test['name_join'].map(name_count_dict)
A

In [5]:
def get_feature_importance_dict(model):
    feature_names = model.feature_names_
    importances = model.get_feature_importance()
    feature_importance_dict = dict(zip(feature_names, importances))
    feature_importance_dict = dict(sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True))
    return feature_importance_dict

In [7]:
threshold = 0.42

preds = model.predict_proba(X_test)[:, 1]
test_df['prediction'] = (preds > threshold).astype(int)

test_df[['prediction']].to_csv('../data/submission.csv', index_label='index')

Unnamed: 0,transaction_time,merch,cat_id,amount,name_1,name_2,gender,street,one_city,us_state,...,jobs,merchant_lat,merchant_lon,hour,day_of_week,diff_lat,diff_lon,name_join,haversine_km,prediction
0,2019-09-14 02:46:00,"fraud_Stokes, Christiansen and Sipes",grocery_net,25.79,Michael,Rodriguez,M,172 Paula Inlet Apt. 650,Cross Plains,TX,...,Chief Operating Officer,31.772057,-99.103183,2,5,0.376143,-0.084017,Michael Rodriguez,0.003341,0
1,2019-07-25 20:30:00,fraud_Thompson-Gleason,health_fitness,87.80,Curtis,Young,M,4319 Watson Shoals Suite 658,Falconer,NY,...,Metallurgist,42.635312,-78.334559,20,3,-0.511412,-0.854941,Curtis Young,0.007092,0
2,2020-01-05 17:27:00,fraud_Friesen Inc,shopping_pos,10.98,Brandy,Quinn,F,9734 Beard Fields Suite 885,Altair,TX,...,"Editor, film/video",29.363782,-95.624994,17,6,0.240918,-0.899906,Brandy Quinn,0.007152,0
3,2019-07-21 00:14:00,"fraud_Jenkins, Hauck and Friesen",gas_transport,71.97,Ashley,Mcdonald,F,3160 Tina Estates Suite 234,Marietta,SC,...,Museum/gallery exhibitions officer,35.142659,-82.489028,0,6,-0.113059,-0.024572,Ashley Mcdonald,0.001002,0
4,2019-03-13 00:45:00,"fraud_Kovacek, Dibbert and Ondricka",grocery_pos,210.50,Kimberly,Rice,F,63991 Destiny Rue Apt. 651,Tyler,TX,...,Sports development officer,31.833016,-94.746542,0,2,0.443784,-0.556558,Kimberly Rice,0.005652,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262139,2019-08-13 21:24:00,fraud_O'Keefe-Wisoky,food_dining,17.39,Scott,Martin,M,7483 Navarro Flats,Freedom,WY,...,"Education officer, museum",42.398393,-110.760528,21,1,0.618807,-0.268672,Scott Martin,0.005668,0
262140,2019-12-19 16:35:00,"fraud_Runolfsdottir, Mueller and Hand",entertainment,5.29,Chris,White,M,98897 Bennett Lodge,Bessemer,AL,...,Radio broadcast assistant,33.928294,-87.548646,16,3,-0.605894,0.582946,Chris White,0.006775,0
262141,2019-07-28 14:34:00,fraud_Dietrich-Fadel,health_fitness,72.75,Katherine,Tucker,F,670 Le Meadows Suite 250,Lakeland,FL,...,Clothing/textile technologist,28.971960,-82.465733,14,6,-0.985460,0.451833,Katherine Tucker,0.009272,0
262142,2019-02-27 10:56:00,fraud_Kunze Inc,grocery_pos,50.14,Nathan,Mendoza,M,767 Adam Mill Apt. 115,Espanola,NM,...,Historic buildings inspector/conservation officer,36.773361,-105.683859,10,2,-0.786761,-0.381541,Nathan Mendoza,0.007371,0


In [13]:
test_df[['prediction']].to_csv('../data/submission.csv', index_label='index')