# Hackathon FinTech Case ITMO
## Kirill Zakharov
2022

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sts
from skmultilearn.model_selection import iterative_train_test_split
import os
import json
import sklearn.utils as sku
from sklearn.metrics import fbeta_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

# plt.style.use('ggplot')
# sns.set_palette('mako')
sns.set_style('darkgrid')

In [None]:
# !pip install xgboost
from xgboost import XGBClassifier

In [None]:
PAYMENTS_TRAIN_PATH = '/kaggle/input/itmo-hack-2022-tochka/data_tochka/data/payments_train.csv'
TARGET_TRAIN_PATH = '/kaggle/input/itmo-hack-2022-tochka/data_tochka/data/target_train.csv'
PAYMENTS_TEST_PATH = '/kaggle/input/itmo-hack-2022-tochka/data_tochka/data/payments_test.csv'
CLIENT_ID_TEST_PATH = '/kaggle/input/itmo-hack-2022-tochka/data_tochka/data/client_id.csv'

In [None]:
# Column dtypes passed to pd.read_csv for the payments files: string ids,
# unsigned integers for amount/day/hour, a categorical channel and twelve
# boolean flag columns flag_0 .. flag_11.
payments_dtypes = dict(
    client_id=str,
    contractor_id=str,
    is_outgoing=bool,
    amount='uint64',
    dt_day='uint16',
    dt_hour='uint8',
    channel=pd.CategoricalDtype(),
)
payments_dtypes.update({f'flag_{flag_no}': bool for flag_no in range(12)})

In [None]:
payments = pd.read_csv(PAYMENTS_TRAIN_PATH, dtype=payments_dtypes)

In [None]:
payments[payments['contractor_id'] == '0']

In [None]:
contractors = payments['contractor_id']

In [None]:
# payments['channel'] = payments['channel'].replace(['app', 'web', 'pos', 'atm'], [1, 2, 3, 4])

In [None]:
# payments['channel'].fillna(0, inplace=True)

In [None]:
# payments['channel'] = payments.groupby('client_id')['channel'].transform(lambda x: x.value_counts().index[0])
# payments['channel'] = payments['channel'].replace(['app', 'web', 'pos', 'atm'], [1, 2, 3, 4])

In [None]:
# all_dfs = []

# # process 100,000 rows at a time
# for chunk in pd.read_csv('January_car_accidents.csv', dtype={'State' : 'category'}, chunksize=100000):
#     # possibly do some processing on chunk
#     all_dfs.append(chunk)
    
# pd.concat(all_dfs).info()

In [None]:
# Column dtypes for the target CSV: a string client id followed by the 35
# integer label columns type_0 .. type_34.
target_dtypes = {'client_id': str, **{f'type_{label_no}': int for label_no in range(35)}}

In [None]:
target = pd.read_csv(TARGET_TRAIN_PATH, dtype=target_dtypes)
target.head()

In [None]:
payments['client_id'].value_counts() # transaction data is available for all clients

In [None]:
target = target.set_index('client_id')

In [None]:
# pd.read_csv(CLIENT_ID_TEST_PATH)

## Feature Engineering

In [None]:
def percentage_outgoing(x):
    """Return the share of entries in ``x`` that are ``False``/``0``.

    Despite the name, the result is the fraction of *non*-outgoing payments:
    ``1`` when no entry is truthy, ``0`` when every entry is truthy.

    Parameters
    ----------
    x : pandas.Series
        ``is_outgoing`` values (boolean or 0/1) of a single client.

    Returns
    -------
    float
        Fraction of non-missing entries equal to ``0``; ``nan`` when ``x``
        holds no non-missing entries.
    """
    # Missing values are ignored, as value_counts() would ignore them.
    values = x.dropna()
    if len(values) == 0:
        # Keep the feature numeric instead of injecting an 'Error' string.
        return float('nan')
    # Compare the values themselves: indexing value_counts() with ``[0]`` is a
    # label lookup on some pandas versions and a positional lookup on others
    # when the index is boolean, which silently changes the result.
    return (values == 0).mean()

In [None]:
def clear_false_data(payments):
    """Return ``payments`` without the rows whose flag columns are all ``False``.

    Parameters
    ----------
    payments : pandas.DataFrame
        Frame containing the contiguous columns ``flag_0`` .. ``flag_11``.

    Returns
    -------
    pandas.DataFrame
        A new frame with only the rows where at least one flag is set; the
        input frame is not modified.
    """
    no_flag_set = payments.loc[:, 'flag_0':'flag_11'].eq(False).all(axis=1)
    # Filter with the boolean mask rather than dropping by index label:
    # label-based drop would also remove unrelated rows that share a
    # duplicated index label with a flag-less row.
    return payments.loc[~no_flag_set]

In [None]:
contractor_ids = payments['contractor_id'].value_counts().iloc[:100].index

In [None]:
contractor_ids

In [None]:
payments.loc[~payments['contractor_id'].isin(contractor_ids), 'contractor_id']='0'

In [None]:
payments = pd.get_dummies(payments, columns=['contractor_id'])

In [None]:
payments = pd.get_dummies(payments, columns=['channel'])

In [None]:
payments['contractor_id'] = contractors

In [None]:
# Rank contractors by frequency once: the ranking is identical on every
# iteration, so recomputing value_counts() for each slice is wasted work.
ranked_contractors = payments['contractor_id'].value_counts().index
for i in range(10):
    # i-th slice of 100 contractors; every other contractor goes to bucket '0'.
    contractor_ids = ranked_contractors[i*100:(i+1)*100]
    payments_copy = payments.copy()
    payments_copy.loc[~payments_copy['contractor_id'].isin(contractor_ids), 'contractor_id']='0'
    payments_copy = pd.get_dummies(payments_copy, columns=['contractor_id'])
    payments_copy.to_csv(f'feature_data_{i}.csv', index=False)
    # Free the encoded copy before building the next one.
    del payments_copy

In [None]:
for i in range(10):
    fd = pd.read_csv(f'/kaggle/working/feature_data_{i}.csv')
    # Every exported file repeats the base payments columns (and the '0'
    # bucket), so append only the columns payments does not have yet.
    # Concatenating whole frames would create duplicate labels such as a
    # second 'client_id', which breaks later group-bys on that column.
    new_cols = [col for col in fd.columns if col not in payments.columns]
    payments = pd.concat([payments, fd[new_cols]], axis=1)
    del fd

In [None]:
payments.columns

In [None]:
from scipy.stats import kurtosis

In [None]:
def quantile_25(x):
    """Return the 0.25 quantile (first quartile) of ``x``."""
    first_quartile = x.quantile(q=0.25)
    return first_quartile

def quantile_50(x):
    """Return the 0.5 quantile (median) of ``x``."""
    median_value = x.quantile(q=0.5)
    return median_value

def quantile_75(x):
    """Return the 0.75 quantile (third quartile) of ``x``."""
    third_quartile = x.quantile(q=0.75)
    return third_quartile

In [None]:
# Sum every one-hot contractor column listed in contractor_ids per client.
dict_features = {f'contractor_id_{cid}': 'sum' for cid in contractor_ids.values}

In [None]:
features1 = {'channel_app': 'sum', 'channel_atm': 'sum', 'channel_pos': 'sum','channel_web': 'sum',\
                                        'dt_hour': ['skew', 'mean', 'std', quantile_25, quantile_50, quantile_75], 'dt_day': ['min', 'max', 'skew', 'mean', 'std', quantile_25, quantile_50, quantile_75, 'nunique'], 'is_outgoing': ['sum', 'count'], \
                                        'contractor_id':'nunique'}

In [None]:
features1.update(dict_features)

In [None]:
df = payments.groupby('client_id', as_index=False).agg(features1)

df.columns = ['_'.join(col).rstrip('_') for col in df.columns.values]

In [None]:
df = df.set_index('client_id')

In [None]:
cols = ['mean', 'median', 'std', 'min', 'max']+(list(payments.columns.values))

In [None]:
def generate_features(payments):
    """Build one feature row per client from the raw ``payments`` frame.

    Combines amount statistics, per-flag sums, the value returned by
    ``percentage_outgoing`` for ``is_outgoing``, the module-level aggregate
    frame ``df`` (joined on the ``client_id`` index) and a total flag count.

    NOTE(review): a second ``generate_features`` is defined later in this
    file and replaces this one once that cell has run.

    Parameters
    ----------
    payments : pandas.DataFrame
        Needs ``client_id``, ``amount``, ``is_outgoing`` and the contiguous
        columns ``flag_0`` .. ``flag_11``.

    Returns
    -------
    pandas.DataFrame
        Feature frame indexed by ``client_id``.
    """
#     payments = clear_false_data(payments)
    
    groupby_client = payments.groupby('client_id')
    
    # Amount statistics; the resulting columns are named after the aggregation.
    fts = groupby_client['amount'].agg(['mean', 'median', 'std', 'min', 'max', 'sum'])
    # Number of rows with each flag set, stored as flag_<i>_count.
    fts[[f'flag_{i}_count' for i in range(12)]] = groupby_client[[f'flag_{i}' for i in range(12)]].sum()
    
#     is_outgoing_counts = groupby_client.apply(lambda x: percentage_outgoing(x['is_outgoing']))
    is_outgoing_counts = groupby_client['is_outgoing'].apply(lambda x: percentage_outgoing(x))
    fts['0% outgoing'] = is_outgoing_counts
    
    # `df` is read from module scope; it must already be indexed by client_id.
    fts = fts.merge(df, left_index = True, right_index=True)
    
    # Total number of set flags across flag_0 .. flag_11 for each client.
    fts['flags_sum'] = groupby_client.apply(lambda x: sum(x.loc[:, 'flag_0':'flag_11'].sum()))
    
#     dt_day_kurt = groupby_client.apply(lambda x: x['dt_day'].kurtosis())
#     fts['dt_day_kurtosis'] = dt_day_kurt

#     dt_hour_kurt = groupby_client.apply(lambda x: x['dt_hour'].kurtosis())
#     fts['dt_hour_kurtosis'] = dt_hour_kurt

#     most_channel = groupby_client.apply(lambda x: x['channel'].value_counts().index[0])
#     fts['most channel'] = most_channel

    return fts

In [None]:
def generate_features(payments, top_n_contractors=200):
    """Aggregate raw payment rows into one feature row per client.

    The most frequent contractors are one-hot encoded (all others share the
    ``'0'`` bucket), channels are one-hot encoded, and the result is grouped
    by ``client_id``.

    Unlike the previous implementation, the input frame is left untouched and
    the original contractor ids are used for the ``contractor_id`` ``nunique``
    feature. Previously the saved ids aliased the column that was then
    overwritten with ``'0'``, so the unique count was taken over the collapsed
    ids. Amount statistics are returned under flat names (``mean``, ``std``,
    ``min``, ``max``, ``sum``) so the final merge joins two frames with
    single-level columns.

    Parameters
    ----------
    payments : pandas.DataFrame
        Raw payments with ``client_id``, ``contractor_id``, ``channel``,
        ``amount``, ``dt_hour``, ``dt_day``, ``is_outgoing`` and
        ``flag_0`` .. ``flag_11``. The channel column must produce the
        ``channel_app``, ``channel_atm``, ``channel_pos`` and ``channel_web``
        dummies.
    top_n_contractors : int, default 200
        Number of most frequent contractors that get their own column.

    Returns
    -------
    pandas.DataFrame
        Feature frame indexed by ``client_id``.
    """
    raw_contractors = payments['contractor_id']
    contractor_ids = raw_contractors.value_counts().head(top_n_contractors).index

    # Collapse rare contractors on a derived series, never on the caller's frame.
    collapsed = raw_contractors.where(raw_contractors.isin(contractor_ids), '0')
    encoded = pd.get_dummies(
        payments.assign(contractor_id=collapsed),
        columns=['contractor_id', 'channel'],
    )
    # Restore the untouched ids so 'nunique' counts real contractors.
    encoded['contractor_id'] = raw_contractors

    features1 = {
        'channel_app': 'sum',
        'channel_atm': 'sum',
        'channel_pos': 'sum',
        'channel_web': 'sum',
        'dt_hour': ['mean', 'std'],
        'dt_day': ['min', 'max', 'skew', 'mean', 'std', 'nunique'],
        'is_outgoing': ['sum', 'count'],
        'contractor_id': 'nunique',
    }
    features1.update({f'contractor_id_{cid}': 'sum' for cid in contractor_ids.values})

    df = encoded.groupby('client_id', as_index=False, sort=False).agg(features1)
    # Flatten (column, aggregation) pairs into 'column_aggregation' names.
    df.columns = ['_'.join(col).rstrip('_') for col in df.columns.values]
    df = df.set_index('client_id')

    groupby_client = encoded.groupby('client_id')

    # Selecting the column first yields flat aggregation names, which keeps
    # the merge below between frames with the same number of column levels.
    fts = groupby_client['amount'].agg(['mean', 'std', 'min', 'max', 'sum'])

    flag_columns = [f'flag_{i}' for i in range(12)]
    fts = fts.join(groupby_client[flag_columns].sum().add_suffix('_count'))

    fts['0% outgoing'] = groupby_client['is_outgoing'].apply(percentage_outgoing)

    return fts.merge(df, left_index=True, right_index=True)

In [None]:
features = generate_features(payments)
features

In [None]:
payments

In [None]:
features.columns

In [None]:
features.to_csv('features_data.csv')

In [None]:
import random
SEED = 0xCAFEC0DE

random.seed(SEED)
np.random.seed(SEED)

In [None]:
features.shape

In [None]:
# target = target.loc[features.index]

In [None]:
def stratified_split_cached(X, y, split_idx_file):
    """Split ``X``/``y`` into train and validation parts by index labels.

    When ``split_idx_file`` exists, the train/validation index labels are read
    from its ``'train'`` and ``'val'`` JSON entries. Otherwise ``y`` is
    shuffled with ``SEED`` and split with ``iterative_train_test_split``
    (15% validation).

    NOTE: the computed split is not written back to ``split_idx_file`` here,
    so the cached branch only runs when the file was produced elsewhere.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)`` selected with ``.loc``.
    """
    if not os.path.isfile(split_idx_file):
        shuffled = sku.shuffle(y, random_state=SEED)
        # The splitter works on 2-D arrays, so the index labels play the role
        # of a single-column "feature" matrix that gets split alongside y.
        label_column = np.expand_dims(shuffled.index, 1)
        train_idx, _, val_idx, _ = iterative_train_test_split(
            label_column, np.array(shuffled), test_size=0.15
        )
        train_idx = train_idx.squeeze(1)
        val_idx = val_idx.squeeze(1)
    else:
        with open(split_idx_file, 'r') as f:
            cached_split = json.load(f)
        train_idx = cached_split['train']
        val_idx = cached_split['val']

    train_part = (X.loc[train_idx], y.loc[train_idx])
    val_part = (X.loc[val_idx], y.loc[val_idx])
    return train_part + val_part

In [None]:
X_train, y_train, X_val, y_val = stratified_split_cached(features, target, 'split_cache.json')

In [None]:
len(X_train), len(y_train), len(X_val), len(y_val)

In [None]:
X_train['0% outgoing']

In [None]:
y_train

In [None]:
X_train = pd.read_csv('/kaggle/input/train-data-100/X_train_final_100.csv').iloc[:,:-3500]
y_train = pd.read_csv('/kaggle/input/train-data-100/y_train_final_100.csv')

In [None]:
X_train = X_train.set_index('client_id')

In [None]:
y_train

In [None]:
y_train = y_train.set_index('client_id')

In [None]:
X_train

In [None]:
X_train_2, y_train_2, X_val_2, y_val_2 = stratified_split_cached(X_train, y_train, 'split_cache3.json')

In [None]:
X_train_2

In [None]:
X_train_bf = pd.read_csv('/kaggle/input/train-data-100/X_train_final_2.csv').iloc[:,:-300]
y_train_bf = pd.read_csv('/kaggle/input/train-data-100/y_train_final_2.csv')

In [None]:
X_train_bf = X_train_bf.set_index('client_id')

In [None]:
y_train_bf = y_train_bf.set_index('client_id')

In [None]:
X_train_bf

In [None]:
X_train_bf_2, y_train_bf_2, X_val_bf_2, y_val_bf_2 = stratified_split_cached(X_train_bf, y_train_bf, 'split_cache4.json')

In [None]:
X_train_bf_2

## Modelling

### CatBoost

In [None]:
import catboost as cb
import optuna
from optuna.samplers import TPESampler

In [None]:
from sklearn.metrics import mean_absolute_error as mae
def objective(trial):
    """Optuna objective: fit a CatBoost regressor and return its validation MAE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error on the 30% hold-out part (lower is better).
    """
    # Imported locally: the visible part of the file does not import it.
    from sklearn.model_selection import train_test_split

    params = {
        "random_state": trial.suggest_categorical("random_state", [2022]),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.3, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        "n_estimators": 1000,
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'task_type': trial.suggest_categorical('task_type', ['GPU']),
        'loss_function': trial.suggest_categorical('loss_function', ['MAE']),
        'eval_metric': trial.suggest_categorical('eval_metric', ['MAE'])
    }

    # catboost is imported as `cb`; the bare class name was never bound.
    model = cb.CatBoostRegressor(**params)
    # NOTE(review): `X` and `y` are module-level names that are not assigned
    # anywhere in the visible file - bind them to the feature/target frames
    # before running a study.
    X_train_tmp, X_valid_tmp, y_train_tmp, y_valid_tmp = train_test_split(X, y, test_size=0.3, random_state=42)
    model.fit(
        X_train_tmp, y_train_tmp,
        eval_set=[(X_valid_tmp, y_valid_tmp)],
        early_stopping_rounds=35, verbose=0
    )

    y_train_pred = model.predict(X_train_tmp)
    y_valid_pred = model.predict(X_valid_tmp)
    train_mae = mae(y_train_tmp, y_train_pred)
    valid_mae = mae(y_valid_tmp, y_valid_pred)

    print(f'MAE of Train: {train_mae}')
    print(f'MAE of Validation: {valid_mae}')

    return valid_mae

allow_optimize = 0

In [None]:
# Budget of the hyper-parameter search: number of trials and wall-clock limit
# in seconds, whichever is reached first.
TRIALS = 100
TIMEOUT = 3600

if allow_optimize:
    sampler = TPESampler(seed=42)

    study = optuna.create_study(
        study_name = 'cat_parameter_opt',
        direction = 'minimize',
        sampler = sampler,
    )
    # TIMEOUT was defined but never passed on; hand it to the study.
    study.optimize(objective, n_trials=TRIALS, timeout=TIMEOUT)
    print("Best Score:",study.best_value)
    print("Best trial",study.best_trial.params)
    
    best_params = study.best_params
    # catboost is imported as `cb`; the bare class name was never bound.
    model_tmp = cb.CatBoostRegressor(**best_params, n_estimators=1000, verbose=100).fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=35)

In [None]:
def make_pool(X, y=None):
    """Wrap features ``X`` and optional labels ``y`` in a CatBoost ``Pool``."""
    pool = cb.Pool(data=X, label=y)
    return pool

In [None]:
# One independent binary CatBoost classifier per target column type_0..type_34.
catboost_settings = dict(
    iterations=10000,
    loss_function='Logloss',
    random_seed=SEED,
    task_type="GPU",
    max_depth=6,
)

models_cb = []
for i in range(35):
    print('Fitting model', i)

    model_cb = cb.CatBoostClassifier(**catboost_settings)
    # Train on the whole training frame; no validation pool is used here.
    pool_train = make_pool(X_train, y_train[f'type_{i}'])
    model_cb.fit(pool_train, plot=False, verbose=2000)

    models_cb.append(model_cb)

In [None]:
def predict(X, model_zoo):
    """Predict every label with its own model and collect the results.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; its index becomes the index of the result.
    model_zoo : sequence
        Fitted models; the model at position ``i`` predicts ``type_{i}``.

    Returns
    -------
    pandas.DataFrame
        Integer predictions with one ``type_{i}`` column per model.
    """
    if len(model_zoo) == 0:
        # Nothing to predict: return a frame with no columns, same index.
        return pd.DataFrame(index=X.index)

    # The pool does not depend on the model, so build it once.
    pool = make_pool(X)
    preds = [model_cb.predict(pool) for model_cb in model_zoo]
    # Derive the column names from the zoo size instead of assuming 35 models.
    columns = [f'type_{i}' for i in range(len(preds))]
    preds = pd.DataFrame(np.array(preds).transpose(1, 0), index=X.index, columns=columns).astype(int)
    return preds

In [None]:
# preds = predict(X_val, models)
# print(fbeta_score(y_val, preds, beta=0.5, average='micro', zero_division=0))

In [None]:
preds = predict(X_train, models_cb)
print(fbeta_score(y_train, preds, beta=0.5, average='micro', zero_division=0))

In [None]:
prediction_cb = predict(features_test, models_cb).astype(int)

In [None]:
prediction_cb = prediction_cb.reset_index()

In [None]:
prediction_cb.to_csv('submission_2000_cb.csv', index=False)

### XGBoost

In [None]:
models = []
for i in range(35):
    model = XGBClassifier(tree_method='gpu_hist', max_depth=5, predictor = 'gpu_predictor', n_estimators=2000)
    model.fit(X_train_bf_2, y_train_bf_2[f'type_{i}'], eval_set=[(X_val_bf_2, y_val_bf_2[f'type_{i}'])], early_stopping_rounds=10,verbose=250)
    
    models.append(model)

In [None]:
results_yi = []
score_yi = []

for i in range(35):
    prediction_yi = models[i].predict(X_val_bf_2)
    results_yi.append(prediction_yi)
    score_yi.append(fbeta_score(y_val_bf_2[f'type_{i}'], prediction_yi, beta=0.5, average='micro', zero_division=0))

In [None]:
# Names of the 35 target columns, type_0 .. type_34.
classes = [f'type_{label_no}' for label_no in range(35)]

In [None]:
prediction = pd.DataFrame(np.array(results_yi).T, columns=classes, index=y_val_bf_2.index)

In [None]:
prediction

In [None]:
print(fbeta_score(y_val_bf_2, prediction, beta=0.5, average='micro', zero_division=0))

In [None]:
np.array(score_yi)

In [None]:
model = XGBClassifier(max_depth=9, n_estimators=7000, tree_method='gpu_hist', predictor = 'gpu_predictor')

In [None]:
model.fit(X_train_bf, y_train_bf)

In [None]:
prediction = model.predict(X_val_2)

In [None]:
prediction

In [None]:
print(fbeta_score(y_val_2, prediction, beta=0.5, average='micro', zero_division=0))

In [None]:
print(classification_report(y_val_2, prediction))

In [None]:
model.fit(X_train, y_train, verbose=200)

In [None]:
feature_result = pd.DataFrame(model.feature_importances_, X_train.columns)
feature_result.columns = ['result']
feature_result = feature_result.sort_values(by='result', ascending=False)
feature_result = feature_result[feature_result['result']>0.003]

In [None]:
plt.subplots(figsize=(10, 8), dpi=100)
plt.barh(feature_result.index, feature_result['result'])

plt.title('Значимость признаков', fontsize=16)
plt.xlabel('Значение', fontsize=14)
plt.ylabel('Признаки', fontsize=14)
plt.show()

In [None]:
X_train[feature_result.index]

In [None]:
model = XGBClassifier(eval_metric=fbeta_score, max_depth=8, n_estimators=5000, tree_method='gpu_hist', predictor='gpu_predictor')
model.fit(X_train[feature_result.index], y_train)

In [None]:
prediction = model.predict(X_val[feature_result.index])
print(fbeta_score(y_val, prediction, beta=0.5, average='micro', zero_division=0))

In [None]:
from sklearn.model_selection import GridSearchCV

allow_optimize = 1
if allow_optimize:
    param_grid={'max_depth': [4,5,6],
            'n_estimators': [300, 500, 700, 800, 1000],
            'min_child_weight' : [1,2,3,4],
            'gpu_id' : [0]
        }

    classifier = XGBClassifier(tree_method = 'gpu_hist', predictor = 'gpu_predictor')
    CV_classifier = GridSearchCV(classifier, param_grid, cv=2, scoring="accuracy", n_jobs= -1, return_train_score = True, verbose = 3)
    CV_classifier.fit(X_train, y_train)
    
    print("The best hyperparameters are : ","\n")
    print(CV_classifier.best_params_)

In [None]:
if allow_optimize: 
    # The grid search object is bound to CV_classifier; reading CV_regressor
    # here raised NameError because it had not been assigned yet.
    CV_regressor = CV_classifier.best_estimator_
else:
    # XGBRegressor is never imported in the visible file and the grid search
    # tunes an XGBClassifier, so fall back to a classifier as well.
    CV_regressor = XGBClassifier(tree_method = 'gpu_hist', predictor = 'gpu_predictor', gpu_id = 0, max_depth = 4, n_estimators = 100)
# NOTE(review): the original fitted on undefined names `X`, `y`; the training
# frames used by the grid search are assumed to be the intended data - confirm.
CV_regressor.fit(X_train, y_train)

### Chain Classifier

In [None]:
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.ensemble import LabelSpacePartitioningClassifier, MajorityVotingClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.multioutput import ClassifierChain

chain_cls = ClassifierChain(XGBClassifier(eval_metric=fbeta_score, n_estimators=500, tree_method='gpu_hist'))
# chain_cls = ClassifierChain(cb.CatBoostClassifier(iterations=400, loss_function='Logloss', random_seed=SEED, verbose=200), random_state=5, cv=5)

In [None]:
chain_cls.fit(X_train, y_train)

In [None]:
chain_prediction = chain_cls.predict(X_val)

In [None]:
print(fbeta_score(y_val, chain_prediction, beta=0.5, average='micro', zero_division=0))

### Voting Classifier on XGBoost and CatBoost

In [None]:
from sklearn.ensemble import VotingClassifier

# `voting='hard'` was passed twice, which is a SyntaxError (repeated keyword).
# NOTE(review): `clf2` is not defined anywhere in the visible file - bind the
# CatBoost estimator to it before running this cell.
voting = VotingClassifier(estimators=[('xgboost', model), ('catboost', clf2)], voting='hard', n_jobs=-1)

## Submission

In [None]:
payments_test = pd.read_csv(PAYMENTS_TEST_PATH, dtype=payments_dtypes)
payments_test

In [None]:
contractors_test = payments_test['contractor_id']

In [None]:
# Keep the 50 most frequent test contractors; all others share one bucket.
contractor_ids_test = payments_test['contractor_id'].value_counts().head(50).index
# Use the string '0' (not the integer 0): contractor_id is read as str, and
# the training-side encoding uses the same '0' label for the fallback bucket.
payments_test.loc[~payments_test['contractor_id'].isin(contractor_ids_test), 'contractor_id']='0'
payments_test = pd.get_dummies(payments_test, columns=['contractor_id'])
payments_test = pd.get_dummies(payments_test, columns=['channel'])

In [None]:
payments_test['contractor_id'] = contractors_test

In [None]:
dict_features_test = {}
for i in contractor_ids_test.values:
    dict_features_test[f'contractor_id_{i}'] = 'sum'

In [None]:
features2 = {'channel_app': 'sum', 'channel_atm': 'sum', 'channel_pos': 'sum','channel_web': 'sum',\
                                        'dt_hour': ['skew', 'mean', 'std', quantile_25, quantile_50, quantile_75], 'dt_day': ['min', 'max', 'skew', 'mean', 'std', quantile_25, quantile_50, quantile_75, 'nunique'], 'is_outgoing': ['sum', 'count'], \
                                        'contractor_id':'nunique'}

In [None]:
features2.update(dict_features_test)

In [None]:
df = payments_test.groupby('client_id', as_index=False).agg(features2)

# Flatten the (column, aggregation) pairs into 'column_aggregation' names.
# The columns come from `df` itself; `df_test` was never defined.
df.columns = ['_'.join(col).rstrip('_') for col in df.columns.values]

In [None]:
df = df.set_index('client_id')

In [None]:
payments_test.columns

In [None]:
features_test = generate_features(payments_test)
features_test

In [None]:
print('done')

In [None]:
del X_train

In [None]:
# The training targets are bound to `y_train`; `Y_train` was never defined.
del y_train

In [None]:
features_test = pd.read_csv('/kaggle/input/submissionfeatures/Submission_Features_Dataframe.csv').iloc[:,:-2000]


In [None]:
features_test = pd.read_csv('/kaggle/input/submissionfeatures/Submission_Features_Dataframe2.csv').iloc[:,:-300]

In [None]:
features_test

In [None]:
features_test = features_test.set_index('client_id')

In [None]:
features_test

In [None]:
prediction_test = model.predict(features_test).astype(int)
# prediction_test = chain_cls.predict(features_test).astype(int)
prediction_test

In [None]:
prediction_test

In [None]:
# Submission label columns, type_0 .. type_34, in order.
classes = [f'type_{label_no}' for label_no in range(35)]

In [None]:
submission = pd.DataFrame(prediction_test, columns = classes)
submission.head()

In [None]:
submission['client_id'] = features_test.index

In [None]:
classes.insert(0, 'client_id')

In [None]:
submission = submission[classes]

In [None]:
submission

In [None]:
submission.to_csv('submission_500_bf.csv', index=False)