In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from lightgbm.basic import Booster
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt

# Raise pandas' display limits so wide/long frames are rendered without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# Forecast horizon; it is embedded in the name of the lead-target column.
HORIZON = 1

# Columns of the raw training table that are candidates for modelling.
# Each name appears exactly once ('postal_code' used to be listed twice).
ALL_COLUMNS = [
    'slctn_nmbr',
    'client_id',
    'npo_account_id',
    'npo_accnts_nmbr',
    'pmnts_type',
    'year',
    'quarter',
    'gender',
    'age',
    'clnt_cprtn_time_d',
    'actv_prd_d',
    'lst_pmnt_rcnc_d',
    'balance',
    'oprtn_sum_per_qrtr',
    'oprtn_sum_per_year',
    'frst_pmnt_date',
    'lst_pmnt_date_per_qrtr',
    'frst_pmnt',
    'lst_pmnt',
    'pmnts_sum',
    'pmnts_nmbr',
    'pmnts_sum_per_qrtr',
    'pmnts_sum_per_year',
    'pmnts_nmbr_per_qrtr',
    'pmnts_nmbr_per_year',
    'incm_sum',
    'incm_per_qrtr',
    'incm_per_year',
    'mgd_accum_period',
    'mgd_payment_period',
    'phone_number',
    'email',
    'lk',
    'assignee_npo',
    'assignee_ops',
    'postal_code',  # nulls
    'region',
    'citizen',
    'fact_addrss',
    'appl_mrkr',
    'evry_qrtr_pmnt',
]

# External per-quarter features that are merged onto the main table.
# Central-bank rate (Sazanakov's feature) followed by the first CAPM column
# (Andrey's CAPM set, contains nulls).
ADDITIONAL_COLUMNS = ['rate', 'RF_RGBI_Index']
# Remaining CAPM factor columns.
ADDITIONAL_COLUMNS += [
    'Wide_Market', 'Big', 'Small', 'Growth_L', 'Value_H',
    'Low_Return', 'High_Return', 'Low_Liq', 'High_Liq',
    'Low_DY', 'High_DY', 'No_Div', 'Was_Div',
    'Private', 'SOE', 'Low_PE', 'High_PE',
]
# Every factor above (everything after the first two entries) also has a
# '_TR'-suffixed counterpart, listed in the same order.
ADDITIONAL_COLUMNS += [f'{factor}_TR' for factor in ADDITIONAL_COLUMNS[2:]]
# IMOEX columns (Sazanakov's feature).
ADDITIONAL_COLUMNS += ['price', 'open', 'max', 'min', 'diff_percent']

# Name of the target column. The identifier keeps its existing spelling
# because other cells reference it.
TARGER_COLUMN = 'churn'

# Columns excluded from the feature set.
NOT_TRAIN_COLUMNS = [
    'client_id',
    'npo_account_id',
    'frst_pmnt_date',  # nulls
    'lst_pmnt_date_per_qrtr',  # nulls
]

# Features handed to the model as categorical.
CATEGORY_COLUMNS = [
    'quarter',
    'region',  # nulls
]

# Name of the column holding the target shifted forward by HORIZON.
LEAD_TARGET_COLUMN = f"{TARGER_COLUMN}_lead_{HORIZON}"

# Build the feature list in a deterministic order. Going through `set` made the
# column order depend on string hashing, which differs between interpreter
# sessions; `dict.fromkeys` removes duplicates while keeping declaration order.
TRAIN_COLUMNS = [
    column
    for column in dict.fromkeys(ALL_COLUMNS)
    if column not in set(NOT_TRAIN_COLUMNS)
]
TRAIN_COLUMNS.extend(ADDITIONAL_COLUMNS)

TRAIN_COLUMNS_WITHOUT_CATEGORY = [
    column
    for column in dict.fromkeys(TRAIN_COLUMNS)
    if column not in set(CATEGORY_COLUMNS)
]

In [3]:
# Load the training table and drop exact duplicate rows.
data = pd.read_csv('data/train.csv', sep=',').drop_duplicates()

# Order each account's rows by quarter so the shift below looks "forward in time".
data = data.sort_values(by=['npo_account_id', 'quarter']).reset_index(drop=True)

# Lead target: the target value found HORIZON rows later within the same account.
# The shift and the source column now come from the same constants that name the
# column, so changing HORIZON or TARGER_COLUMN cannot silently mislabel it.
# NOTE(review): the shift is positional - if an account has gaps between quarters,
# the value comes from its next available row, not from the next calendar quarter.
data[LEAD_TARGET_COLUMN] = data.groupby('npo_account_id')[TARGER_COLUMN].shift(-HORIZON)

In [4]:
# Per-quarter rate features.
# NOTE(review): absolute, user-specific path - the cell only runs on that machine.
cb_rate = pd.read_csv('/home/rsharafetdinov/repos/research/ufo_npo/features/rates_q.csv')

# Left-join on 'quarter'; the row count is printed on both sides of the join so
# that accidental row duplication would be visible.
print('before', len(data))
data = data.merge(cb_rate, on='quarter', how='left')
print('after', len(data))

before 3587971
after 3587971


In [5]:
# CAPM factor table: ';'-separated, with a '.' column that is discarded and a
# 'Date' column that serves as the quarter key.
# NOTE(review): absolute, user-specific path - the cell only runs on that machine.
capm = (
    pd.read_csv('/home/rsharafetdinov/repos/research/ufo_npo/features/capm_factors.csv', sep=';')
    .drop(columns='.')
    .rename(columns={'Date': 'quarter'})
)

# All non-key columns are text with a decimal comma; convert them to floats.
for column in [name for name in capm.columns if name != 'quarter']:
    capm[column] = capm[column].str.replace(',', '.').astype(float)

# Left-join on 'quarter', printing the row count before and after the join.
print('before', len(data))
data = data.merge(capm, on='quarter', how='left')
print('after', len(data))

before 3587971
after 3587971


In [6]:
# Per-quarter IMOEX features.
# NOTE(review): absolute, user-specific path - the cell only runs on that machine.
imoex = pd.read_csv('/home/rsharafetdinov/repos/research/ufo_npo/features/imoex_quart.csv', sep=',')

# Left-join on 'quarter', printing the row count before and after the join.
print('before', len(data))
data = data.merge(imoex, on='quarter', how='left')
print('after', len(data))

before 3587971
after 3587971


In [7]:
# Missing external features become 0 (all additional columns in one assignment).
data[ADDITIONAL_COLUMNS] = data[ADDITIONAL_COLUMNS].fillna(0)

# Missing postal codes become 0; missing regions get an explicit placeholder label.
for column, filler in (('postal_code', 0), ('region', 'Not Info')):
    data[column] = data[column].fillna(filler)

In [8]:
# Rows without a lead target cannot be used for supervised training.
data = data.dropna(subset=[LEAD_TARGET_COLUMN])

In [9]:
# Feature matrix and lead-target vector for the model fitted on the full table.
X = data.loc[:, TRAIN_COLUMNS]
y = data.loc[:, LEAD_TARGET_COLUMN]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# CatBoost classifier; macro-averaged F1 is the reported metric and
# CATEGORY_COLUMNS are treated as categorical features.
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.24,
    depth=8,
    loss_function='Logloss',
    eval_metric='TotalF1:average=Macro',
    cat_features=CATEGORY_COLUMNS,
    random_seed=42,
)

In [11]:
# Fit on the whole labelled table. No evaluation set is supplied here, so
# `use_best_model` is not passed: CatBoost cannot honour it without one and only
# emits a warning while switching it off.
model.fit(
    X,
    y,
    cat_features=CATEGORY_COLUMNS,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.6703389	total: 767ms	remaining: 12m 46s
1:	learn: 0.6581465	total: 1.46s	remaining: 12m 8s
2:	learn: 0.6759997	total: 2.14s	remaining: 11m 52s
3:	learn: 0.6928029	total: 2.78s	remaining: 11m 33s
4:	learn: 0.7085900	total: 3.43s	remaining: 11m 23s
5:	learn: 0.7162804	total: 4.12s	remaining: 11m 22s
6:	learn: 0.7191322	total: 4.76s	remaining: 11m 15s
7:	learn: 0.7212357	total: 5.36s	remaining: 11m 4s
8:	learn: 0.7236341	total: 5.98s	remaining: 10m 58s
9:	learn: 0.7239302	total: 6.64s	remaining: 10m 57s
10:	learn: 0.7297626	total: 7.32s	remaining: 10m 58s
11:	learn: 0.7308252	total: 7.89s	remaining: 10m 50s
12:	learn: 0.7339043	total: 8.5s	remaining: 10m 45s
13:	learn: 0.7340955	total: 9.13s	remaining: 10m 43s
14:	learn: 0.7333016	total: 9.76s	remaining: 10m 40s
15:	learn: 0.7362763	total: 10.4s	remaining: 10m 41s
16:	learn: 0.7363667	total: 11s	remaining: 10m 38s
17:	learn: 0.7366314	total: 11.7s	remaining: 10m 37s
18:	learn: 0.7409758	total: 12.4s	remaining: 10m 38s
19:	lear

<catboost.core.CatBoostClassifier at 0x7fbb6ce3f730>

In [12]:
# model.save_model('final_model')

In [13]:
# Table to score for the submission.
test = pd.read_csv('test.csv', sep=',')

In [15]:
# Per-quarter rate features, joined onto the test table.
# NOTE(review): absolute, user-specific path - the cell only runs on that machine.
cb_rate = pd.read_csv('/home/rsharafetdinov/repos/research/ufo_npo/features/rates_q.csv')

# Left-join on 'quarter', printing the row count before and after the join.
print('before', len(test))
test = test.merge(cb_rate, on='quarter', how='left')
print('after', len(test))

before 633434
after 633434


In [16]:
# CAPM factor table: ';'-separated, with a '.' column that is discarded and a
# 'Date' column that serves as the quarter key.
# NOTE(review): absolute, user-specific path - the cell only runs on that machine.
capm = (
    pd.read_csv('/home/rsharafetdinov/repos/research/ufo_npo/features/capm_factors.csv', sep=';')
    .drop(columns='.')
    .rename(columns={'Date': 'quarter'})
)

# All non-key columns are text with a decimal comma; convert them to floats.
for column in [name for name in capm.columns if name != 'quarter']:
    capm[column] = capm[column].str.replace(',', '.').astype(float)

# Left-join on 'quarter', printing the row count before and after the join.
print('before', len(test))
test = test.merge(capm, on='quarter', how='left')
print('after', len(test))

before 633434
after 633434


In [17]:
# Per-quarter IMOEX features, joined onto the test table.
# NOTE(review): absolute, user-specific path - the cell only runs on that machine.
imoex = pd.read_csv('/home/rsharafetdinov/repos/research/ufo_npo/features/imoex_quart.csv', sep=',')

# Left-join on 'quarter', printing the row count before and after the join.
print('before', len(test))
test = test.merge(imoex, on='quarter', how='left')
print('after', len(test))

before 633434
after 633434


In [18]:
# Same missing-value treatment as for the training table:
# external features -> 0, postal code -> 0, region -> placeholder label.
test[ADDITIONAL_COLUMNS] = test[ADDITIONAL_COLUMNS].fillna(0)

for column, filler in (('postal_code', 0), ('region', 'Not Info')):
    test[column] = test[column].fillna(filler)

In [23]:
# Preview the predictions for the test rows; the result is only displayed, not stored.
model.predict(test[TRAIN_COLUMNS])

array([0., 0., 0., ..., 0., 0., 0.])

In [26]:
# Empty frame that the following cells fill column by column before it is written to CSV.
sample_to_predict = pd.DataFrame()

In [27]:
# Account identifier, copied from the test table.
sample_to_predict['npo_account_id'] = test['npo_account_id']

In [28]:
# Quarter, copied from the test table.
sample_to_predict['quarter'] = test['quarter']

In [30]:
# Model predictions for the test rows, computed on the TRAIN_COLUMNS feature set.
sample_to_predict['churn'] = model.predict(test[TRAIN_COLUMNS])

In [32]:
# Store the predictions as integers.
sample_to_predict['churn']  = sample_to_predict['churn'].astype(int)

In [35]:
# Write the predictions to CSV. The frame's index is written as well (default behaviour).
# NOTE(review): sample_submission.csv is displayed with an 'Unnamed: 0' column, so this
# presumably matches the expected layout - confirm before submitting.
sample_to_predict.to_csv('sample_predict.csv')

In [24]:
# Load the sample submission file.
sample = pd.read_csv('sample_submission.csv')

In [25]:
# Display the sample submission frame.
sample

Unnamed: 0,npo_account_id,quarter,churn
0,0x0000132B2D126446B3E105530BA834B9,2009Q2,0
1,0x0000132B2D126446B3E105530BA834B9,2012Q1,0
2,0x0000132B2D126446B3E105530BA834B9,2017Q1,0
3,0x0000132B2D126446B3E105530BA834B9,2017Q2,0
4,0x0000132B2D126446B3E105530BA834B9,2020Q2,0
...,...,...,...
633429,0xFFFF57AF7B56DE4891B4CD4E78A5B892,2020Q4,0
633430,0xFFFF5DA61B60574194D4B977023B15B9,2009Q4,0
633431,0xFFFF5DA61B60574194D4B977023B15B9,2017Q4,0
633432,0xFFFFF59735AAF14E9DABA46C84CAD40B,2017Q3,0


In [None]:
# Score the model on the hold-out split.
# NOTE(review): X_test / y_test are not defined in the visible cells (the
# train_test_split cell is commented out) - confirm they exist before running.
predictions = model.predict(X_test)
macro_f1 = f1_score(y_test, predictions, average='macro')
print('F1-score macro: ', macro_f1)
print(classification_report(y_test, predictions))

In [None]:
# train_split_time = '2021Q1'
# val_split_time = '2021Q2'

In [None]:
# Time-based split: quarters up to and including the first value are used for
# training, the second value is the validation quarter.
train_split_time, val_split_time = '2021Q3', '2021Q4'

In [None]:
# Time-based split on the quarter label (string comparison).
# Explicit copies are taken because later cells assign columns on `train` and
# `val`; without them those assignments target slices of `data` and trigger
# pandas' SettingWithCopyWarning.
train = data.loc[data['quarter'] <= train_split_time].copy()
val = data.loc[data['quarter'] == val_split_time].copy()

In [None]:
# subset = data.loc[(data['quarter'] == '2021Q4')]
# subset[LEAD_TARGET_COLUMN].value_counts(dropna=False)
# test = data.loc[data['npo_account_id'].isin(subset[subset[LEAD_TARGET_COLUMN].isnull()]['npo_account_id'].tolist())].groupby('npo_account_id')['quarter'].max().reset_index()

In [None]:
# Impute the training split with its own statistics: median postal code and the
# most frequent region. `assign` rebinds `train` to a new frame instead of writing
# into what may be a slice of `data`, which avoids SettingWithCopyWarning.
train = train.assign(
    postal_code=train['postal_code'].fillna(train['postal_code'].median()),
    region=train['region'].fillna(train['region'].mode().loc[0]),
)

In [None]:
# Impute the validation split with statistics taken from the TRAINING split, so the
# preprocessing applied at validation time is the one learned on the training data
# (previously the validation split's own median/mode were used).
# `assign` rebinds `val` to a new frame instead of writing into what may be a slice
# of `data`, which avoids SettingWithCopyWarning.
val = val.assign(
    postal_code=val['postal_code'].fillna(train['postal_code'].median()),
    region=val['region'].fillna(train['region'].mode().loc[0]),
)

In [None]:
def filter_train_data(data: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Return the rows of ``data`` that have a value in ``target_col``.

    Args:
        data: Input frame; it is not modified.
        target_col: Name of the column that must be non-null.

    Returns:
        A new frame with the rows whose ``target_col`` is not null, re-indexed
        from 0.
    """
    has_target = data[target_col].notna()
    return data.loc[has_target].reset_index(drop=True)

In [None]:
def train_clf_model(
    data: pd.DataFrame,
    target_col: str,
    features,
    cat_features,
    train_params,
    num_boost_round: int,
) -> Booster:
    """Train a LightGBM model on ``data``.

    Args:
        data: Frame holding both the feature columns and the target column.
        target_col: Name of the label column.
        features: Column names used as model inputs.
        cat_features: Accepted for interface compatibility but currently unused -
            the ``categorical_feature`` arguments are disabled in the body.
        train_params: Parameter dict passed to ``lgb.train``.
        num_boost_round: Number of boosting rounds.

    Returns:
        The value returned by ``lgb.train``.
    """
    feature_frame = data[features]
    labels = data[target_col].values

    # categorical_feature=cat_features is intentionally not passed (disabled).
    train_set = lgb.Dataset(
        data=feature_frame,
        label=labels,
        free_raw_data=True,
    )

    # categorical_feature=cat_features is intentionally not passed here either.
    return lgb.train(train_params, train_set, num_boost_round=num_boost_round)

In [None]:
# Cast the categorical features to pandas' 'category' dtype on both splits.
# The columns come from CATEGORY_COLUMNS so this cell stays in sync with the
# categorical features handed to the models, and `astype` with a mapping returns
# new frames instead of assigning into slices of `data` (no SettingWithCopyWarning).
train = train.astype({column: 'category' for column in CATEGORY_COLUMNS})
val = val.astype({column: 'category' for column in CATEGORY_COLUMNS})

In [None]:
# CatBoost classifier for the validation experiments, reporting macro-averaged F1.
model = CatBoostClassifier(
    n_estimators=500,
    learning_rate=0.12,
    eval_metric='TotalF1:average=Macro',
)

In [None]:
# Persist the model only once it has been trained: calling save_model on a
# freshly constructed (unfitted) CatBoost model raises an error, and in notebook
# order this cell comes before the fit cell.
if model.is_fitted():
    model.save_model('model')
else:
    print("Model is not fitted yet - skipping save_model('model'); run this cell after fit().")

In [None]:
# Fit with an evaluation set so the metric is tracked on held-out data.
# NOTE(review): X_train / y_train / X_test / y_test are not defined in the visible
# cells (the train_test_split cell is commented out) - confirm before running.
model.fit(
    X_train,
    y_train,
    cat_features=CATEGORY_COLUMNS,
    eval_set=(X_test, y_test),
    plot=True,
)

In [None]:
# Binarise the model output at 0.5 and report macro F1 plus the per-class report.
predictions = np.where(model.predict(X_test) > 0.5, 1, 0)
macro_f1 = f1_score(y_test, predictions, average='macro')
print('F1-score macro: ', macro_f1)
print(classification_report(y_test, predictions))

In [None]:
# Horizontal bar chart of feature importances, smallest at the bottom.
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(12, 12))
ax.barh(bar_positions, feature_importance[sorted_idx], align='center')
# Label each bar with the matching feature name.
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(X_test.columns)[sorted_idx])
ax.set_title('Feature Importance')


In [None]:
# Keep only the training rows that have a lead target (the index is reset by the helper).
train = filter_train_data(train, LEAD_TARGET_COLUMN)

In [None]:
# Distribution of the validation lead target, counting missing values as well.
val[LEAD_TARGET_COLUMN].value_counts(dropna=False)

In [None]:
# Remove validation rows without a lead target, then renumber the index from 0.
val = val.dropna(subset=[LEAD_TARGET_COLUMN])
val = val.reset_index(drop=True)

In [None]:
# Same distribution again, after rows with a missing lead target were dropped.
val[LEAD_TARGET_COLUMN].value_counts(dropna=False)

In [None]:
# Fit on the time-based training split and track the metric on the validation quarter.
model.fit(
    train[TRAIN_COLUMNS],
    train[LEAD_TARGET_COLUMN],
    cat_features=CATEGORY_COLUMNS,
    eval_set=(val[TRAIN_COLUMNS], val[LEAD_TARGET_COLUMN]),
    plot=True,
)

In [None]:
# Binarise the model output at 0.5 and score it against the validation lead target.
predictions = np.where(model.predict(val[TRAIN_COLUMNS]) > 0.5, 1, 0)
macro_f1 = f1_score(val[LEAD_TARGET_COLUMN], predictions, average='macro')
print('F1-score macro: ', macro_f1)
print(classification_report(val[LEAD_TARGET_COLUMN], predictions))

In [None]:
# Horizontal bar chart of feature importances, smallest at the bottom.
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(12, 12))
ax.barh(bar_positions, feature_importance[sorted_idx], align='center')
# Label each bar with the matching feature name.
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(val[TRAIN_COLUMNS].columns)[sorted_idx])
ax.set_title('Feature Importance')


In [None]:
# Number of boosting rounds passed to the LightGBM training helper.
NUM_BOOST_ROUND = 1500

# LightGBM training parameters for the binary classifier.
REG_TRAIN_PARAMS = dict(
    objective="binary",
    verbosity=-1,
    random_seed=42,
    num_threads=16,
    bagging_fraction=0.7,
    bagging_freq=1,
    feature_fraction=0.9,
    is_unbalance=True,
    learning_rate=0.025,
)

In [None]:
# Train the LightGBM model on the training rows that have a lead target.
horizon_train_data = filter_train_data(train, LEAD_TARGET_COLUMN)
model = train_clf_model(
    data=horizon_train_data,
    target_col=LEAD_TARGET_COLUMN,
    features=TRAIN_COLUMNS,
    cat_features=CATEGORY_COLUMNS,
    train_params=REG_TRAIN_PARAMS,
    num_boost_round=NUM_BOOST_ROUND,
)

In [None]:
# Number of validation rows whose lead target is missing.
val[LEAD_TARGET_COLUMN].isnull().sum()

In [None]:
# Distribution of the validation lead target, counting missing values as well.
val[LEAD_TARGET_COLUMN].value_counts(dropna=False)

In [None]:
# Validation rows that have a lead target, re-indexed from 0; `val` itself is left
# untouched because dropna returns a new frame.
val_clear = val.dropna(subset=[LEAD_TARGET_COLUMN]).reset_index(drop=True)

In [None]:
# Number of rows in val_clear with a missing lead target.
val_clear[LEAD_TARGET_COLUMN].isnull().sum()

In [None]:
# Distribution of the lead target in val_clear.
val_clear[LEAD_TARGET_COLUMN].value_counts()

In [None]:
# Consistency check: rows kept in val_clear plus rows with a missing target equals all rows of val.
val_clear.shape[0] + val[LEAD_TARGET_COLUMN].isnull().sum() == val.shape[0]

In [None]:
# model.save_model('model.txt')

In [None]:
# Threshold the model output at 0.5 and evaluate on the validation rows that have
# a lead target.
predictions = np.where(model.predict(val_clear[TRAIN_COLUMNS]) > 0.5, 1, 0)
# The printed label says "macro", so the score is macro-averaged here as in the other
# evaluation cells; `average=None` returned one score per class instead. The
# per-class figures are still available in the classification report below.
print('F1-score macro: ', f1_score(val_clear[LEAD_TARGET_COLUMN], predictions, average='macro'))
print(classification_report(val_clear[LEAD_TARGET_COLUMN], predictions))

In [None]:
# # Plot feature importance using Gain
# lgb.plot_importance(model, importance_type="gain", figsize=(7,6), title="LightGBM Feature Importance (Gain)")
# plt.show()
