In [17]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [10]:
# Load the raw train/test splits; paths are relative to the notebook's working directory.
train, test = (pd.read_csv(path) for path in ('data/train.csv', 'data/test.csv'))

In [11]:
def _collapse_one_hot(df, groups):
    """Replace each group of indicator columns in ``df`` with one label column.

    For every ``name -> columns`` pair in ``groups`` a column ``name`` is
    appended that holds, per row, the label of the column with the largest
    value among ``columns``; the indicator columns are then dropped.
    ``df`` is modified in place and is also returned.
    """
    for name, columns in groups.items():
        # The stored label is the full original column name, prefix included.
        df[name] = df[columns].idxmax(axis=1)
        df.drop(columns=columns, inplace=True)
    return df


def process_data_for_catboost(train, test):
    """Collapse the one-hot column groups into single categorical columns.

    Three groups are looked up by column-name prefix in ``train``:
    ``purpose_*`` -> ``purpose``, ``sub_grade_*`` -> ``subgrade`` and
    ``home_ownership_*`` -> ``home_ownership``. The same column lists are then
    applied to ``test``, so ``test`` must contain every matching column that
    ``train`` has (a missing one raises ``KeyError``).

    Both frames are modified in place; pass copies to keep the originals.

    Note: ``idxmax`` returns the first column on ties, so a row in which no
    indicator of a group is set is labelled with that group's first column.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames containing the indicator columns described above.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, test)`` - the same objects that were passed in, with the new
        columns appended in the order purpose, subgrade, home_ownership.
    """
    # Column lists are resolved once, from train, before anything is mutated.
    prefixes = {
        'purpose': 'purpose_',
        'subgrade': 'sub_grade_',
        'home_ownership': 'home_ownership_',
    }
    groups = {
        name: [col for col in train.columns if col.startswith(prefix)]
        for name, prefix in prefixes.items()
    }

    return _collapse_one_hot(train, groups), _collapse_one_hot(test, groups)

In [21]:
# Every model family starts from its own independent copy of the raw frames,
# so the in-place reshaping done for CatBoost cannot leak into the
# logistic-regression inputs (or into `train` / `test` themselves).
logreg_train = train.copy()
logreg_test = test.copy()
catboost_train, catboost_test = process_data_for_catboost(
    train.copy(),
    test.copy(),
)

In [22]:
def get_features(df: pd.DataFrame, model_type='catboost') -> pd.DataFrame:
    """Add engineered loan features to ``df`` and return it.

    ``df`` is modified in place and the very same object is returned; pass a
    copy if the original frame must stay untouched.

    Columns read: ``installment``, ``term``, ``funded_amnt``, ``dti``,
    ``annual_inc``, ``mths_since_recent_inq``, ``avg_cur_bal`` (plus, for
    ``model_type='catboost'``, ``num_accts_ever_120_pd``,
    ``num_tl_90g_dpd_24m`` and ``tot_hi_cred_lim``).

    Columns added, in this order: ``psk``, ``percent``, ``percent_per_year``,
    ``residual_dti``, ``loan_income_ratio``, ``additional_dti``,
    ``dti_ratio_min``, ``dti_ratio_max``, ``is_first_loan``,
    ``is_not_sber_client``.

    Parameters
    ----------
    df : pd.DataFrame
        Loan-level data containing the columns listed above.
    model_type : str, default 'catboost'
        * ``'catboost'`` - rows flagged by ``is_first_loan`` /
          ``is_not_sber_client`` get the corresponding source columns set to
          missing; the flags stay boolean.
        * ``'logreg'`` - source columns are left as they are and the two
          flags are cast to integers.
        * any other value - the flags stay boolean and nothing else changes.

    Returns
    -------
    pd.DataFrame
        The input frame ``df`` with the additional columns.
    """
    # Total amount to be repaid over the whole loan: one installment per term unit.
    df['psk'] = df['installment'] * df['term']

    # Overpayment relative to the funded amount (a fraction, not multiplied by 100).
    df['percent'] = (df['psk'] - df['funded_amnt']) / df['funded_amnt']

    # Annualised overpayment in percent, rounded to one decimal.
    # 1200 = 12 * 100, which presumably assumes `term` is in months - TODO confirm.
    df['percent_per_year'] = (df['percent'] / df['term'] * 1200).round(decimals=1)

    # Twelve installments as a percentage of annual income, i.e. the burden of
    # this loan alone. Presumably on the same scale as `dti` - TODO confirm.
    # Computed once and reused for both DTI variants below.
    loan_dti = (df['installment'] * 12 / df['annual_inc']) * 100

    # DTI with the current loan's burden subtracted.
    df['residual_dti'] = df['dti'] - loan_dti
    # Loan amount relative to annual income.
    df['loan_income_ratio'] = df['funded_amnt'] / df['annual_inc']
    # DTI with the current loan's burden added.
    df['additional_dti'] = df['dti'] + loan_dti

    # Ratios of the reported DTI to the two adjusted variants. A zero
    # denominator yields inf/NaN (pandas does not raise on division by zero).
    df['dti_ratio_min'] = df['dti'] / df['residual_dti']
    df['dti_ratio_max'] = df['dti'] / df['additional_dti']

    # NOTE(review): these narrow value ranges look like markers for imputed
    # (originally missing) values - confirm against the data preparation step.
    # `between` is inclusive on both ends.
    df['is_first_loan'] = df['mths_since_recent_inq'].between(6.01, 6.99)
    df['is_not_sber_client'] = df['avg_cur_bal'].between(13841.01, 13841.02)

    if model_type == 'catboost':
        # Blank out the flagged values so they are treated as missing.
        df.loc[df['is_first_loan'], 'mths_since_recent_inq'] = None
        df.loc[
            df['is_not_sber_client'], ['num_accts_ever_120_pd', 'num_tl_90g_dpd_24m', 'avg_cur_bal', 'tot_hi_cred_lim']
        ] = None
    elif model_type == 'logreg':
        # Keep the source values and expose the flags as 0/1 integers.
        df['is_first_loan'] = df['is_first_loan'].astype('int')
        df['is_not_sber_client'] = df['is_not_sber_client'].astype('int')

    return df

# Run feature engineering for both model families. Train and test of a family
# go through the same call, train first, with the model type spelled out.
catboost_train, catboost_test = (
    get_features(frame, model_type='catboost')
    for frame in (catboost_train, catboost_test)
)
logreg_train, logreg_test = (
    get_features(frame, model_type='logreg')
    for frame in (logreg_train, logreg_test)
)

In [23]:
# Continuous columns that are standardised for the logistic-regression frames.
cols_to_scale = [
    # columns present in the source data
    'installment', 'dti', 'funded_amnt', 'annual_inc',
    'avg_cur_bal', 'tot_hi_cred_lim',
    # columns added by get_features
    'psk', 'percent', 'percent_per_year',
    'residual_dti', 'loan_income_ratio', 'additional_dti',
    'dti_ratio_min', 'dti_ratio_max',
]

# The scaler statistics are learned from the training frame only and then
# applied unchanged to both frames.
scaller = StandardScaler().fit(logreg_train[cols_to_scale])
logreg_train[cols_to_scale] = scaller.transform(logreg_train[cols_to_scale])
logreg_test[cols_to_scale] = scaller.transform(logreg_test[cols_to_scale])

In [25]:
# Persist the four prepared frames; index=False keeps the row index out of the files.
for frame, path in (
    (catboost_train, 'data/catboost_train.csv'),
    (catboost_test, 'data/catboost_test.csv'),
    (logreg_train, 'data/logreg_train.csv'),
    (logreg_test, 'data/logreg_test.csv'),
):
    frame.to_csv(path, index=False)