In [1]:
# uni_D83AE5_uni (Baseline)
#  {"Logloss": 0.02929905326872508, "ROCAUC": 0.9784653910998551}
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from scipy.stats import gamma, kstest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    log_loss, 
    roc_auc_score, 
    roc_curve, 
    auc
)
import statsmodels.api as sm
import warnings

# Silence every warning category for the rest of the session; the dedicated
# FutureWarning filter is registered afterwards, in the same order as before.
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

# Read the contract-level data, drop rows whose BORROWER_ID is the 'xNullx'
# marker, then shuffle deterministically (seed 42) and renumber the index.
training_data = (
    pd.read_csv('training_data.csv')
    .loc[lambda frame: frame['BORROWER_ID'] != 'xNullx']
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Borrower list of the example submission; used later to filter predictions.
data_submission_example = pd.read_csv('data_submission_example.csv')

In [2]:
# Replace every missing value with 0 before attempting any type conversion.
training_data.fillna(0, inplace=True)

# Convert a column to a numeric dtype only when all of its values parse;
# otherwise leave it untouched. `errors='ignore'` is deprecated in pandas, so the
# parse failure is caught explicitly instead (and only parse failures are caught,
# unlike the previous bare `except`).
for col in training_data.columns:
    try:
        training_data[col] = pd.to_numeric(training_data[col])
    except (ValueError, TypeError):
        continue

# 1 where the recorded event code is 'K', 0 otherwise.
training_data['TARGET_EVENT_BINARY'] = np.where(training_data['TARGET_EVENT'] == 'K', 1, 0)

In [3]:
# Monetary / ratio columns grouped under the "lognormal" label
# (presumably candidates for a log transform -- TODO confirm against usage).
# Each column is listed exactly once; 'CONTRACT_INCOME' used to appear twice.
lognormal_variables = [
    'CONTRACT_CREDIT_LOSS', 'CONTRACT_DEPT_SERVICE_TO_INCOME',
    'CONTRACT_INCOME', 'CONTRACT_INSTALMENT_AMOUNT', 'CONTRACT_INSTALMENT_AMOUNT_2',
    'CONTRACT_LOAN_AMOUNT', 'CONTRACT_MARKET_VALUE', 'CONTRACT_MORTGAGE_LENDING_VALUE',
    'CONTRACT_LGD',
]
# Columns holding dates as day numbers (converted with origin='julian' below).
date_variables = ['CONTRACT_DATE_OF_LOAN_AGREEMENT', 'CONTRACT_MATURITY_DATE']

In [4]:
# A TARGET_EVENT_DAY of 0 is the fill value written by the earlier fillna(0),
# so turn it back into NaN before converting. The result is assigned back to the
# column: calling `.replace(..., inplace=True)` on `training_data[col]` acts on a
# temporary object and does not update the frame under pandas Copy-on-Write.
training_data['TARGET_EVENT_DAY'] = training_data['TARGET_EVENT_DAY'].replace(0.0, np.nan)
training_data['TARGET_EVENT_DAY_JULIAN'] = pd.to_datetime(
    training_data['TARGET_EVENT_DAY'], origin='julian', unit='D', errors='coerce'
)
training_data['TARGET_EVENT_DAY_DATETIME'] = pd.to_datetime(
    training_data['TARGET_EVENT_DAY_JULIAN'], errors='coerce'
)

# Same conversion for the two contract date columns; each gets a *_JULIAN and a
# *_DATETIME column, created in the same order as before.
for date_col in ('CONTRACT_DATE_OF_LOAN_AGREEMENT', 'CONTRACT_MATURITY_DATE'):
    training_data[f'{date_col}_JULIAN'] = pd.to_datetime(
        training_data[date_col], origin='julian', unit='D'
    )
    training_data[f'{date_col}_DATETIME'] = pd.to_datetime(training_data[f'{date_col}_JULIAN'])

In [5]:
# Whole days elapsed between the loan agreement and the target event
# (NaN where no event day is recorded).
training_data['DAY_DIFF'] = (
    training_data['TARGET_EVENT_DAY_DATETIME']
    .sub(training_data['CONTRACT_DATE_OF_LOAN_AGREEMENT_DATETIME'])
    .dt.days
)

In [6]:
def create_binary_target_column(dataframe, column_name, event, day_diff_upper_limit):
    """Add a 0/1 column flagging rows where `event` happened within a day window.

    A row is flagged 1 when its 'TARGET_EVENT' equals `event` and its 'DAY_DIFF'
    lies in the closed range [0, day_diff_upper_limit]; every other row
    (including rows with a missing 'DAY_DIFF') is flagged 0.

    Args:
        dataframe: Frame with 'TARGET_EVENT' and 'DAY_DIFF' columns; modified in place.
        column_name: Name of the column to create or overwrite.
        event: Event code to match against 'TARGET_EVENT'.
        day_diff_upper_limit: Inclusive upper bound for 'DAY_DIFF'.

    Returns:
        None. The new column is written into `dataframe`.
    """
    is_event = dataframe['TARGET_EVENT'].eq(event)
    # Series.between is inclusive on both ends and False for NaN.
    in_window = dataframe['DAY_DIFF'].between(0, day_diff_upper_limit)
    dataframe[column_name] = np.where(is_event & in_window, 1, 0)

# Label column -> inclusive window length in days after the agreement date.
timeframes = dict(
    TARGET_EVENT_BINARY_2Y=730,
    TARGET_EVENT_BINARY_1Y=365,
    TARGET_EVENT_BINARY_6M=365 // 2,
)

# One 0/1 label column per window, all for event code 'K'.
for column_name, days in timeframes.items():
    create_binary_target_column(
        dataframe=training_data,
        column_name=column_name,
        event='K',
        day_diff_upper_limit=days,
    )


# target_event_binary_columns = []
# for i in range(6):
#     start_day = 273 + i * 30
#     column_header = 'TARGET_EVENT_BINARY_' + str(start_day) + 'D'
#     training_data[column_header] = np.where(
#         (training_data['TARGET_EVENT'] == 'K') & 
#         (training_data['DAY_DIFF'] <= start_day) & 
#         (training_data['DAY_DIFF'] >= 0), 
#         1, 
#         0
#     )
#     target_event_binary_columns.append(column_header)

# DAY_DIFF was only needed to build the windowed labels above.
training_data.drop(columns=['DAY_DIFF'], inplace=True)

In [7]:
# Class balance of the unrestricted label followed by the windowed labels.
for target_col in (
    'TARGET_EVENT_BINARY',
    'TARGET_EVENT_BINARY_2Y',
    'TARGET_EVENT_BINARY_1Y',
    'TARGET_EVENT_BINARY_6M',
):
    print(training_data[target_col].value_counts())

TARGET_EVENT_BINARY
0    1590792
1      10871
Name: count, dtype: int64
TARGET_EVENT_BINARY_2Y
0    1591751
1       9912
Name: count, dtype: int64
TARGET_EVENT_BINARY_1Y
0    1596927
1       4736
Name: count, dtype: int64
TARGET_EVENT_BINARY_6M
0    1601267
1        396
Name: count, dtype: int64


In [8]:
# Case-insensitive name fragments that disqualify a column from being a feature.
excluded_keywords = ['TARGET', 'event', 'binary', 'DATE']
numeric_columns = training_data.select_dtypes(include='number').columns.tolist()

# Keep the numeric columns whose name contains none of the excluded fragments.
X_columns = [
    col
    for col in numeric_columns
    if not any(keyword.lower() in col.lower() for keyword in excluded_keywords)
]
y_column = 'TARGET_EVENT_BINARY_2Y'
X_columns

['CONTRACT_CREDIT_INTERMEDIARY',
 'CONTRACT_CREDIT_LOSS',
 'CONTRACT_CURRENCY',
 'CONTRACT_DEPT_SERVICE_TO_INCOME',
 'CONTRACT_INCOME',
 'CONTRACT_INSTALMENT_AMOUNT',
 'CONTRACT_INSTALMENT_AMOUNT_2',
 'CONTRACT_INTEREST_PERIOD',
 'CONTRACT_INTEREST_RATE',
 'CONTRACT_LGD',
 'CONTRACT_LOAN_AMOUNT',
 'CONTRACT_LOAN_CONTRACT_TYPE',
 'CONTRACT_LOAN_TO_VALUE_RATIO',
 'CONTRACT_MARKET_VALUE',
 'CONTRACT_MORTGAGE_LENDING_VALUE',
 'CONTRACT_MORTGAGE_TYPE',
 'CONTRACT_REFINANCED',
 'CONTRACT_RISK_WEIGHTED_ASSETS',
 'CONTRACT_TYPE_OF_INTEREST_REPAYMENT',
 'BORROWER_BIRTH_YEAR',
 'BORROWER_CITIZENSHIP',
 'BORROWER_COUNTRY',
 'BORROWER_COUNTY',
 'BORROWER_TYPE_OF_SETTLEMENT']

In [None]:
def calculate_probabilities(data, column, time_factor, horizon=2):
    """Rescale the probabilities stored in `data[column]` to another time horizon.

    Thin wrapper around `calculate_probabilities_vec` that selects one column.

    Args:
        data: Mapping-like object (e.g. a DataFrame) indexed by `column`.
        column: Key of the probabilities to rescale.
        time_factor: Length of the period the input probabilities refer to.
        horizon: Length of the period the output should refer to, in the same
            unit as `time_factor`. Defaults to 2, the previously hard-coded value.

    Returns:
        Rescaled probabilities, same shape/type as `data[column]`.
    """
    return calculate_probabilities_vec(data[column], time_factor, horizon=horizon)


def calculate_probabilities_vec(data, time_factor, horizon=2):
    """Rescale event probabilities from one time horizon to another.

    Assumes a constant hazard rate: a probability p over `time_factor` periods
    implies the rate lambda = -ln(1 - p) / time_factor, and the probability over
    `horizon` periods is 1 - exp(-horizon * lambda).

    Args:
        data: Probabilities in [0, 1] (numpy array, pandas Series or scalar).
        time_factor: Length of the period the input probabilities refer to.
        horizon: Length of the target period. Defaults to 2, the previously
            hard-coded value.

    Returns:
        Rescaled probabilities, same shape/type as `data`.
    """
    # log1p / expm1 keep precision for very small probabilities, where
    # log(1 - p) and 1 - exp(-x) would round to exactly 0.
    lambdas = -np.log1p(-data) / time_factor
    return -np.expm1(-horizon * lambdas)
# def adjust_probabilities(data, factor):
#     odds = data / (1 - data)  # Convert probabilities to odds
#     new_odds = odds * factor  # Scale the odds by the factor

#     # Convert the new odds back to probabilities
#     new_probabilities = new_odds / (1 + new_odds)
#     return new_probabilities


In [18]:
def combined_probability(s):
    """Probability that at least one of several independent events occurs.

    Args:
        s: pandas Series of probabilities.

    Returns:
        For exactly two values the inclusion-exclusion form p1 + p2 - p1 * p2;
        for any other length 1 - prod(1 - p).
    """
    probs = s.values
    if len(s) != 2:
        return 1 - np.prod(1 - probs)
    first, second = probs
    return first + second - first * second

def create_submission_file(df_preds, target, example, filename='submission.csv', testing=False,
                           combine_fn=None, expected_contract_rows=1564601,
                           expected_borrower_rows=1117674):
    """Aggregate contract-level predictions per borrower and write a submission CSV.

    Rows of `df_preds` are restricted to the borrowers listed in `example`, the
    `target` values of each borrower are reduced to a single probability with
    `combine_fn`, and the result is returned as a two-column frame
    ('BORROWER_ID', 'PRED').

    Args:
        df_preds: Frame with a 'BORROWER_ID' column and the `target` column.
        target: Name of the column holding the per-contract probabilities.
        example: Frame whose 'BORROWER_ID' column lists the borrowers to keep.
        filename: Output CSV path; nothing is written when it is None.
        testing: When True, skip both the row-count warnings and the file write.
        combine_fn: Reducer applied to each borrower's Series of probabilities.
            Defaults to `combined_probability` defined in this notebook.
        expected_contract_rows: Row count expected after filtering; a mismatch
            only prints a warning.
        expected_borrower_rows: Row count expected in the submission; a mismatch
            only prints a warning.

    Returns:
        The submission DataFrame with columns 'BORROWER_ID' and 'PRED'.
    """
    import os

    if combine_fn is None:
        combine_fn = combined_probability

    # Keep only the borrowers that appear in the submission example.
    filtered_training_data = df_preds[df_preds['BORROWER_ID'].isin(example['BORROWER_ID'])]

    if not testing and len(filtered_training_data) != expected_contract_rows:
        print('WARNING: The filtered data does not have the correct number of rows. Make sure you are not using the training data for submission.')

    # Experiment note: cutting the tails of the predictions before grouping
    # did not seem to work, so the probabilities are aggregated as they are.
    grouped_data = filtered_training_data.groupby('BORROWER_ID')[target].apply(combine_fn).reset_index()

    df_submission = pd.DataFrame()
    df_submission['BORROWER_ID'] = grouped_data['BORROWER_ID']
    df_submission['PRED'] = grouped_data[target]
    print('Before centering:')
    print(df_submission['PRED'].max(), df_submission['PRED'].min(), df_submission['PRED'].mean())

    # Re-centering of the mean prediction is currently disabled, so the second
    # summary repeats the first; both prints are kept so the cell output is unchanged.
    print('Centering probabilities...')
    print(df_submission['PRED'].max(), df_submission['PRED'].min(), df_submission['PRED'].mean())

    if not testing and filename is not None:
        # Create the target directory first so a fresh checkout does not fail here.
        if isinstance(filename, (str, os.PathLike)):
            parent_dir = os.path.dirname(filename)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        df_submission.to_csv(filename, index=False)
        # Report the save only when a file was actually written.
        print(f'Saved file: {filename}')

    if not testing and len(df_submission) != expected_borrower_rows:
        print('WARNING: The submission file does not have the correct number of rows. Make sure you are not using the training data for submission.')

    return df_submission


In [12]:
# Despite the name, this selects every contract agreed before 2016-01-01.
contracts_2016 = training_data.loc[
    training_data['CONTRACT_DATE_OF_LOAN_AGREEMENT_DATETIME'] < '2016-01-01'
]

In [13]:
# Share of contracts agreed before 2016-01-01 that carry TARGET_EVENT_BINARY == 1.
training_data.loc[
    training_data['CONTRACT_DATE_OF_LOAN_AGREEMENT_DATETIME'] < '2016-01-01',
    'TARGET_EVENT_BINARY',
].mean()

0.011345816067598232

In [14]:
# Contracts from the pre-2016 cohort that have the binary target set.
defaults_2016 = contracts_2016.loc[contracts_2016['TARGET_EVENT_BINARY'].eq(1)]

# Events dated before 2017-01-01.
defaults_2015_2017 = defaults_2016.loc[
    lambda d: d['TARGET_EVENT_DAY_DATETIME'] < '2017-01-01'
]
# Events dated strictly between 2016-01-01 and 2018-01-01.
defaults_2016_2018 = defaults_2016.loc[
    lambda d: (d['TARGET_EVENT_DAY_DATETIME'] > '2016-01-01')
    & (d['TARGET_EVENT_DAY_DATETIME'] < '2018-01-01')
]

In [15]:
# Both ratios use the full pre-2016 cohort as the denominator.
print(f"Defaulted in their 1st and 2nd year: {defaults_2015_2017.shape[0] / contracts_2016.shape[0]}")
print(f"Defaulted in their 2nd and 3rd year: {defaults_2016_2018.shape[0] / contracts_2016.shape[0]}")

Defaulted in their 1st and 2nd year: 0.006340839129746867
Defaulted in their 2nd and 3rd year: 0.0106268137784071


In [None]:
# 2016 h1 0.007: {"Logloss": 0.07576952484866101, "ROCAUC": 0.5809069048130779} | 0.0148;
# 2016 h1 0.004: {"Logloss": 0.0757328275890477, "ROCAUC": 0.5808103518691253} | 0.0148;
# 2016 h1 0.0055:{"Logloss": 0.07572683759226627, "ROCAUC": 0.5809141837708658} | 0.0148;
# 2016 h1 0.005: {"Logloss": 0.07572207835633597, "ROCAUC": 0.5809090905637126} | 0.0148; MEAN FOR 2016 H1 : 0.005

# 2016 h2 0.006: {"Logloss": 0.07467445419709652, "ROCAUC": 0.6075441905087731}
# 2016 h2 0.007: {"Logloss": 0.07472849170280915, "ROCAUC": 0.6075493119741427}
# 2016 h2 0.0065:{"Logloss": 0.07469789814688527, "ROCAUC": 0.6075423785066949}
# 2016 h2 0.0055:{"Logloss": 0.074659309734335, "ROCAUC": 0.607561030499619} FURTHER MEAN TESTING NEEDED

# 2017 h1 0.0080:{"Logloss": 0.07441192921590624, "ROCAUC": 0.6164629471570343}
# 2017 h1 0.0070:{"Logloss": 0.07442699304501614, "ROCAUC": 0.6167566473624233}
# 2017 h1 0.0085:{"Logloss": 0.07441147699155433, "ROCAUC": 0.6164700930096506}

# 2017 h2 0.0095:{"Logloss": 0.07421256516493982, "ROCAUC": 0.6261174064771263}
# 2017 h2 0.0105:{"Logloss": 0.07420875574104989, "ROCAUC": 0.6251744099377549}

In [61]:

def _cohort_mask(start, end):
    """Rows without the binary target whose agreement date lies in [start, end).

    The interval is half-open so consecutive cohorts share a boundary without a
    gap: with strict comparisons on both sides, a date equal to a boundary
    matched no cohort and silently kept probability 0.
    """
    agreement_date = training_data['CONTRACT_DATE_OF_LOAN_AGREEMENT_DATETIME']
    return (
        (agreement_date >= start)
        & (agreement_date < end)
        & (training_data['TARGET_EVENT_BINARY'] != 1)
    )


# Flat probability per agreement-date cohort. Rows outside every cohort, and rows
# with TARGET_EVENT_BINARY == 1, keep the initial value 0.
new_proba = np.zeros(len(training_data))

# Each mask is labelled one year after the agreement dates it selects; the "h1"
# windows run from 1 January to 1 June, the "h2" windows from 1 June to 1 January.
mask_2016_h1 = _cohort_mask('2015-01-01', '2015-06-01')
mask_2016_h2 = _cohort_mask('2015-06-01', '2016-01-01')
mask_2017_h1 = _cohort_mask('2016-01-01', '2016-06-01')
mask_2017_h2 = _cohort_mask('2016-06-01', '2017-01-01')
mask_2018_h1 = _cohort_mask('2017-01-01', '2017-06-01')
mask_2018_h2 = _cohort_mask('2017-06-01', '2018-01-01')

print('Adter scaling:')
# Cohort rates; several were picked from the experiment log in the cell above.
new_proba[mask_2016_h1] = 0.005
new_proba[mask_2016_h2] = 0.0055
new_proba[mask_2017_h1] = 0.0085
new_proba[mask_2017_h2] = 0.0105
new_proba[mask_2018_h1] = 0.0148
new_proba[mask_2018_h2] = 0.0158
print(new_proba.mean())

Adter scaling:
0.01034126829426665


In [62]:
# Store the per-cohort flat probabilities as a prediction column on the frame.
training_data['YEAR_SCALED_PROBA'] = new_proba

In [63]:
# Aggregate the cohort-based probabilities per borrower and write them to disk.
submission = create_submission_file(
    df_preds=training_data,
    target='YEAR_SCALED_PROBA',
    example=data_submission_example,
    filename='./predictions/2017-h2-95.csv',
)

Before centering:
0.27874487153015237 0.0050000000000000044 0.014561668498440646
Centering probabilities...
0.27874487153015237 0.0050000000000000044 0.014561668498440646
Saved file: ./predictions/2017-h2-95.csv


In [22]:
# Constant-prediction baseline: every borrower gets the same probability.
# Scores recorded for two constants:
# 1.48% {"Logloss":   0.0757300655777955, "ROCAUC": 0.5}
# 1.4477% {"Logloss": 0.0757266853599001, "ROCAUC": 0.5}
submission = submission.assign(PRED=0.0148)
submission.to_csv('./predictions/148.csv', index=False)