In [1]:
# Importing the necessary libraries for data manipulation and reading
import pandas as pd
# Importing necessary libraries for logistic regression and scaling
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import log_loss
# Load the data
training_data = pd.read_csv('training_data.csv')

# Drop rows where BORROWER_ID is 'xNullx'
training_data = training_data[training_data['BORROWER_ID'] != 'xNullx']

# Shuffle the DataFrame
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

data_submission_example = pd.read_csv('data_submission_example.csv')


In [None]:
lognormal_variables = [
    'CONTRACT_CREDIT_LOSS', 'CONTRACT_DEPT_SERVICE_TO_INCOME',
    'CONTRACT_INCOME', 'CONTRACT_INSTALMENT_AMOUNT', 'CONTRACT_INSTALMENT_AMOUNT_2',
    'CONTRACT_LOAN_AMOUNT', 'CONTRACT_MARKET_VALUE', 'CONTRACT_MORTGAGE_LENDING_VALUE',
]

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Assuming training_data =  training_data.sample(n=10000, random_state=1)

# Filling NaN values with 0
training_data.fillna(0, inplace=True)

# Converting columns to numeric where possible
for col in training_data.columns:
    try:
        training_data[col] = pd.to_numeric(training_data[col], errors='ignore')
    except:
        continue

# Creating a sample target variable
training_data['TARGET_EVENT_BINARY'] = np.where(training_data['TARGET_EVENT'] == 'K', 1, 0)


In [3]:
training_data['TARGET_EVENT_DAY'] = pd.to_datetime(training_data['TARGET_EVENT_DAY'])
training_data['CONTRACT_DATE_OF_LOAN_AGREEMENT'] = pd.to_datetime(training_data['CONTRACT_DATE_OF_LOAN_AGREEMENT'])

# Calculate the day difference
training_data['DAY_DIFF'] = (training_data['TARGET_EVENT_DAY'] - training_data['CONTRACT_DATE_OF_LOAN_AGREEMENT']).dt.days

# Create TARGET_EVENT_BINARY_2Y based on conditions
training_data['TARGET_EVENT_BINARY_2Y'] = np.where(
    (training_data['TARGET_EVENT'] == 'K') & 
    (training_data['DAY_DIFF'] <= 730) & 
    (training_data['DAY_DIFF'] >= 0), 
    1, 
    0
)

# Drop the temporary 'DAY_DIFF' column if needed
training_data.drop('DAY_DIFF', axis=1, inplace=True)

In [4]:
# Identify numeric columns
numeric_columns = training_data.select_dtypes(include=[np.number]).columns.tolist()

# Identify target columns that shouldn't be in the X variables
excluded_keywords = ['TARGET', 'event', 'binary']

# Create lists for X variable columns and target column
X_columns = [col for col in numeric_columns if all(keyword.lower() not in col.lower() for keyword in excluded_keywords)]
y_column = 'TARGET_EVENT_BINARY_2Y' 

In [5]:
threshold = 0.85  # Set your own threshold
correlation_matrix = training_data[X_columns].corr()
# Get pairs of highly correlated features
highly_correlated_set = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            colname = correlation_matrix.columns[i]
            highly_correlated_set.add(colname)

# Remove highly correlated features from X_columns
X_columns = [col for col in X_columns if col not in highly_correlated_set]
print('Variables removed:', highly_correlated_set)

Variables removed: {'BORROWER_COUNTRY', 'CONTRACT_REFINANCED'}


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, log_loss, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

def train_and_predict_two_halves(df, variables, target, model1=LogisticRegression(), model2=LogisticRegression(), scaler=StandardScaler()):
    # Split the dataframe into two halves
    half = len(df) // 2
    df1, df2 = df.iloc[:half], df.iloc[half:]

    # Scale the entire dataset using a single scaler
    X_scaled = scaler.fit_transform(df[variables])
    X1_scaled = X_scaled[:half]
    X2_scaled = X_scaled[half:]
    
    # Prepare target variables
    y1 = df1[target]
    y2 = df2[target]
    
    # Train model1 on df1 and get probabilities on df2
    model1.fit(X1_scaled, y1)
    y2_prob = model1.predict_proba(X2_scaled)[:, 1]
    
    # Train model2 on df2 and get probabilities on df1
    model2.fit(X2_scaled, y2)
    y1_prob = model2.predict_proba(X1_scaled)[:, 1]
    
    # Evaluate model1 on df2
    print("Evaluation of Model 1 on df2:")
    print("Classification Report:")
    print(classification_report(y2, y2_prob >= 0.5))
    print("Log Loss:")
    print(log_loss(y2, y2_prob))
    print("Confusion Matrix:")
    print(confusion_matrix(y2, y2_prob >= 0.5))

    # Evaluate model2 on df1
    print("Evaluation of Model 2 on df1:")
    print("Classification Report:")
    print(classification_report(y1, y1_prob >= 0.5))
    print("Log Loss:")
    print(log_loss(y1, y1_prob))
    print("Confusion Matrix:")
    print(confusion_matrix(y1, y1_prob >= 0.5))
    
    # Join the predicted probabilities
    joined_prob = np.concatenate([y1_prob, y2_prob])
    
    return joined_prob


In [7]:
import pandas as pd
import numpy as np

def combined_probability(s):
    return 1 - np.prod(1 - s.values)

def create_submission_file(df_preds, target, filename='submission.csv'):
    # Filter the data to only include BORROWER_IDs that are in the submission example
    filtered_training_data = df_preds[df_preds['BORROWER_ID'].isin(data_submission_example['BORROWER_ID'])]

    # Print warning if the row count is off
    if len(filtered_training_data) != 1564601:
        print('WARNING: The filtered data does not have the correct number of rows. Make sure you are not using the training data for submission.')
        raise ValueError('WARNING: The submission file does not have the correct number of rows. Make sure you are not using the training data for submission.')
    # Group by BORROWER_ID and calculate the combined probability
    grouped_data = filtered_training_data.groupby('BORROWER_ID')[target].apply(combined_probability).reset_index()

    # Create the submission DataFrame
    df_submission = pd.DataFrame()
    df_submission['BORROWER_ID'] = grouped_data['BORROWER_ID']
    df_submission['PRED'] = grouped_data[target]
    print('Centering probabilities...')
    # Center the probabilities around 1.48%
    desired_mean = 0.0148  # 1.48% as a decimal
    while (df_submission['PRED'].max() > 1 or df_submission['PRED'].min() < 0 or abs(df_submission['PRED'].mean() -0.0148) > 0.0005):
        # print(df_submission['PRED'].max(), df_submission['PRED'].min(), df_submission['PRED'].mean())
        df_submission['PRED'] = df_submission['PRED'].clip(lower=0, upper=1)
        # print(df_submission['PRED'].max(), df_submission['PRED'].min(), df_submission['PRED'].mean())

        current_mean = df_submission['PRED'].mean()
        adjustment_factor = desired_mean  - current_mean
        df_submission['PRED'] += adjustment_factor
    print(df_submission['PRED'].max(), df_submission['PRED'].min(), df_submission['PRED'].mean())
    # Save the submission file
    df_submission.to_csv(filename, index=False)
    print(f'Saved file: {filename}')
    if abs(df_submission['PRED'].mean() -0.0148) > 0.0005:
       raise ValueError('WARNING: mean is bad')
        
    # Print warning if the row count is off
    if len(df_submission) != 1117674:
        print('WARNING: The submission file does not have the correct number of rows. Make sure you are not using the training data for submission.')
        raise ValueError('WARNING: The submission file does not have the correct number of rows. Make sure you are not using the training data for submission.')
        
    return df_submission


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, log_loss, confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
predicted_probs = 'LOGISTIC_REG'
probs = train_and_predict_two_halves(
    training_data, 
    X_columns, 
    y_column, 
    model1=LogisticRegression(max_iter=400, C=0.5, random_state=42),
    model2=LogisticRegression(max_iter=400, C=0.5, random_state=42),
)
training_data[predicted_probs] = probs
submission = create_submission_file(training_data, predicted_probs, filename='./predictions/logistic-regression-independent-centered-2y-exp-no-multicolinearity.csv')

Evaluation of Model 1 on df2:
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    795431
           1       0.37      0.05      0.09      5401

    accuracy                           0.99    800832
   macro avg       0.68      0.52      0.54    800832
weighted avg       0.99      0.99      0.99    800832

Log Loss:
0.027630236157242454
Confusion Matrix:
[[794990    441]
 [  5140    261]]
Evaluation of Model 2 on df1:
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    795363
           1       0.35      0.04      0.08      5468

    accuracy                           0.99    800831
   macro avg       0.67      0.52      0.54    800831
weighted avg       0.99      0.99      0.99    800831

Log Loss:
0.028111056006007516
Confusion Matrix:
[[794925    438]
 [  5231    237]]
Centering probabilities...
1.0 0.006901678874002306 0.014799999999999999

In [13]:
predicted_probs = 'RANDOM_FOREST'
probs = train_and_predict_two_halves(
    training_data, 
    X_columns, 
    y_column, 
    model1=RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    model2=RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
)
training_data[predicted_probs] = probs
submission = create_submission_file(training_data, predicted_probs, filename='./predictions/random-forrest-independent-centered-2y-exp-no-multicolinearity.csv')

Evaluation of Model 1 on df2:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    795431
           1       0.85      0.59      0.70      5401

    accuracy                           1.00    800832
   macro avg       0.92      0.80      0.85    800832
weighted avg       1.00      1.00      1.00    800832

Log Loss:
0.00934382490057665
Confusion Matrix:
[[794863    568]
 [  2201   3200]]
Evaluation of Model 2 on df1:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    795363
           1       0.84      0.57      0.68      5468

    accuracy                           1.00    800831
   macro avg       0.92      0.79      0.84    800831
weighted avg       1.00      1.00      1.00    800831

Log Loss:
0.00967592115718422
Confusion Matrix:
[[794784    579]
 [  2344   3124]]
Centering probabilities...
1.0 0.010638995202324 0.014800000000000006
Save

In [14]:
predicted_probs = 'GRADIENT_BOOSTING_CLASSIFIER'
probs = train_and_predict_two_halves(
    training_data, 
    X_columns, 
    y_column, 
    model1=GradientBoostingClassifier(random_state=42),
    model2=GradientBoostingClassifier(random_state=42),
)
training_data[predicted_probs] = probs
submission = create_submission_file(training_data, predicted_probs, filename='./predictions/gbc-independent-centered-2y-exp-no-multicolinearity.csv')

Evaluation of Model 1 on df2:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    795431
           1       0.70      0.65      0.68      5401

    accuracy                           1.00    800832
   macro avg       0.85      0.83      0.84    800832
weighted avg       1.00      1.00      1.00    800832

Log Loss:
0.011550904577582512
Confusion Matrix:
[[793952   1479]
 [  1878   3523]]
Evaluation of Model 2 on df1:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    795363
           1       0.71      0.64      0.67      5468

    accuracy                           1.00    800831
   macro avg       0.85      0.82      0.84    800831
weighted avg       1.00      1.00      1.00    800831

Log Loss:
0.012012593571029772
Confusion Matrix:
[[793918   1445]
 [  1967   3501]]
Centering probabilities...
1.0 0.0112039827624706 0.014799999999999999
S

In [12]:
import xgboost as xgb
predicted_probs = 'XGBOOST'
probs = train_and_predict_two_halves(
    training_data, 
    X_columns, 
    y_column, 
    model1= xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    model2= xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
)
training_data[predicted_probs] = probs
submission = create_submission_file(training_data, predicted_probs, filename='./predictions/xgboost-independent-centered-2y-exp-no-multicolinearity.csv')

Evaluation of Model 1 on df2:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    795431
           1       0.83      0.76      0.80      5401

    accuracy                           1.00    800832
   macro avg       0.92      0.88      0.90    800832
weighted avg       1.00      1.00      1.00    800832

Log Loss:
0.006836557697863962
Confusion Matrix:
[[794616    815]
 [  1285   4116]]
Evaluation of Model 2 on df1:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    795363
           1       0.83      0.76      0.79      5468

    accuracy                           1.00    800831
   macro avg       0.91      0.88      0.89    800831
weighted avg       1.00      1.00      1.00    800831

Log Loss:
0.0069370294212075705
Confusion Matrix:
[[794514    849]
 [  1332   4136]]
Centering probabilities...
1.0 0.012441159350281004 0.01480000000000000

In [13]:
df = training_data.sample(n=100, random_state=1)
len(df)

100

In [14]:
enumerate(df.groupby('BORROWER_ID'))

<enumerate at 0x1fe21d74090>

In [15]:
groups_by_size = {}

# Initialize a dictionary to hold covariance matrices by group size
cov_matrices_by_size = {}

# Group by 'BORROWER_ID' and iterate through the first 100 groups
for idx, (name, group) in enumerate(training_data.groupby('BORROWER_ID')):
    if idx % 100 == 0: 
        print('At index', idx)
        
    if idx == 50000:
        break
    
    # Sort the group by CONTRACT_DATE_OF_LOAN_AGREEMENT
    group = group.sort_values(by='CONTRACT_DATE_OF_LOAN_AGREEMENT')
    
    group_size = len(group)
    if group_size not in groups_by_size:
        groups_by_size[group_size] = []
        
    groups_by_size[group_size].append(group)

# Initialize a new dictionary to hold the merged DataFrames by size
merged_groups_by_size = {}

for size, dfs in groups_by_size.items():
    # Merge all DataFrames of the same size into a single DataFrame
    merged_df = pd.concat(dfs, ignore_index=True)
    # Store the merged DataFrame in the new dictionary
    merged_groups_by_size[size] = merged_df

# Now, merged_groups_by_size contains the merged DataFrames categorized by group size


In [20]:
import json
import numpy as np
import pandas as pd

# Your code to generate groups_by_size and merged_groups_by_size

cov_matrices_by_size = {}

for size, merged_df in merged_groups_by_size.items():
    if size > 1:  # Covariance matrix for single-element arrays doesn't make sense
        try:
            cov_matrix = np.cov(merged_df['TARGET_EVENT_BINARY'].values.reshape(size,-1))
            if not np.isnan(cov_matrix).any():  # Check for NaN values
                cov_matrices_by_size[size] = cov_matrix.tolist()  # Convert numpy array to list for JSON serialization
        except Exception as e:
            print(f"An error occurred while calculating the covariance matrix for size {size}: {e}")

# Ensure that the dictionary contains only JSON-serializable items
serializable_cov_matrices_by_size = {k: v.tolist() if isinstance(v, np.ndarray) else v for k, v in cov_matrices_by_size.items()}

# Save to JSON
with open('./data/cov_matrices_by_size.json', 'w') as f:
    json.dump(serializable_cov_matrices_by_size, f)


  cov_matrix = np.cov(merged_df['TARGET_EVENT_BINARY'].values.reshape(size,-1))
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [21]:
cov_matrices_by_size

{2: [[0.005868589752778426, -3.2357827075016176e-05],
  [-3.2357827075016176e-05, 0.005451716043799604]],
 3: [[0.007337144521720125, -2.6009019928111114e-05, -2.6009019928111046e-05],
  [-2.6009019928111114e-05, 0.003507502116019551, -1.2385247584814735e-05],
  [-2.6009019928111046e-05, -1.2385247584814735e-05, 0.003507502116019557]],
 4: [[0.0023364421983921592,
   -1.3695440787761734e-05,
   -5.478176315104693e-06,
   -8.217264472657053e-06],
  [-1.3695440787761734e-05,
   0.005820562334798763,
   -1.3695440787761748e-05,
   -2.054316118164268e-05],
  [-5.478176315104693e-06,
   -1.3695440787761748e-05,
   0.0023364421983921558,
   -8.217264472657026e-06],
  [-8.217264472657053e-06,
   -2.054316118164268e-05,
   -8.217264472657026e-06,
   0.003500554665351909]],
 8: [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 

In [34]:
# Save to JSON
import json

with open('cov_matrices_by_size.json', 'w') as f:
    json.dump(cov_matrices_by_size, f)