In [105]:
import os
import pandas as pd 
import glob
import random 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import math

In [106]:
# Raise pandas' display limits (up to 200 columns / 680 rows) for frames shown in this notebook.
pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',680)

## Read in loan outcomes: 

In [132]:
# Load per-loan outcome flags; new_account_ids is forced to object dtype so it is not parsed as a number.
# NOTE(review): the head() output below already shows these ids in scientific notation
# (e.g. 5.61348043746e+16), so precision looks lost in the CSV itself — confirm upstream.
outcomes = pd.read_csv('loan_outcomes_33_150.csv', dtype = {'new_account_ids': 'object'})

In [133]:
outcomes.shape

(344, 4)

In [134]:
outcomes.head()

Unnamed: 0,m_ids_owner,new_account_ids,paid_back_in_150_days,every_payment_under_33_days
0,65134441430300000,5.61348043746e+16,1.0,1.0
1,60134547419200000,6.01374307109e+16,0.0,0.0
2,63134425702500000,6.31367405497e+16,0.0,1.0
3,64134429266300000,8.91364591805e+16,1.0,1.0
4,65134432186900000,1.051365401401e+17,1.0,1.0


In [135]:
len(outcomes.m_ids_owner.unique())

139

## First categorize loans:

In [136]:
outcomes.shape[1]

4

In [137]:
# outcomes.insert(outcomes.shape[1],"loan_cat", np.zeros(outcomes.shape[0]))

In [138]:
outcomes.dtypes

m_ids_owner                     object
new_account_ids                 object
paid_back_in_150_days          float64
every_payment_under_33_days    float64
dtype: object

In [139]:
# Reorder columns so the regularity flag comes before the paid-back flag (same four columns, no rows dropped).
outcomes = outcomes[['m_ids_owner', 'new_account_ids', 'every_payment_under_33_days', 
       'paid_back_in_150_days']]

In [140]:
outcomes.head()

Unnamed: 0,m_ids_owner,new_account_ids,every_payment_under_33_days,paid_back_in_150_days
0,65134441430300000,5.61348043746e+16,1.0,1.0
1,60134547419200000,6.01374307109e+16,0.0,0.0
2,63134425702500000,6.31367405497e+16,1.0,0.0
3,64134429266300000,8.91364591805e+16,1.0,1.0
4,65134432186900000,1.051365401401e+17,1.0,1.0


In [141]:
outcomes.shape

(344, 4)

## This creates a binary outcome per loan: 

Each loan gets a score of 1 if it meets either of our criteria (paid back within the payback window, or every payment made within the regularity threshold), and 0 otherwise.


In [142]:
def score_loan_binary(outcomes, x_days, thresh):
    """Return a copy of ``outcomes`` with a 0/1 ``binary_loan_score`` column appended.

    A loan scores 1.0 when it was paid back within ``x_days`` OR every payment
    was made within ``thresh`` days; otherwise it scores 0.0.

    Parameters
    ----------
    outcomes : pandas.DataFrame
        Must contain the columns ``paid_back_in_<x_days>_days`` and
        ``every_payment_under_<thresh>_days``. A value equal to 1 counts as
        "criterion met"; anything else (including NaN) counts as not met.
    x_days : int or str
        Payback window used to build the first column name (e.g. 150).
    thresh : int or str
        Regularity threshold used to build the second column name (e.g. 33).

    Returns
    -------
    pandas.DataFrame
        A copy of ``outcomes`` with a float ``binary_loan_score`` column added
        as the last column. The input frame is not modified.

    Raises
    ------
    KeyError
        If either flag column is missing.
    ValueError
        If ``binary_loan_score`` already exists (raised by ``DataFrame.insert``).
    """
    key_x_days = 'paid_back_in_' + str(x_days) + '_days'
    key_thresh = 'every_payment_under_' + str(thresh) + '_days'

    new_outcomes = outcomes.copy()

    # Evaluate both criteria column-wise. This is positional, so it scores each
    # row independently even when the index has repeated labels (a row-by-row
    # ``.at[label, ...]`` write is label-based and cannot address such rows
    # individually).
    meets_either = (new_outcomes[key_x_days] == 1) | (new_outcomes[key_thresh] == 1)

    # ``.values`` drops the index so the insert is purely positional.
    new_outcomes.insert(
        new_outcomes.shape[1],
        "binary_loan_score",
        meets_either.astype(float).values,
    )
    return new_outcomes

In [143]:
# Score each loan using the paid_back_in_150_days and every_payment_under_33_days flags.
new_outcomes = score_loan_binary(outcomes, 150, 33)

In [144]:
new_outcomes.binary_loan_score.value_counts()

1.0    278
0.0     66
Name: binary_loan_score, dtype: int64

In [145]:
outcomes.shape

(344, 4)

In [146]:
new_outcomes.shape

(344, 5)

In [147]:
# Keep an independent copy of the binary-scored frame under a descriptive name.
binary_outcomes = new_outcomes.copy()

## This creates a scaled score per loan, weighting payment regularity against full payback via `weight_regularity`:

In [148]:
def score_loan_continuous(outcomes, x_days, thresh, weight_regularity = 0.5):
    """Return a copy of ``outcomes`` with a weighted ``continuous_loan_score`` column appended.

    Scoring per loan:

    * paid back in ``x_days`` AND every payment under ``thresh`` days -> 1
    * paid back only                                                  -> ``1 - weight_regularity``
    * regular payments only                                           -> ``weight_regularity``
    * neither                                                         -> 0

    Parameters
    ----------
    outcomes : pandas.DataFrame
        Must contain the columns ``paid_back_in_<x_days>_days`` and
        ``every_payment_under_<thresh>_days``. A value equal to 1 counts as
        "criterion met"; anything else (including NaN) counts as not met.
    x_days : int or str
        Payback window used to build the first column name (e.g. 150).
    thresh : int or str
        Regularity threshold used to build the second column name (e.g. 33).
    weight_regularity : float, default 0.5
        Score given to a loan that was regular but not paid back in time; the
        paid-back-only score is its complement.

    Returns
    -------
    pandas.DataFrame
        A copy of ``outcomes`` with ``continuous_loan_score`` added as the last
        column. The input frame is not modified.

    Raises
    ------
    KeyError
        If either flag column is missing.
    ValueError
        If ``continuous_loan_score`` already exists (raised by ``DataFrame.insert``).
    """
    weight_paidback = 1 - weight_regularity

    key_x_days = 'paid_back_in_' + str(x_days) + '_days'
    key_thresh = 'every_payment_under_' + str(thresh) + '_days'

    new_outcomes = outcomes.copy()

    # Both flags are looked up through the derived keys, so the regularity
    # column honours ``thresh`` rather than being fixed to the 33-day column.
    paid_back = (new_outcomes[key_x_days] == 1).values
    regular = (new_outcomes[key_thresh] == 1).values

    # np.select takes the first matching condition, so "both" must come first.
    # Using the literal 1.0 for "both" avoids float error from summing weights.
    scores = np.select(
        [paid_back & regular, paid_back, regular],
        [1.0, weight_paidback, weight_regularity],
        default=0.0,
    )

    new_outcomes.insert(new_outcomes.shape[1], "continuous_loan_score", scores)
    return new_outcomes

In [149]:
# Add the weighted score on top of the binary-scored frame (regularity and payback weighted 0.5 each),
# so cont_outcomes carries both binary_loan_score and continuous_loan_score.
cont_outcomes = score_loan_continuous(binary_outcomes, 150, 33, 0.5)

In [150]:
cont_outcomes.shape

(344, 6)

In [151]:
cont_outcomes.head()

Unnamed: 0,m_ids_owner,new_account_ids,every_payment_under_33_days,paid_back_in_150_days,binary_loan_score,continuous_loan_score
0,65134441430300000,5.61348043746e+16,1.0,1.0,1.0,1.0
1,60134547419200000,6.01374307109e+16,0.0,0.0,0.0,0.0
2,63134425702500000,6.31367405497e+16,1.0,0.0,1.0,0.5
3,64134429266300000,8.91364591805e+16,1.0,1.0,1.0,1.0
4,65134432186900000,1.051365401401e+17,1.0,1.0,1.0,1.0


## This creates outcomes on a per-person basis: 

In [152]:
# Empty placeholder; this name is rebound to the result of create_perperson_outcomes further down.
outcomes_perperson = pd.DataFrame()

In [159]:
def create_perperson_outcomes(outcomes_total, binary_or_cont = 'binary', good_thresh = 0.75, bad_thresh = 0.25):
    """Aggregate per-loan scores into one row per owner.

    For each distinct ``m_ids_owner`` (in order of first appearance) the mean of
    that owner's loan scores is computed and then bucketed into a label.

    Parameters
    ----------
    outcomes_total : pandas.DataFrame
        Per-loan frame with ``m_ids_owner``, ``new_account_ids`` and the score
        column selected by ``binary_or_cont``.
    binary_or_cont : str, default 'binary'
        ``'binary'`` averages ``binary_loan_score``; any other value averages
        ``continuous_loan_score``.
    good_thresh : float, default 0.75
        Owners with a mean score >= this value get ``binarize_score`` 1.
    bad_thresh : float, default 0.25
        Owners with a mean score <= this value get ``binarize_score`` 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``m_ids_owner``, ``loans`` (list of that owner's
        ``new_account_ids``), ``owner_score`` (mean loan score) and
        ``binarize_score`` (1 = good, 0 = bad, 999 = in between or NaN score).
        An input with no rows yields an empty frame with these columns.
    """
    # The two modes differ only in which score column is averaged.
    if binary_or_cont == 'binary':
        score_column = 'binary_loan_score'
    else:
        score_column = 'continuous_loan_score'

    # Collect plain records and build the frame once: growing a DataFrame row by
    # row is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for m_ids_owner in outcomes_total.m_ids_owner.unique():
        # All loans belonging to this owner.
        owner_df = outcomes_total.loc[outcomes_total['m_ids_owner'] == m_ids_owner]
        records.append({
            'm_ids_owner': m_ids_owner,
            'loans': list(owner_df.new_account_ids),
            'owner_score': owner_df[score_column].mean(),
        })

    # Explicit columns keep the schema stable even when there are no owners.
    outcomes_perperson = pd.DataFrame(records, columns=['m_ids_owner', 'loans', 'owner_score'])

    # 999 marks owners between the two thresholds (NaN scores fail both
    # comparisons and therefore also land here).
    outcomes_perperson['binarize_score'] = [
        1 if score >= good_thresh else 0 if score <= bad_thresh else 999
        for score in outcomes_perperson.owner_score
    ]

    return outcomes_perperson



In [160]:
# Aggregate per owner using binary_loan_score and the default thresholds (good >= 0.75, bad <= 0.25).
outcomes_perperson = create_perperson_outcomes(cont_outcomes, 'binary')

In [163]:
outcomes_perperson.binarize_score.value_counts()

1      90
999    25
0      24
Name: binarize_score, dtype: int64

In [166]:
# Persist the per-owner table without the row index.
# NOTE(review): the 'loans' column holds Python lists, which will be written as text — confirm how downstream readers parse it.
outcomes_perperson.to_csv('individual_outcomes_33_150.csv', index = False)