In [1]:
import os
import pandas as pd 
import glob
import random 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import math

In [2]:
# Widen the pandas display limits so wide/long frames are shown without truncation.
display_limits = {
    'display.max_columns': 200,
    'display.max_rows': 680,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

## Read in outcomes: 

In [3]:
# Load the per-loan outcome table. `new_account_ids` is read with dtype 'object'
# so pandas does not convert the IDs to a numeric dtype.
outcomes_csv_path = 'everyPaymentUnder33_paidWithin150.csv'
outcomes = pd.read_csv(
    outcomes_csv_path,
    dtype={'new_account_ids': 'object'},
)

In [4]:
outcomes.shape

(106, 4)

In [5]:
outcomes.head()

Unnamed: 0,every_payment_under_33_days,m_ids_owner,new_account_ids,paid_back_in_150_days
0,0.0,60134547419200000,6.01374307109e+16,0.0
1,1.0,63134425702500000,6.31367405497e+16,0.0
2,1.0,65134432186900000,1.051365401401e+17,1.0
3,0.0,65134442822400000,5.61347619278e+16,0.0
4,0.0,61134406652800000,1.121372833516e+17,0.0


In [6]:
outcomes

Unnamed: 0,every_payment_under_33_days,m_ids_owner,new_account_ids,paid_back_in_150_days
0,0.0,60134547419200000,6.01374307109e+16,0.0
1,1.0,63134425702500000,6.31367405497e+16,0.0
2,1.0,65134432186900000,1.051365401401e+17,1.0
3,0.0,65134442822400000,5.61347619278e+16,0.0
4,0.0,61134406652800000,1.121372833516e+17,0.0
5,0.0,57134407687800000,5.71346491434e+16,0.0
6,0.0,56134397318700000,1.051370493199e+17,0.0
7,1.0,57134320447800000,5.71365578585e+16,0.0
8,0.0,57134339147100000,5.71347027862e+16,0.0
9,1.0,61134406243400000,6.11347705996e+16,0.0


In [7]:
len(outcomes.m_ids_owner.unique())

58

## First categorize loans:

In [8]:
# Add a zero-filled float column `loan_cat` that later cells fill in.
# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on re-execution; reset the column instead in that case.
if 'loan_cat' in outcomes.columns:
    outcomes['loan_cat'] = np.zeros(outcomes.shape[0])
else:
    outcomes.insert(4, "loan_cat", np.zeros(outcomes.shape[0]))

In [9]:
outcomes.dtypes

every_payment_under_33_days    float64
m_ids_owner                     object
new_account_ids                 object
paid_back_in_150_days           object
loan_cat                       float64
dtype: object

In [10]:
# Keep only the rows whose 150-day outcome is not the 'too_few_borrowings' marker.
has_outcome = outcomes['paid_back_in_150_days'].ne('too_few_borrowings')
outcomes_allborrowings = outcomes[has_outcome]

In [11]:
# Cast the 150-day outcome flag to float64, then show the resulting column dtypes.
outcomes_allborrowings = outcomes_allborrowings.assign(
    paid_back_in_150_days=outcomes_allborrowings['paid_back_in_150_days'].astype('float64')
)
outcomes_allborrowings.dtypes

every_payment_under_33_days    float64
m_ids_owner                     object
new_account_ids                 object
paid_back_in_150_days          float64
loan_cat                       float64
dtype: object

In [12]:
outcomes_allborrowings.columns

Index(['every_payment_under_33_days', 'm_ids_owner', 'new_account_ids',
       'paid_back_in_150_days', 'loan_cat'],
      dtype='object')

In [13]:
# Reorder columns: identifiers first, then the two outcome flags, then the category.
column_order = [
    'm_ids_owner',
    'new_account_ids',
    'every_payment_under_33_days',
    'paid_back_in_150_days',
    'loan_cat',
]
outcomes_allborrowings = outcomes_allborrowings.loc[:, column_order]

In [14]:
outcomes_allborrowings.head()

Unnamed: 0,m_ids_owner,new_account_ids,every_payment_under_33_days,paid_back_in_150_days,loan_cat
0,60134547419200000,6.01374307109e+16,0.0,0.0,0.0
1,63134425702500000,6.31367405497e+16,1.0,0.0,0.0
2,65134432186900000,1.051365401401e+17,1.0,1.0,0.0
3,65134442822400000,5.61347619278e+16,0.0,0.0,0.0
4,61134406652800000,1.121372833516e+17,0.0,0.0,0.0


In [15]:
outcomes_allborrowings.shape

(94, 5)

In [33]:
# Disabled alternative scoring (kept for reference, not executed): assigns a
# graded loan_cat of 1 / 0.4 / 0.6 / 0 from the two outcome flags, instead of
# the 0-or-1 value assigned by the active loop in the next cell.
# for idx, row in outcomes_allborrowings.iterrows():
#     if row['paid_back_in_150_days'] == 1:
#         if row['every_payment_under_33_days'] == 1: 
#             outcomes_allborrowings.at[idx,'loan_cat'] = 1
#         else: 
#             outcomes_allborrowings.at[idx,'loan_cat'] = 0.4
#     else: 
#         if row['every_payment_under_33_days'] == 1: 
#             outcomes_allborrowings.at[idx,'loan_cat'] = 0.6
#         else: 
#             outcomes_allborrowings.at[idx,'loan_cat'] = 0

In [45]:
# Categorize each loan: loan_cat is 1 when the loan was paid back within 150 days
# OR every payment was made in under 33 days, and 0 otherwise.
# Vectorized replacement for the row-by-row iterrows()/.at loop; the two branches
# that both assigned 1 collapse into a single boolean OR. Missing flags compare
# as False, exactly as `row[...] == 1` did in the loop.
paid_back_in_time = outcomes_allborrowings['paid_back_in_150_days'] == 1
always_paid_promptly = outcomes_allborrowings['every_payment_under_33_days'] == 1

# .assign returns a new frame, so the cell is safe to re-run and does not write
# into a frame that was derived by slicing another one.
outcomes_allborrowings = outcomes_allborrowings.assign(
    loan_cat=(paid_back_in_time | always_paid_promptly).astype('float64')
)

In [46]:
outcomes_allborrowings.loan_cat.value_counts()

1.0    57
0.0    37
Name: loan_cat, dtype: int64

In [47]:
outcomes_perperson = pd.DataFrame()

In [48]:
# Build one summary row per owner: the list of their loans and the mean of
# loan_cat over those loans (the share of loans categorized as 1).
# Rows are collected in a plain list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# re-copied the whole frame on every iteration.
owner_records = []
for m_ids_owner in outcomes_allborrowings.m_ids_owner.unique():

    # take all loans for that person
    owner_df = outcomes_allborrowings.loc[outcomes_allborrowings['m_ids_owner'] == m_ids_owner]

    # calc mean of loan paying
    owner_score = owner_df.loan_cat.mean()

    owner_records.append({
        'loans': list(owner_df.new_account_ids),
        'm_ids_owner': m_ids_owner,
        'owner_score': owner_score,
    })

# Explicit column list keeps the column order shown in the cell output and
# guarantees the columns exist even when there are no owners at all.
outcomes_perperson = pd.DataFrame(owner_records, columns=['loans', 'm_ids_owner', 'owner_score'])

In [49]:
outcomes_perperson

Unnamed: 0,loans,m_ids_owner,owner_score
0,"[60137430710900000.1, 60134978362300000.1]",60134547419200000,0.5
1,[63136740549700000.1],63134425702500000,1.0
2,"[105136540140100000.1, 105136540140100000.2]",65134432186900000,1.0
3,"[56134761927800000.1, 56135201879700000.1]",65134442822400000,0.5
4,[112137283351600000.1],61134406652800000,0.0
5,[57134649143400000.1],57134407687800000,0.0
6,"[105137049319900000.1, 105137629836900000.1]",56134397318700000,0.5
7,"[57136557858500000.2, 57136557858500000.1, 571...",57134320447800000,1.0
8,[57134702786200000.1],57134339147100000,0.0
9,[61134770599600000.1],61134406243400000,1.0


In [50]:
# Flag owners whose mean loan score is below 0.5 with 1, everyone else with 0
# (a missing score compares as False and therefore gets 0).
# NOTE(review): 1 marks the LOW-scoring owners here — confirm this polarity is intended.
scored_below_half = outcomes_perperson['owner_score'] < 0.5
outcomes_perperson['owner_score_binarize'] = scored_below_half.astype('int64')

In [51]:
outcomes_perperson

Unnamed: 0,loans,m_ids_owner,owner_score,owner_score_binarize
0,"[60137430710900000.1, 60134978362300000.1]",60134547419200000,0.5,0
1,[63136740549700000.1],63134425702500000,1.0,0
2,"[105136540140100000.1, 105136540140100000.2]",65134432186900000,1.0,0
3,"[56134761927800000.1, 56135201879700000.1]",65134442822400000,0.5,0
4,[112137283351600000.1],61134406652800000,0.0,1
5,[57134649143400000.1],57134407687800000,0.0,1
6,"[105137049319900000.1, 105137629836900000.1]",56134397318700000,0.5,0
7,"[57136557858500000.2, 57136557858500000.1, 571...",57134320447800000,1.0,0
8,[57134702786200000.1],57134339147100000,0.0,1
9,[61134770599600000.1],61134406243400000,1.0,0
