In [1]:
platform = 'lendingclub'

store = pd.HDFStore(
    '/Users/justinhsi/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
    format(platform),
    append=True)
loan_info = store['loan_info_clean']
store.close()

In [2]:
loan_info.loan_status.value_counts(dropna=False)

current        741936
paid           509707
charged_off    130191
late_120        19738
grace_15        12762
late_30          4121
defaulted          24
Name: loan_status, dtype: int64

# do some investigations on done loans

In [8]:
done_loans = loan_info[loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted'])]
old_done_loans = done_loans[done_loans['maturity_time'] >= 1.0]
odl_36 = done_loans[done_loans['term'] == 36]
odl_60 = done_loans[done_loans['term'] == 60]

# Use maturity_paid, which will be 0 for loans that made no payments and defaulted regardless of how old it is.
# Bin maturity_paid into 36 and 60 bins depending on term

In [32]:
def make_bins(term, df, col):
    bins = np.arange(0,1+(1/term),1/term)
    labels = np.arange(1,term+1)
    series = pd.cut(df[col], bins=bins, labels=labels, include_lowest = True)
    return series

In [33]:
odl_36['binned_matp'] = make_bins(36, odl_36, 'maturity_paid')
odl_60['binned_matp'] = make_bins(60, odl_60, 'maturity_paid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Calculate historical prob of default based on just grade and term

In [52]:
def historical_prob_def(df):
    grade_grouped = df.groupby('grade')
    results_dict = {}
    for grade, group in grade_grouped:
        paid = group[group['loan_status'] == 'paid']
        results_dict[grade] = 1-(len(paid)/len(group))
    return pd.Series(results_dict)

def new_likelihoods(df):
    bins = df['binned_matp'].unique()
    results_dict = {}
    grade_grouped = df.groupby('grade')
    for grade, group in tqdm_notebook(grade_grouped):
        results = {}
        for i in bins:
            subset = group[group['binned_matp'] >= i]
            paid = subset[subset['loan_status'] == 'paid']
            results[i] = 1-(len(paid)/len(subset))
        results_dict[grade] = pd.Series(results)
    return pd.DataFrame(results_dict).T

In [53]:
priors = {}
priors[36] = historical_prob_def(odl_36)
priors[60] = historical_prob_def(odl_60)

In [54]:
likelihood = {}
likelihood[36] = new_likelihoods(odl_36)
likelihood[60] = new_likelihoods(odl_60)







In [55]:
likelihood[36]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
A,0.064241,0.063758,0.062985,0.062025,0.060791,0.059075,0.057033,0.055019,0.052531,0.049808,0.046965,0.044356,0.041577,0.038684,0.035319,0.032527,0.029798,0.026853,0.024335,0.021936,0.019463,0.017142,0.014924,0.013057,0.011235,0.00951,0.007946,0.006629,0.005298,0.00431,0.00332,0.00254,0.001853,0.001176,0.000573,0.00017
B,0.124775,0.123735,0.122565,0.120887,0.118417,0.115136,0.111749,0.107788,0.102142,0.096558,0.090972,0.08526,0.079597,0.073522,0.067545,0.062254,0.056517,0.051224,0.046178,0.041603,0.036825,0.032495,0.028262,0.02487,0.021428,0.018274,0.015586,0.013004,0.010786,0.008714,0.006551,0.004734,0.003416,0.002047,0.000928,0.000193
C,0.204246,0.20244,0.200318,0.19724,0.19284,0.187507,0.180907,0.173087,0.163397,0.152979,0.142746,0.131848,0.122196,0.111719,0.101183,0.09155,0.082374,0.073136,0.064893,0.057616,0.050234,0.043883,0.038411,0.032813,0.028189,0.023695,0.019725,0.016414,0.013344,0.010509,0.008076,0.005955,0.004073,0.00254,0.001301,0.000366
D,0.265442,0.262515,0.259325,0.254978,0.248755,0.241232,0.232052,0.219837,0.20656,0.193162,0.179555,0.167474,0.154839,0.140143,0.127862,0.115858,0.104121,0.092683,0.08251,0.073268,0.063856,0.056532,0.048753,0.041381,0.034746,0.029431,0.024788,0.020161,0.017018,0.013309,0.010631,0.007652,0.005354,0.003252,0.001556,0.000374
E,0.335187,0.330734,0.325901,0.32071,0.312419,0.301949,0.288393,0.271573,0.254373,0.236569,0.218276,0.199637,0.181294,0.163954,0.148252,0.133125,0.118851,0.105414,0.091303,0.080656,0.070432,0.061564,0.052526,0.045098,0.039144,0.031136,0.025823,0.0206,0.016004,0.012814,0.010139,0.007065,0.004747,0.002963,0.001172,0.000547
F,0.382549,0.376536,0.37011,0.362192,0.351267,0.336704,0.316993,0.297361,0.274616,0.253056,0.233029,0.214891,0.196354,0.183915,0.16646,0.147972,0.129773,0.113127,0.098558,0.088165,0.07753,0.066967,0.057483,0.05083,0.044081,0.036891,0.031689,0.026078,0.02147,0.015018,0.011397,0.007749,0.004443,0.002226,0.000743,0.000372
G,0.464752,0.454061,0.445196,0.436039,0.417614,0.38806,0.371166,0.331158,0.3086,0.280702,0.246324,0.223485,0.194499,0.173387,0.161554,0.151139,0.145833,0.133192,0.122056,0.108696,0.094923,0.082774,0.070295,0.061785,0.053118,0.046512,0.030733,0.028436,0.019139,0.016787,0.012048,0.007264,0.004854,0.0,0.0,0.0


# Given grade, term, and maturity_paid of ongoing loans, I should be able to predict an updated probability of default based on their maturity_paid

In [21]:
odl_36[['binned_matp', 'maturity_paid']]

Unnamed: 0_level_0,binned_matp,maturity_paid
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1077501,36.0,1.000000
1077175,36.0,1.000000
1076863,36.0,1.000000
1075269,36.0,1.000000
1072053,36.0,1.000000
1069908,36.0,1.000000
1064687,8.0,0.198993
1069866,36.0,1.000000
1069057,22.0,0.604028
1069759,36.0,1.000000


# Random

In [56]:
# Let's say you have a fair coin.
# Let's say you don't know whether or not it's actually biased towards heads by a certain degree.
# Let's say you can flip it at most a certain number of times.
# What's the probability that, at some point along the way, it looks pretty likely to be biased?
from random import random
from scipy.misc import comb

# The actual bias of the coin
BIAS = 0.5
# The number of times to flip the coin
FLIPS = 300
# All hypotheses will be compared only to the final hypothesis
HYPOTHESES = (0.75, 0.55, 0.5)
# We'll check how often the likelihood ratio goes above these values
THRESHOLDS = (20, 100)
# Ignore extreme odds that occur before at least this much data has been collected:
LARGE_NUMBER = 0
# We'll run this many squences of FLIPS flips to approximate the probability of exceeding the above thresholds.
RUNS = 10000

def gen_sequence():
    for i in range(FLIPS):
        yield 'H' if random() < BIAS else 'T'

def odds_sequence(sequence):
    n_heads = 0
    n_flips = 0
    for flip in sequence:
        n_flips += 1
        if flip == 'H':
            n_heads += 1
        yield tuple(b**n_heads * (1-b)**(n_flips - n_heads) for b in HYPOTHESES)

def relative_odds(odds, i, j):
    return odds[i] / odds[j] if odds[j] > 0 else float('inf')

def most_extreme_odds(i, odds_list):
    assert odds_list
    rel_odds = [relative_odds(o, i, -1) for o in odds_list]
    indexed_odds = list(enumerate(rel_odds))[LARGE_NUMBER:]
    
    max_i, max_odds = max(indexed_odds, key=lambda io: io[1])
    min_i, min_odds = min(indexed_odds, key=lambda io: io[1])
    return (max_odds, max_i + 1, min_odds, min_i + 1, rel_odds[-1], len(rel_odds))

# Generates a report for a single run of the data.
# (Not used by default, but you can use it yourself to inspect a single run.)
def report(odds_list):
    for i in range(len(HYPOTHESES) - 1):
        max_odds, max_n, min_odds, min_n, final_odds, final_n = most_extreme_odds(i, odds_list)
        print('Odds that the coin was {:2.0f}% biased towards heads (as opposed to {:2.0f}%)'.format(
            100.0 * HYPOTHESES[i],
            100.0 * HYPOTHESES[-1]))
        print('\tMaximum: ({} : 1) after {} flips.'.format(max_odds, max_n))
        print('\tMinimum: ({} : 1) after {} flips.'.format(min_odds, min_n))
        print('\t  Final: ({} : 1) after {} flips.'.format(final_odds, final_n))

def report_threshold_breaches(i, odds_lists):
    assert odds_lists
    extremities = (most_extreme_odds(i, odds_list) for odds_list in odds_lists)
    middle_counters = {t: 0 for t in THRESHOLDS}
    middle_trackers = {t: [] for t in THRESHOLDS}
    final_counters = {t: 0 for t in THRESHOLDS}
    for max_odds, max_n, _, _, final_odds, _ in extremities:
        for threshold in THRESHOLDS:
            if max_odds >= threshold:
                middle_counters[threshold] += 1
                middle_trackers[threshold].append(max_n)
            if final_odds >= threshold:
                final_counters[threshold] += 1
    for threshold in THRESHOLDS:
        print('Odds for {:2.0f}% bias over {:2.0f}% went above ({} : 1) {} times, which was ~{:0.2f}% of the time'.format(
            100.0 * HYPOTHESES[i],
            100.0 * HYPOTHESES[-1],
            threshold,
            middle_counters[threshold],
            (100.0 * middle_counters[threshold]) / len(odds_lists)))
        print('\tand ended above ({} : 1) odds {} ({:0.0f}%) times.'.format(
            threshold,
            final_counters[threshold],
            (100.0 * final_counters[threshold]) / len(odds_lists)))
        print('\tthreshold max happened at around toss number {:0.2f}, on average.'.format(
            sum(middle_trackers[threshold]) / len(middle_trackers[threshold])))

def run():
    odds_lists = [list(odds_sequence(gen_sequence())) for _ in range(RUNS)]
    for i in range(len(HYPOTHESES) - 1):
        report_threshold_breaches(i, odds_lists)
        print('')

run()

Odds for 75% bias over 50% went above (20 : 1) 391 times, which was ~3.91% of the time
	and ended above (20 : 1) odds 0 (0%) times.
	threshold max happened at around toss number 31.48, on average.
Odds for 75% bias over 50% went above (100 : 1) 78 times, which was ~0.78% of the time
	and ended above (100 : 1) odds 0 (0%) times.
	threshold max happened at around toss number 44.14, on average.

Odds for 55% bias over 50% went above (20 : 1) 133 times, which was ~1.33% of the time
	and ended above (20 : 1) odds 45 (0%) times.
	threshold max happened at around toss number 244.59, on average.
Odds for 55% bias over 50% went above (100 : 1) 7 times, which was ~0.07% of the time
	and ended above (100 : 1) odds 1 (0%) times.
	threshold max happened at around toss number 269.00, on average.

