In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline  

In [2]:
def calc_donation_lapse(df):
    """Number each donor's donations and measure the gap since the previous one.

    Rows are ordered by ``arc_id`` and then ``donation_dt``, and two columns
    are added to the ordered copy:

    * ``donate_times`` -- 1 for a donor's first donation, 2 for the second, ...
    * ``lag_in_days``  -- whole days since the same donor's previous donation,
      0 for a donor's first donation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``arc_id`` and ``donation_dt`` columns, with
        ``donation_dt`` written as ``YYYY-MM-DD``.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` with the two extra columns; ``df`` itself is
        left untouched.

    Raises
    ------
    KeyError
        If ``arc_id`` or ``donation_dt`` is missing from ``df``.
    ValueError
        If a ``donation_dt`` value does not match ``YYYY-MM-DD``.
    """
    df_sort = df.sort_values(by=['arc_id', 'donation_dt'])

    donation_dates = pd.to_datetime(df_sort['donation_dt'], format='%Y-%m-%d')

    # A row continues a donor's history when its id equals the id on the row
    # just above it in the sorted frame; every other row starts a new donor.
    donor_ids = df_sort['arc_id']
    same_donor = donor_ids.eq(donor_ids.shift())
    donor_run = (~same_donor).cumsum()

    # Grouping on positional values (not an index-aligned Series) keeps this
    # correct even when the frame's index labels are not unique.
    df_sort['donate_times'] = df_sort.groupby(donor_run.values).cumcount() + 1

    # diff() also spans donor boundaries, so those rows are reset to 0.
    df_sort['lag_in_days'] = donation_dates.diff().dt.days.where(same_donor, 0)

    return df_sort

Import the dataset

In [12]:
# Directory and file names
pname = '../data/'
fname1 = 'summary_first_1k.csv'
fname2 = 'state13_first_1k.csv'

# Each frame is read from its own file name; there is no plain `fname`
# variable in this notebook, so referring to one raises NameError.
data1 = pd.read_csv(pname+fname1)
data2 = pd.read_csv(pname+fname2)

# The cells below all work on a frame called `data`.
# NOTE(review): the head() output recorded in this notebook shows summary-style
# columns, so `data` is aliased to the summary frame here. The donation cells
# need `donation_dt` / `donation_type`, which that recorded frame lacked
# (see the recorded KeyErrors) -- confirm which file they should use.
data = data1

In [4]:
# Preview the first three rows of `data`.
data.head(3)

Unnamed: 0,bzd_assessedhomevalue,bzd_avg_bank_credit6,bzd_avg_inq_all12,bzd_avg_maxcred_install6,bzd_avg_mos_autopay,bzd_avg_numauto12,bzd_descretionary_spend,bzd_income,bzd_lengthofresidence,bzd_mortg_equity,...,fst_crs_cmptn_dt,last_crs_cmptn_dt,lftm_hrs_vol_cnt,last_vol_dt,first_vol_dt,zip5,zip4,gender,race,zip5c
0,,,,,,,,,,,...,,,,,,68516,2357.0,F,,68516
1,,11815.0,1.18,17573.67,558.83,0.071,,,,,...,,,,,,90040,1224.0,F,Hispanic,90040
2,,,,,,,,,,,...,,,,,,34981,3406.0,F,Caucasian,34981


In [5]:
# Column labels of `data`.
c_names = data.columns

In [6]:
# Number of missing values in each column of `data`, indexed by column label.
missing_val_cnt = data.isnull().sum()
# Fraction (0-1) of rows missing in each column. Dividing the Series keeps this
# from raising ZeroDivisionError on an empty frame (the result is NaN instead).
missing_val_pct = missing_val_cnt / data.shape[0]

In [7]:
# Stack the count and fraction Series as two labelled rows, then flip the
# table so that every column of `data` becomes one row of the summary.
missing_val_stats = pd.DataFrame(
    data=[missing_val_cnt, missing_val_pct],
    index=['count', 'percentage'],
).T

In [9]:
# Columns with less than 70% missing values (the cutoff applied below is 0.7)
missing_val_stats[missing_val_stats['percentage']<0.7]

Unnamed: 0,count,percentage
bzd_assessedhomevalue,612.0,0.612613
bzd_avg_bank_credit6,339.0,0.339339
bzd_avg_inq_all12,339.0,0.339339
bzd_avg_maxcred_install6,339.0,0.339339
bzd_avg_mos_autopay,339.0,0.339339
bzd_avg_numauto12,339.0,0.339339
bzd_income,612.0,0.612613
bzd_mortg_equity,612.0,0.612613
bzd_numberadultsinhh,645.0,0.645646
bzd_realty_mospay,612.0,0.612613


In [10]:
# Distinct values of the `donation_type` column.
# NOTE(review): the output recorded for this cell is a KeyError -- `data` had no
# `donation_type` column when it was run; confirm which dataset belongs here.
data[['donation_type']].drop_duplicates()

KeyError: "['donation_type'] not in index"

In [11]:
# Add the per-donor `donate_times` and `lag_in_days` columns.
# NOTE(review): the recorded run of this cell failed with KeyError: 'donation_dt',
# so every cell below that uses `data_with_lapse` has no recorded output.
data_with_lapse = calc_donation_lapse(data)

KeyError: 'donation_dt'

In [None]:
# First ten rows of the donor id, donation date and the two derived columns.
data_with_lapse[['arc_id', 'donation_dt', 'donate_times', 'lag_in_days']].head(10)

In [None]:
# All non-zero gaps between donations, converted from days to years
# (365 days per year) and re-indexed from 0.
lapse_all = pd.Series(
    data_with_lapse.loc[data_with_lapse['lag_in_days'] != 0, 'lag_in_days'].values / 365
)

In [None]:
# Histogram of every time difference between donations.
# The x axis is fixed to 0-10 years with a tick every half year.
axes = plt.gca()
axes.set_xlim(0, 10)
axes.set_xticks([0.5 * v for v in range(21)])
axes.set_ylabel('Counts')
axes.set_xlabel('Years')
lapse_all.hist(bins=300, ax=axes)

In [None]:
# Gap before each donor's second donation, converted from days to years
# (365 days per year) and re-indexed from 0.
lapse_1_2 = pd.Series(
    data_with_lapse.loc[data_with_lapse['donate_times'] == 2, 'lag_in_days'].values / 365
)

In [None]:
# Histogram of the time difference between first and second donations.
# The x axis is fixed to 0-10 years with a tick every half year.
axes = plt.gca()
axes.set_xlim(0, 10)
axes.set_xticks([0.5 * v for v in range(21)])
axes.set_ylabel('Counts')
axes.set_xlabel('Years')
lapse_1_2.hist(bins=300, ax=axes)

Based on the histograms, we can assume that donors are more likely to be retained if they come back to donate within 1 year.

In [None]:
# One row per distinct (arc_id, freq) pair.
# NOTE(review): `freq` is presumably the donor's total number of donations
# (it is plotted as 'Counts of Donations') -- confirm against the data dictionary.
freq_data = data_with_lapse[['arc_id', 'freq']].drop_duplicates()

# Gap in days before each donor's second donation.
delta_data = data_with_lapse.loc[data_with_lapse['donate_times'] == 2, ['arc_id', 'lag_in_days']]

# The left join keeps donors without a second donation; their lag is NaN
# straight after the merge.
freq_vs_delta = pd.merge(freq_data, delta_data, how='left', on='arc_id')

# fillna() returns a new frame, so the result has to be assigned back for the
# missing lags to actually become 0.
freq_vs_delta = freq_vs_delta.fillna(value=0)
freq_vs_delta['year_since_last_donation'] = freq_vs_delta['lag_in_days'] / 365

In [None]:
# Scatter plot for repeat donors
repeat_donors = freq_vs_delta[freq_vs_delta['freq'] > 1]

# Plot at most 1000 donors: sample() raises ValueError when asked for more rows
# than the frame holds. The fixed random_state keeps the figure reproducible.
plot_sample = repeat_donors.sample(min(1000, len(repeat_donors)), random_state=0)

# Draw on an explicitly created axes; DataFrame.plot opens its own figure when
# no `ax` is passed, which would leave a separately created figure empty.
fig, axes = plt.subplots()
plot_sample.plot.scatter(x='freq', y='year_since_last_donation', ax=axes)
axes.set_xlabel('Counts of Donations')
axes.set_ylabel('Years Between 1st and 2nd Donations')
plt.show()

----------------------------

# Whole Blood Donors

In [None]:
# Keep only the rows whose donation type is 'Whole blood'.
whole_data = data.loc[data['donation_type'].eq('Whole blood')]

In [None]:
# Same per-donor sequence/lag calculation, applied to the whole-blood rows only.
whole_data_with_lapse = calc_donation_lapse(whole_data)

In [None]:
# All non-zero gaps between whole-blood donations, expressed in 56-day cycles
# and re-indexed from 0.
whole_lapse = pd.Series(
    whole_data_with_lapse.loc[whole_data_with_lapse['lag_in_days'] != 0, 'lag_in_days'].values / 56
)

In [None]:
# Gap before each donor's second whole-blood donation, expressed in 56-day
# cycles and re-indexed from 0.
whole_lapse_1_2 = pd.Series(
    whole_data_with_lapse.loc[whole_data_with_lapse['donate_times'] == 2, 'lag_in_days'].values / 56
)

In [None]:
# Gaps before the third and later whole-blood donations, in 56-day cycles.
# The filter is strictly greater than 2: donate_times == 2 carries the gap
# between the 1st and 2nd donation, which belongs to whole_lapse_1_2 and not
# to the "after 2nd donation" series.
whole_lapse_gt_2 = pd.Series(map(lambda x: x/56, 
                                 whole_data_with_lapse[whole_data_with_lapse['donate_times']>2]['lag_in_days']))

In [None]:
# Three side-by-side histograms of whole-blood donation gaps, measured in
# 56-day cycles: all gaps, the gap between the 1st and 2nd donation, and the
# gaps after the 2nd donation. Every panel shares the 0-20 cycle x range with
# one tick per cycle; only the left-most panel carries the y label.
fig = plt.figure(1, figsize=(15, 5))
panels = [
    (whole_lapse, 'Cycles(56 days/cycle)'),
    (whole_lapse_1_2, 'Cycles btw. 1st and 2nd donations'),
    (whole_lapse_gt_2, 'Cycles btw. Donations after 2nd'),
]
for position, (lapse, xlabel) in enumerate(panels, start=1):
    axes = fig.add_subplot(1, 3, position)
    axes.set_xlim(0, 20)
    axes.set_xticks(range(21))
    if position == 1:
        axes.set_ylabel('Counts')
    axes.set_xlabel(xlabel)
    lapse.hist(bins=180, ax=axes)

In [None]:
# Donors who donated again less than 56 days after a previous donation.
# Zero lags are excluded (calc_donation_lapse uses 0 for a donor's first donation).
abn_donors_id = whole_data_with_lapse.loc[
    (whole_data_with_lapse.lag_in_days < 56) & (whole_data_with_lapse.lag_in_days != 0),
    ['arc_id'],
# A donor can have several short gaps; keep each id once so the merge below
# does not repeat that donor's whole history once per short gap.
].drop_duplicates()

# Pull the full donation history of every such donor.
abn_donors_df = pd.merge(abn_donors_id, whole_data_with_lapse, on='arc_id')

In [None]:
# First eight rows of the selected donors' histories, limited to the columns
# relevant to donation timing.
abn_donors_df[['arc_id', 'donation_type','donation_dt', 
               'donation_ind','donate_times', 'lag_in_days', 'age_at_donation']].head(8)