In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.relativedelta import relativedelta
%matplotlib inline  

In [17]:
def calc_donation_lapse(df):
    """Add per-donor donation counters and gaps between consecutive donations.

    The frame is sorted by ``arc_id`` then ``donation_dt`` and three columns
    are added to the sorted copy (the input frame is left untouched):

    * ``repeat_donor_ind`` -- 1 when the row's ``arc_id`` appears on more than
      one row of the frame, otherwise 0.
    * ``donate_times`` -- running count (1, 2, 3, ...) of the donor's rows in
      date order.
    * ``lag_in_days`` -- whole days since the donor's previous row; 0 on each
      donor's first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``arc_id`` and ``donation_dt``; ``donation_dt`` is parsed
        with the format ``%Y-%m-%d``.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with the three columns above (integer dtype).

    Raises
    ------
    ValueError
        If a ``donation_dt`` value is missing or does not match the format.
    """
    df_sort = df.sort_values(by=['arc_id', 'donation_dt'])

    arc_ids = df_sort['arc_id']
    donation_dates = pd.to_datetime(df_sort['donation_dt'], format='%Y-%m-%d')
    if donation_dates.isna().any():
        # A missing date would silently poison the day differences below.
        raise ValueError('donation_dt contains missing values')

    # One value_counts() pass replaces a full-frame scan for every single row.
    rows_per_donor = arc_ids.map(arc_ids.value_counts())
    df_sort['repeat_donor_ind'] = (rows_per_donor > 1).astype(int)

    # After sorting, a donor's rows are contiguous, so a row opens a new donor
    # exactly when its id differs from the id on the row above it.
    starts_new_donor = arc_ids.ne(arc_ids.shift())
    run_id = starts_new_donor.cumsum()
    df_sort['donate_times'] = starts_new_donor.groupby(run_id).cumcount() + 1

    # diff() also subtracts across donor boundaries; those rows are reset to 0.
    day_gaps = donation_dates.diff().dt.days
    df_sort['lag_in_days'] = day_gaps.where(~starts_new_donor, 0).astype(int)

    return df_sort

In [3]:
def cal_age(df):
    """Add the donor's age at each donation and at their first donation.

    The frame is sorted by ``arc_id`` then ``donation_dt`` and four columns
    are added to the sorted copy: ``fst_donation_age``,
    ``fst_donation_age_cat``, ``age_at_donation`` and ``age_at_donation_cat``.
    The ``*_cat`` columns hold the label returned by ``age_categorization``.

    Rows whose ``birth_dt`` is empty or not a string are skipped: their four
    new values stay NaN and they do not count as a donor's first row.

    For the first usable row of each donor the first-donation age is the age
    on that row. On later rows it is the remembered value when
    ``first_donat_ind == 0`` and the row's own age otherwise.
    NOTE(review): presumably ``first_donat_ind`` flags a donor's first-ever
    donation -- confirm against the data dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``arc_id``, ``donation_dt`` (``%Y-%m-%d``), ``birth_dt``
        (``%Y/%m/%d``) and ``first_donat_ind``.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with the four columns above.
    """
    df_sort = df.sort_values(by=['arc_id', 'donation_dt'])

    birth_dt_format = '%Y/%m/%d'
    donate_dt_format = '%Y-%m-%d'

    # Results are collected positionally and assigned as whole columns at the
    # end; skipped rows keep the NaN placeholder.
    missing = float('nan')
    n_rows = len(df_sort)
    fst_ages = [missing] * n_rows
    fst_age_cats = [missing] * n_rows
    ages = [missing] * n_rows
    age_cats = [missing] * n_rows

    last_arc_id = None
    fst_donation_age = None

    rows = zip(df_sort['arc_id'], df_sort['donation_dt'],
               df_sort['birth_dt'], df_sort['first_donat_ind'])
    for pos, (this_arc_id, donation_dt, birth_dt, first_donat_ind) in enumerate(rows):
        if not isinstance(birth_dt, str) or not birth_dt:
            continue

        this_dt = datetime.strptime(donation_dt, donate_dt_format)
        this_birth_dt = datetime.strptime(birth_dt, birth_dt_format)
        this_age = relativedelta(this_dt, this_birth_dt).years

        if this_arc_id != last_arc_id:
            # First usable row for this donor: remember the age for later rows.
            fst_donation_age = this_age
            row_fst_age = this_age
        elif first_donat_ind == 0:
            row_fst_age = fst_donation_age
        else:
            row_fst_age = this_age

        fst_ages[pos] = row_fst_age
        fst_age_cats[pos] = age_categorization(row_fst_age)
        ages[pos] = this_age
        age_cats[pos] = age_categorization(this_age)

        last_arc_id = this_arc_id

    df_sort['fst_donation_age'] = fst_ages
    df_sort['fst_donation_age_cat'] = fst_age_cats
    df_sort['age_at_donation'] = ages
    df_sort['age_at_donation_cat'] = age_cats

    return df_sort

In [4]:
def age_categorization(age):
    """Return the age-bracket label for an age in years.

    Brackets: ``'<18'`` (below 18), ``'18-22'`` (18 up to 22 inclusive),
    ``'23-30'`` (above 22 up to 30), ``'31-35'`` (above 30 up to 35) and
    ``'>35'`` (above 35).

    Raises
    ------
    ValueError
        If ``age`` is NaN, which matches no bracket.
    """
    if age != age:
        # NaN is the only value unequal to itself; every comparison below
        # would be False for it, so fail with a clear message instead.
        raise ValueError('age must not be NaN')
    if age < 18:
        return '<18'
    if age <= 22:
        return '18-22'
    if age <= 30:
        return '23-30'
    if age <= 35:
        return '31-35'
    return '>35'

In [5]:
def if_repeat_donor(data, arc_id):
    """Return 1 if ``arc_id`` occurs on more than one row of ``data``, else 0.

    ``data`` must be exactly a ``pandas.DataFrame`` with an ``arc_id`` column;
    any other type trips the assertion.
    """
    assert type(data) == pd.DataFrame
    matching_rows = data[data.arc_id == arc_id]
    return 1 if len(matching_rows) > 1 else 0

Import the dataset

In [6]:
# Directory and file names
# NOTE(review): `pname` is an absolute, machine-specific directory; the notebook
# only runs where this path exists. It is reused when the second CSV is loaded.
pname = '/home/data/RedCross/'
fname1 = 'donor_summary912016.csv'
# Read the first CSV with pandas' default options.
data1 = pd.read_csv(pname+fname1)

In [7]:
# Second input file, read from the same directory (`pname`) with latin-1 decoding.
fname2 = 'state13.csv'
data2 = pd.read_csv(pname+fname2, encoding='latin-1')

In [8]:
# Left join on arc_id: every row of data2 is kept and matching data1 columns are attached.
data = data2.merge(data1, how='left', on='arc_id')

In [9]:
# (rows, columns) of the merged frame.
data.shape

(2065563, 127)

In [10]:
# Column labels of the merged frame.
c_names = data.columns

In [11]:
# Missing values per column of `data`: absolute count and fraction of all rows.
# DataFrame.count() returns the non-null tally of every column in one call, so
# the frame is no longer filtered column by column (twice) in Python loops.
missing_val_cnt = data.shape[0] - data.count()
missing_val_pct = missing_val_cnt / data.shape[0]

In [12]:
# Stack the two Series as rows labelled 'count' and 'percentage', then transpose so
# each row of the table is one column of `data`.
missing_val_stats = pd.DataFrame([missing_val_cnt, missing_val_pct], index=['count', 'percentage']).transpose()

In [13]:
# Write the summary to CSV, columns with the largest missing fraction first.
# NOTE(review): the output path is absolute and user-specific.
missing_val_stats.sort_values(by='percentage', ascending=False).to_csv('/home/ysu7/data/missing_val_stats_stats13_summary.csv')

In [None]:
# Columns with less than 70% missing values (the filter threshold is 0.7).
# NOTE(review): this comment previously said 50% -- confirm which cutoff is intended.
missing_val_stats[missing_val_stats['percentage']<0.7]

In [14]:
# Distinct values of donation_type; each is shown at the index of its first occurrence.
data[['donation_type']].drop_duplicates()

Unnamed: 0,donation_type
0,Whole Blood
42,Plateletpheresis
56,Red Cell Apheresis
19649,Plasmapheresis


In [15]:
# Rebind `data` to the frame returned by cal_age: sorted by arc_id and donation_dt,
# with the age columns added.
data = cal_age(data)

In [None]:
# First rows where deferral_ind equals 1.
data[data.deferral_ind == 1].head()

In [None]:
# Add repeat_donor_ind, donate_times and lag_in_days for every row of `data`.
data_with_lapse = calc_donation_lapse(data)

In [None]:
# Preview the derived columns; each age column is followed by its category column.
data_with_lapse[['arc_id', 'donation_dt', 'fst_donation_age', 'fst_donation_age_cat',
                 'age_at_donation', 'age_at_donation_cat', 'repeat_donor_ind',
                 'donate_times', 'lag_in_days']].head(10)

In [None]:
# Gaps between consecutive donations, in years (days / 365), for rows with
# donation_ind == 1 and a non-zero lag. The result carries a fresh 0..n-1 index.
lapse_all = pd.Series(
    data_with_lapse.loc[
        (data_with_lapse['lag_in_days'] != 0) &
        (data_with_lapse['donation_ind'] == 1),
        'lag_in_days'
    ].values / 365
)

In [None]:
# Histogram of the time difference between consecutive donations (all donors)
axes = plt.gca()
axes.set_xlim([0, 10])
axes.set_xticks([tick * 0.5 for tick in range(0, 21)])
axes.set_title('Time Diff. Between Donations')
axes.set_ylabel('Counts')
axes.set_xlabel('Years')
lapse_all.hist(bins=300, ax=axes)

In [None]:
# Gap between a donor's first and second row, in years (days / 365), for rows
# with donation_ind == 1. The result carries a fresh 0..n-1 index.
lapse_1_2 = pd.Series(
    data_with_lapse.loc[
        (data_with_lapse['donate_times'] == 2) &
        (data_with_lapse['donation_ind'] == 1),
        'lag_in_days'
    ].values / 365
)

In [None]:
# Histogram of the time difference between first and second donations
axes = plt.gca()
axes.set_xlim([0, 10])
axes.set_xticks([tick * 0.5 for tick in range(0, 21)])
axes.set_title('Time Diff. Between 1st & 2nd Donations')
axes.set_ylabel('Counts')
axes.set_xlabel('Years')
lapse_1_2.hist(bins=300, ax=axes)

Based on the histograms, we can assume that donors are more likely to be retained if they come back to donate within 1 year.

----------------------------

# Whole Blood Donors

In [None]:
# Keep only rows whose donation_type is exactly 'Whole Blood'.
whole_data = data[data['donation_type'] == 'Whole Blood']

In [None]:
# Recompute donate_times / lag_in_days / repeat_donor_ind using whole-blood rows only.
whole_data_with_lapse = calc_donation_lapse(whole_data)

In [None]:
# Gaps between consecutive whole-blood donations, in 56-day cycles, for rows
# with donation_ind == 1 and a non-zero lag. Both mask terms are taken from
# whole_data_with_lapse so the mask has exactly the index of the frame it filters.
whole_lapse = pd.Series(
    whole_data_with_lapse.loc[
        (whole_data_with_lapse['lag_in_days'] != 0) &
        (whole_data_with_lapse['donation_ind'] == 1),
        'lag_in_days'
    ].values / 56
)

In [None]:
# Gap between a donor's first and second whole-blood row, in 56-day cycles, for
# rows with donation_ind == 1. Both mask terms are taken from
# whole_data_with_lapse so the mask has exactly the index of the frame it filters.
whole_lapse_1_2 = pd.Series(
    whole_data_with_lapse.loc[
        (whole_data_with_lapse['donate_times'] == 2) &
        (whole_data_with_lapse['donation_ind'] == 1),
        'lag_in_days'
    ].values / 56
)

In [None]:
# Gaps on a donor's second and later whole-blood rows (donate_times >= 2), in
# 56-day cycles, for rows with donation_ind == 1. Both mask terms are taken from
# whole_data_with_lapse so the mask has exactly the index of the frame it filters.
whole_lapse_gt_2 = pd.Series(
    whole_data_with_lapse.loc[
        (whole_data_with_lapse['donate_times'] >= 2) &
        (whole_data_with_lapse['donation_ind'] == 1),
        'lag_in_days'
    ].values / 56
)

In [None]:
# Three side-by-side histograms of whole-blood donation gaps, in 56-day cycles:
#   1. time lapse between donations
#   2. time lapse between 1st and 2nd donations
#   3. time lapse between donations after the second donation
plt.figure(1, figsize=(15, 5))
panels = [
    (whole_lapse, 'Cycles(56 days/cycle)'),
    (whole_lapse_1_2, 'Cycles btw. 1st and 2nd donations'),
    (whole_lapse_gt_2, 'Cycles btw. Donations after 2nd'),
]
for panel_no, (cycles, x_label) in enumerate(panels, start=1):
    axes = plt.subplot(1, 3, panel_no)
    axes.set_xlim([0, 20])
    axes.set_xticks(range(21))
    if panel_no == 1:
        # Only the left-most panel carries the shared y-axis label.
        axes.set_ylabel('Counts')
    axes.set_xlabel(x_label)
    last_hist = cycles.hist(bins=180, ax=axes)
last_hist

In [None]:
# arc_id of every whole-blood row with a non-zero lag shorter than 56 days,
# i.e. a donation made less than 56 days after the donor's previous one.
abn_donors_id = whole_data_with_lapse.loc[
    (whole_data_with_lapse.lag_in_days < 56) & (whole_data_with_lapse.lag_in_days != 0),
    'arc_id'
]
# All rows (any donation type) of those donors, taken from the full lapse table.
abn_donors_df = data_with_lapse.loc[data_with_lapse['arc_id'].isin(abn_donors_id)]

In [None]:
# Number of qualifying rows; an arc_id appears once per qualifying row, so this
# is not necessarily the number of distinct donors.
len(abn_donors_id)

In [None]:
# First rows of the selected donors' history, limited to the columns relevant to donation timing.
abn_donors_df[['arc_id', 'donation_type','donation_dt', 'first_donat_ind', 'deferral_ind',
               'donation_ind','donate_times', 'lag_in_days', 'age_at_donation']].head(8)