# Import Data

In [50]:
import pandas as pd
# Allow very wide frames to be displayed without column truncation.
pd.options.display.max_columns = 900
# Parse the creation timestamp by column name instead of by position, so the
# load keeps working if columns are added to or reordered in the CSV.
df = pd.read_csv('20170103_features.csv', parse_dates=['created_at'])

# Create Date Features

In [51]:
# Derive calendar features from the `created_at` timestamp with the vectorised
# `.dt` accessor instead of a Python-level lambda per row.
df['created_month'] = df.created_at.dt.month
df['created_year'] = df.created_at.dt.year
# `Timestamp.weekday_name` was removed in pandas 1.0; `day_name()` replaces it.
df['created_weekday_name'] = df.created_at.dt.day_name()
# Boolean flags are stored as 0/1 integers.
df['created_is_month_start'] = df.created_at.dt.is_month_start.astype(int)
df['created_is_month_end'] = df.created_at.dt.is_month_end.astype(int)
# ISO week number; `isocalendar()` returns a nullable UInt32 column.
df['created_weekofyear'] = df.created_at.dt.isocalendar().week

Preview dataset

In [3]:
# Show the first two rows, including the date features added to `df`.
df.head(n=2)

Unnamed: 0,id,borrower_id,usd_amount,length(loans.proposal),length(about_me),length(about_business),length(address_instructions),missing_natl_id,missing_referred_by,application_time,default_flag,fraud_flag,friends_count,country_id,name,category_id,invited_flag,made_pmts,missed_pmts,sift_science_score,english_flag,business_years,reserve_fee_pct,sift_labeled_bad,about_me_field,about_business_field,proposal_field,ip_user_count,ip_borrowers_defaulted,ip_defaulted_ratio,ip_borrowers_fraudulent,ip_fraudulent_ratio,invited,parent_age_at_invite,parent_comment_cnt,parent_comment_char_cnt,parent_avg_char_per_comment,parent_repay_ratio,parent_usd_total_amount,disbursed_at,created_at,usd_installment_amount,prior_loans,country_internet_users,country_life_expectancy,country_literacy,country_gdppc,created_month,created_year,created_weekday_name,created_is_month_start,created_is_month_end,created_weekofyear
0,5028,15316,250.0,356,595.0,529.0,32,0,1,,0,0,1307.0,32,Kenya,,1,119.0,0.0,,1,,0.0,0,My name is John Wanyoike. I was born in 1985...,"Since 2011, I have been doing my IT technical ...",Thank you for giving me this opportunity to ap...,,,,,,1.0,321.0,62.0,38573.0,622.0,1.000054,412.0,1/28/2014 23:00,2014-01-10 08:47:00,,0.0,46.0,62.0,78.0,1429.0,1,2014,Friday,0,0,2
1,4866,15063,250.0,473,882.0,1082.0,206,0,1,,1,0,151.0,32,Kenya,,1,28.0,56.0,,1,,0.0,1,My\r\n names are tom mwaura from narok town an...,I\r\n am a businessman whose business deals in...,The first loan given will be used to aid in re...,,,,,,1.0,352.0,9.0,1447.0,161.0,1.000131,89.0,1/26/2014 23:00,2014-01-02 06:35:00,,0.0,46.0,62.0,78.0,1429.0,1,2014,Thursday,0,0,1


# Create Date Partition For Train, Validation, Holdout

In [68]:
# Partition boundaries, parsed once instead of on every call.
VALIDATION_START = pd.Timestamp('2016-03-01')
HOLDOUT_START = pd.Timestamp('2016-05-15')


def custom_partition(dt, validation_start=VALIDATION_START, holdout_start=HOLDOUT_START):
    """
    Map a timestamp to a train / validation / holdout label.

    Parameters
    ----------
    dt : timestamp to classify; must be comparable with the two boundaries.
    validation_start : first instant (inclusive) of the validation window.
    holdout_start : first instant (inclusive) of the holdout window; the
        validation window ends just before it.

    Returns
    -------
    'h' when dt >= holdout_start,
    'v' when validation_start <= dt < holdout_start,
    't' otherwise.

    Note: a missing timestamp (NaT) compares False against both boundaries
    and therefore falls through to 't'.
    """
    if dt >= holdout_start:
        return 'h'
    # The upper bound is implied here: anything >= holdout_start returned above.
    if dt >= validation_start:
        return 'v'
    return 't'

In [69]:
# Label every row with its partition code based on its creation timestamp.
df['part'] = df['created_at'].apply(custom_partition)

Check partition statistics

In [70]:
# Earliest / latest creation timestamp and row count for each partition label.
(
    df.groupby('part')['created_at']
      .agg(['min', 'max', 'count'])
)

Unnamed: 0_level_0,min,max,count
part,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
h,2016-05-15 00:41:00,2016-07-24 06:34:00,2272
t,2014-01-01 03:51:00,2016-02-29 23:21:00,18579
v,2016-03-01 00:19:00,2016-05-14 18:35:00,2210


# Send To DataRobot

In [74]:
import datarobot as dr
from datarobot import UserTVH
# Custom partition object: points at the 'part' column and names the label
# used for each of the training, validation and holdout levels.
part = UserTVH(
    user_partition_col='part',
    training_level='t',
    validation_level='v',
    holdout_level='h',
    seed=0,
)

In [75]:
# Columns to remove before upload. Names that are not present in the frame
# are filtered out by send_to_datarobot before dropping.
droplist = [
    'missed_pmts',
    'nonfraud_default',
    'sift_labeled_bad',
    'sift_science_score',
    'made_pmts',
    'fraud_flag',
    'reserve_fee_pct',
    'country_id',
    'created_at',
]

def send_to_datarobot(df, proj_name, cols_to_drop, part,
                      output_dir='/Users/hamelhusain/Google Drive/Team Shared Folder - Zidisha',
                      target='default_flag', worker_count=30):
    """
    Prepare a modelling frame, save it as a gzipped CSV and upload it to DataRobot.

    Parameters
    ----------
    df : DataFrame containing at least `borrower_id`, `id` and the target
        column. It is not modified.
    proj_name : DataRobot project name; also used to build the file name
        `DR_<proj_name>.gz`.
    cols_to_drop : column names to remove before upload. Names missing from
        `df` are ignored. Must not include `borrower_id` or `id`, which are
        needed to build `unique_id`.
    part : partitioning object passed to `set_target` as `partitioning_method`.
    output_dir : directory the gzipped CSV is written to. The default keeps the
        original author's location; pass your own directory on other machines.
    target : name of the target column.
    worker_count : value passed to `set_target` as `worker_count`.

    Returns
    -------
    The project object returned by `dr.Project.create`, after `set_target`
    has been called on it.
    """
    import os  # local import so this cell does not depend on a separate import cell

    # Drop only the requested columns that actually exist in the frame.
    present_drops = [col for col in cols_to_drop if col in df.columns]

    # Build the frame without mutating the caller's DataFrame: combine
    # borrower_id and id into a single `unique_id` column (appended last),
    # then remove the two source id columns.
    final_df = (
        df.drop(present_drops, axis=1)
          .assign(unique_id=lambda d: d.borrower_id.astype(str) + '__' + d.id.astype(str))
          .drop(['borrower_id', 'id'], axis=1)
    )

    # Save the file to disk, with compression.
    file_path = os.path.join(output_dir, 'DR_' + proj_name + '.gz')
    print('Saving {}'.format(file_path))
    final_df.to_csv(file_path, compression='gzip', index=False)

    print('Beginning DR Upload')
    proj = dr.Project.create(file_path, project_name=proj_name)
    proj.set_target(target=target,
                    partitioning_method=part,
                    worker_count=worker_count)
    return proj


In [None]:
# Upload the feature frame as a new DataRobot project using the custom partition.
proj = send_to_datarobot(
    df,
    'Zidisha_20170103_JasonFeatures_v2',
    cols_to_drop=droplist,
    part=part,
)

Saving /Users/hamelhusain/Google Drive/Team Shared Folder - Zidisha/DR_Zidisha_20170103_JasonFeatures_v2.gz
Beginning DR Upload


### Link To Project

In [None]:
# Display the leaderboard permalink for the project created by send_to_datarobot.
proj.get_leaderboard_ui_permalink()