In [1]:
from timeit import default_timer as timer
import pandas as pd
import scipy.stats
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import calendar
import time
from itertools import permutations
%matplotlib inline
# Wall-clock reference point; the next cell reports the time elapsed since this line ran.
start_time = timer()

In [2]:
# Read the clock once so the minutes and seconds figures describe the same instant.
elapsed = timer() - start_time
print(f'Duration: {elapsed / 60: 0.2f} minutes, or{elapsed: 0.0f} seconds')

Duration:  0.00 minutes, or 0 seconds


In [3]:
# Load the raw applications file; the path is relative to the notebook's working directory.
data = pd.read_csv('applications data.csv')

In [4]:
# Preview the first rows to check how the columns were parsed.
data.head()

Unnamed: 0,record,date,ssn,firstname,lastname,address,zip5,dob,homephone,fraud_label
0,1,20160101,379070012,XRRAMMTR,SMJETJMJ,6861 EUTST PL,2765,19070626,1797504115,0
1,2,20160101,387482503,MAMSTUJR,RTTEMRRR,7280 URASA PL,57169,19340615,4164239415,1
2,3,20160101,200332444,SZMMUJEZS,EUSEZRAE,5581 RSREX LN,56721,19070626,216537580,0
3,4,20160101,747451317,SJJZSXRSZ,ETJXTXXS,1387 UJZXJ RD,35286,19440430,132144161,0
4,5,20160101,24065868,SSSXUEJMS,SSUUJXUZ,279 EAASA WY,3173,19980315,6101082272,0


In [5]:
# Parse the YYYYMMDD-encoded application date into a datetime column.
data['date'] = pd.to_datetime(data['date'],format='%Y%m%d')

In [6]:
# Keep a parsed copy of the birth date; 'dob' itself is turned into a string key further down.
data['dob_date'] = pd.to_datetime(data['dob'],format='%Y%m%d')

In [7]:
# Zip codes were read as numbers, so left-pad them back to five characters (e.g. 2765 -> 02765).
data['zip5'] = data['zip5'].astype('str').str.zfill(5)

### Handling Frivolous values

In [8]:
#address
# Rows carrying the placeholder address get a per-record value ("<record> Record")
# so they do not all link to one another through the same address.
is_placeholder_address = data['address'] == '123 MAIN ST'
data.loc[is_placeholder_address, 'address'] = (
    data.loc[is_placeholder_address, 'record'].astype('str') + " Record"
)

In [9]:
#dob
# Rows carrying the placeholder birth date 19070626 get the negated record number,
# which is unique per row; the column is then stored as a zero-padded string key.
is_placeholder_dob = data['dob'] == 19070626
data.loc[is_placeholder_dob, 'dob'] = -data.loc[is_placeholder_dob, 'record'].astype(int)
dob_as_text = data['dob'].astype(str)
data['dob'] = dob_as_text.str.zfill(8)

In [10]:
# Inspect the rewritten column: placeholder rows now hold a zero-padded negative record number.
data['dob']

0         000000-1
1         19340615
2         000000-3
3         19440430
4         19980315
            ...   
999995    19550418
999996    19150624
999997    19880628
999998    20120302
999999    19820826
Name: dob, Length: 1000000, dtype: object

In [11]:
#ssn
# Rows carrying the placeholder SSN 999999999 get the negated record number (unique per row);
# the column is then stored as a zero-padded 9-character string key.
is_placeholder_ssn = data['ssn'] == 999999999
data.loc[is_placeholder_ssn, 'ssn'] = -data.loc[is_placeholder_ssn, 'record'].astype(int)
ssn_as_text = data['ssn'].astype(str)
data['ssn'] = ssn_as_text.str.zfill(9)

In [12]:
#homephone
# Rows carrying the placeholder phone 9999999999 get the negated record number (unique per row);
# the column is then stored as a zero-padded 10-character string key.
is_placeholder_phone = data['homephone'] == 9999999999
data.loc[is_placeholder_phone, 'homephone'] = -data.loc[is_placeholder_phone, 'record'].astype(int)
phone_as_text = data['homephone'].astype(str)
data['homephone'] = phone_as_text.str.zfill(10)

In [13]:
# Weekday name of each application date.
# The vectorized accessor replaces a Python-level lambda per row, and it returns
# English names by default, matching the hard-coded English weekday list used
# further down to order the fraud rates.
data['dow'] = data['date'].dt.day_name()

In [14]:
# Applications dated before 1 Nov 2016; the day-of-week fraud rates below are computed on these rows only.
cutoff_date = pd.Timestamp('2016-11-01')
train_test = data.loc[data['date'] < cutoff_date]
len(train_test)

833507

In [15]:
#smoothing
# c and nmid parameterize the logistic weight used in the smoothing formula below.
c = 4
nmid = 20

# Overall fraud rate, plus per-weekday fraud rate and row count.
y_avg = train_test['fraud_label'].mean()
rows_by_dow = train_test.groupby('dow')
y_dow = rows_by_dow['fraud_label'].mean()
num = rows_by_dow.size()
y_dow

dow
Friday       0.014499
Monday       0.013480
Saturday     0.014968
Sunday       0.013674
Thursday     0.014981
Tuesday      0.014070
Wednesday    0.015169
Name: fraud_label, dtype: float64

In [16]:
# Scratch inspection cell: only the last expression (y_dow) is rendered;
# the first two lines are evaluated and discarded.
train_test.groupby('dow').size()
y_avg
y_dow

dow
Friday       0.014499
Monday       0.013480
Saturday     0.014968
Sunday       0.013674
Thursday     0.014981
Tuesday      0.014070
Wednesday    0.015169
Name: fraud_label, dtype: float64

In [17]:
# Pull each weekday's rate toward the overall rate; the pull is governed by a
# logistic function of the weekday's row count (midpoint nmid, steepness c).
logistic_denominator = 1 + np.exp(-(num - nmid) / c)
deviation_from_average = y_dow - y_avg
y_dow_smooth = y_avg + deviation_from_average / logistic_denominator
y_dow_smooth

dow
Friday       0.014499
Monday       0.013480
Saturday     0.014968
Sunday       0.013674
Thursday     0.014981
Tuesday      0.014070
Wednesday    0.015169
dtype: float64

In [18]:
# Unsmoothed per-weekday fraud rates, shown for comparison with y_dow_smooth.
y_dow

dow
Friday       0.014499
Monday       0.013480
Saturday     0.014968
Sunday       0.013674
Thursday     0.014981
Tuesday      0.014070
Wednesday    0.015169
Name: fraud_label, dtype: float64

In [19]:
# Attach the smoothed weekday fraud rate to every row by looking up its weekday name.
data['dow_risk'] = data['dow'].map(y_dow_smooth)

In [20]:
# Re-order the weekday rates Monday..Sunday (the groupby result is alphabetical)
# by making the weekday an ordered categorical before sorting.
cats = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
y_dow = (
    y_dow.reset_index()
    .assign(dow=lambda frame: pd.Categorical(frame['dow'], categories=cats, ordered=True))
    .sort_values('dow')
    .set_index('dow')
)
y_dow

Unnamed: 0_level_0,fraud_label
dow,Unnamed: 1_level_1
Monday,0.01348
Tuesday,0.01407
Wednesday,0.015169
Thursday,0.014981
Friday,0.014499
Saturday,0.014968
Sunday,0.013674


In [21]:
# List the columns present before the combined entity keys are added.
data.columns

Index(['record', 'date', 'ssn', 'firstname', 'lastname', 'address', 'zip5',
       'dob', 'homephone', 'fraud_label', 'dob_date', 'dow', 'dow_risk'],
      dtype='object')

In [22]:
# Build combined entity keys by plain string concatenation (no separator).
data['name'] = data['firstname'] + data['lastname']
data['fulladdress'] = data['address'] + data['zip5']

# Two-field keys; each new column is named "<left>_<right>".
for left, right in [
    ('name', 'dob'),
    ('name', 'fulladdress'),
    ('name', 'homephone'),
    ('fulladdress', 'dob'),
    ('fulladdress', 'homephone'),
    ('dob', 'homephone'),
]:
    data[f'{left}_{right}'] = data[left] + data[right]

data['homephone_name_dob'] = data['homephone'] + data['name'] + data['dob']



In [23]:
# Confirm the name/address/dob/homephone combination columns were added.
data.columns

Index(['record', 'date', 'ssn', 'firstname', 'lastname', 'address', 'zip5',
       'dob', 'homephone', 'fraud_label', 'dob_date', 'dow', 'dow_risk',
       'name', 'fulladdress', 'name_dob', 'name_fulladdress', 'name_homephone',
       'fulladdress_dob', 'fulladdress_homephone', 'dob_homephone',
       'homephone_name_dob'],
      dtype='object')

In [24]:
# SSN-prefixed combined keys: "ssn_<field>" is the SSN string followed by that field.
for suffix in ['firstname', 'lastname', 'dob', 'fulladdress',
               'homephone', 'address', 'zip5', 'name']:
    data['ssn_' + suffix] = data['ssn'] + data[suffix]

data['ssn_name_dob'] = data['ssn'] + data['name'] + data['dob']


In [25]:
# Confirm the SSN combination columns were added.
data.columns

Index(['record', 'date', 'ssn', 'firstname', 'lastname', 'address', 'zip5',
       'dob', 'homephone', 'fraud_label', 'dob_date', 'dow', 'dow_risk',
       'name', 'fulladdress', 'name_dob', 'name_fulladdress', 'name_homephone',
       'fulladdress_dob', 'fulladdress_homephone', 'dob_homephone',
       'homephone_name_dob', 'ssn_firstname', 'ssn_lastname', 'ssn_dob',
       'ssn_fulladdress', 'ssn_homephone', 'ssn_address', 'ssn_zip5',
       'ssn_name', 'ssn_name_dob'],
      dtype='object')

In [26]:
# Entity columns that the feature loops below iterate over.
# 'firstname' and 'lastname' are left out on their own.
single_and_base_keys = ['ssn', 'address', 'zip5', 'dob', 'homephone', 'name', 'fulladdress']
non_ssn_combined_keys = [
    'name_dob', 'name_fulladdress', 'name_homephone',
    'fulladdress_dob', 'fulladdress_homephone', 'dob_homephone',
    'homephone_name_dob',
]
ssn_combined_keys = [
    'ssn_' + suffix
    for suffix in ('firstname', 'lastname', 'dob', 'fulladdress', 'homephone',
                   'address', 'zip5', 'name', 'name_dob')
]
attributes = single_and_base_keys + non_ssn_combined_keys + ssn_combined_keys

In [27]:
# Number of entity columns the feature loops will iterate over.
len(attributes)

23

### Feature engineering

In [28]:
# vars accumulates the engineered features; data1 is the lookup frame for the self-joins.
# NOTE: `vars` shadows the Python builtin of the same name.
vars = data.copy()

# data1 carries renamed duplicates of date/record so that, after merging it with
# itself, the "other" application's date and record number stay distinguishable.
data1 = data.copy()
data1['check_date'] = data1['date']
data1['check_record'] = data1['record']
vars.shape

(1000000, 31)

In [29]:
# Number of distinct phone keys after the placeholder replacement.
data['homephone'].nunique()

106755

In [None]:
#days_since and velocity
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because the rest of the notebook reads and writes the frame under it.

# Value used for "<entity>_day_since" when no earlier application shares the entity value.
DAYS_SINCE_DEFAULT = 365

# --- days since the entity value was last seen --------------------------------
count_days_since = 0
st = time.time()
for e in attributes:
    d1=data1[['record','date',e]]
    dr=data1[['check_record','check_date',e]]
    # Self-join on the entity: one row per pair of applications sharing the value.
    temp=pd.merge(d1,dr,on=e)

    # Keep strictly earlier applications, then take the last one per record.
    # NOTE(review): .last() assumes the merge keeps each record's matches in
    # ascending check_record order - confirm, or switch to an explicit max.
    day_since=temp[temp.record>temp.check_record][['record','date','check_date']]\
                .groupby('record')[['date','check_date']].last()

    mapper=(day_since.date-day_since.check_date).dt.days
    # Assign the filled Series directly. The previous
    # `vars[col].fillna(..., inplace=True)` was a chained in-place update, which
    # leaves the frame untouched when pandas Copy-on-Write is active.
    vars[e+'_day_since']=vars.record.map(mapper).fillna(DAYS_SINCE_DEFAULT)
    count_days_since = count_days_since + 1

print(f'Total run time for days since: {(time.time() - st) / 60}min') 
print(f'Total variables for days since: {count_days_since}') 


# --- velocity: applications sharing the entity value in a trailing window -----
st = time.time()
count_velocity_vars = 0
for e in attributes:
    d1=data1[['record','date',e]]
    dr=data1[['check_record','check_date',e]]
    temp=pd.merge(d1,dr,on=e)
    # Loop-invariant work hoisted out of the window loop: only the application
    # itself and earlier ones count, and the date gap is the same for every window.
    temp=temp[temp.record >= temp.check_record]
    elapsed=temp.date-temp.check_date
    for t in [0,1,3,7,14,30]:
        # Same condition as check_date >= date - t days.
        count_day=temp[elapsed <= dt.timedelta(t)]
        col_name=f'{e}_count_{t}'
        mapper2=count_day.groupby('record')[e].count()
        vars[col_name]=vars.record.map(mapper2)
        count_velocity_vars = count_velocity_vars + 1

print(f'Total run time: {(time.time() - st) / 60}min')
print(f'Total variables for velocity: {count_velocity_vars}') 

Total run time for days since: 1.1661051988601685min
Total variables for days since: 23


In [None]:
# Checkpoint: persist the frame (base columns plus days-since and velocity features)
# as a pickle in the working directory.
vars.to_pickle("vars_velocity_days_since")

In [None]:
# Inspect the feature columns created so far.
vars.columns

In [None]:
#professor removed some elements from attributes, but I have used attributesFull everywhere
# Run this cell once per kernel session: it mutates `attributes` in place, so a
# second run would snapshot the already-reduced list and then fail on the first
# missing element.
attributesFull = attributes.copy()
for dropped in (
    'address',
    'dob',
    'name',
    'ssn_address',
    'homephone',
    'name_homephone',
    'fulladdress_homephone',
    'homephone_name_dob',
    'ssn_homephone',
    'ssn_firstname',
    'ssn_dob',
):
    attributes.remove(dropped)

In [None]:
# Sanity check: the eleven removals above should make this evaluate to 11.
len(attributesFull)- len(attributes)

In [None]:
# Relative velocity: the short-window count (0 or 1 day) divided by the average
# daily count over a longer window (3, 7, 14 or 30 days).
count_relative_velocity_vars = 0
st = time.time()
for t in attributesFull:
    for short_window in (0, 1):
        recent_count = f'{t}_count_{short_window}'
        for long_window in (3, 7, 14, 30):
            daily_average = vars[f'{t}_count_{long_window}'] / float(long_window)
            vars[f'{recent_count}_by_{long_window}'] = vars[recent_count] / daily_average
            count_relative_velocity_vars += 1
print(f'Total run time: {(time.time() - st) / 60}min')
print(f'Total variables for relative velocity: {count_relative_velocity_vars}') 
            

In [None]:
# Checkpoint: persist the frame, now including the relative-velocity ratios.
vars.to_pickle("vars_velocity_days_since_relative_velocity")

In [None]:
# Inspect the feature columns after adding the relative-velocity ratios.
vars.columns

In [None]:
# Applicant age in years at application time (days / 365, so leap days are ignored).
# dob_date was parsed from the raw 'dob' values before the placeholder birth date was
# replaced, so rows with the placeholder still get an age computed from it.
data['age'] = (data['date'] - data['dob_date']).dt.days/365

In [None]:
# Exploratory check: frequency of each per-SSN mean age.
data.groupby('ssn')['age'].mean().value_counts()

In [None]:
#dropping all the columns in vars except record to have more space
# The features built so far were pickled above, so only the 'record' key is kept in memory.
vars = vars[['record']]
vars

In [None]:
# `pickle` is not imported anywhere else in the notebook (the only import is in a
# commented-out reference cell), so import it here to keep the cell runnable on
# a fresh kernel.
import pickle

# Load the (entity, field) pairs used by the unique-count loops below; the dict
# is indexed with 'comb1', 'comb2' and 'comb3'.
# NOTE: unpickling can execute arbitrary code - only load a file you created yourself.
with open("combinations.pickle","rb") as file_to_read:
    combDict = pickle.load(file_to_read)


In [None]:
# Inspect the first batch of (entity, field) pairs.
combDict['comb1']

In [None]:
#count by entities first time ( About 1500 variables)
# For each (entity, field) pair: for every application, count the distinct `field`
# values seen on this and earlier applications that share its `entity` value
# within the trailing window of `offset_t` days.

begin = time.time()
count_count_entities_vars = 0
for (entity,field) in combDict['comb1']:
    st = time.time()
    df_c = data1[['record','date',entity]]
    df_d = data1[['check_record','check_date',entity,field]]
    # Self-join on the entity: one row per pair of applications sharing the value.
    temp = pd.merge(df_c,df_d,on = entity)
    # Loop-invariant work hoisted out of the window loop: the record filter and
    # the date gap do not depend on the window size.
    temp = temp[temp.record >= temp.check_record]
    elapsed = temp.date - temp.check_date
    for offset_t in [1,3,7,14,30,60]:
        # Same condition as check_date >= date - offset_t days.
        count_dat_df = temp[elapsed <= dt.timedelta(offset_t)]
        col_name = f'{entity}_unique_count_for_{field}_{offset_t}'
        mapper = count_dat_df.groupby(['record'])[field].nunique()
        vars[col_name] = vars.record.map(mapper)
        count_count_entities_vars = count_count_entities_vars + 1
    
    print(f'Run time for entity {entity} in field {field}-----------------{time.time() - st}s')

print(f'Total run time{(time.time()-begin)/60} min')
print(f'Total count by entities variables {count_count_entities_vars}')
        

In [None]:
# Checkpoint: persist the first batch of unique-count features.
vars.to_pickle("vars_count_entities_1")

In [None]:
# Keep the record key so the feature frame can be rebuilt after it is deleted.
y = vars['record']

In [None]:
# Drop the wide feature frame now that it has been pickled.
del vars

In [None]:
# Start a fresh feature frame that holds only the record key.
vars = pd.DataFrame({'record':y})

In [None]:
#count by entities second time (About 400 variables)
# For each (entity, field) pair: for every application, count the distinct `field`
# values seen on this and earlier applications that share its `entity` value
# within the trailing window of `offset_t` days.
begin = time.time()
count_count_entities_vars = 0
for (entity,field) in combDict['comb2']:
    st = time.time()
    df_c = data1[['record','date',entity]]
    df_d = data1[['check_record','check_date',entity,field]]
    # Self-join on the entity: one row per pair of applications sharing the value.
    temp = pd.merge(df_c,df_d,on = entity)
    # Loop-invariant work hoisted out of the window loop: the record filter and
    # the date gap do not depend on the window size.
    temp = temp[temp.record >= temp.check_record]
    elapsed = temp.date - temp.check_date
    for offset_t in [1,3,7,14,30,60]:
        # Same condition as check_date >= date - offset_t days.
        count_dat_df = temp[elapsed <= dt.timedelta(offset_t)]
        col_name = f'{entity}_unique_count_for_{field}_{offset_t}'
        mapper = count_dat_df.groupby(['record'])[field].nunique()
        vars[col_name] = vars.record.map(mapper)
        count_count_entities_vars = count_count_entities_vars + 1
    
    print(f'Run time for entity {entity} in field {field}-----------------{time.time() - st}s')

print(f'Total run time{(time.time()-begin)/60} min')
print(f'Total count by entities variables {count_count_entities_vars}')
        

In [None]:
# Checkpoint: persist the second batch of unique-count features.
vars.to_pickle("vars_count_entities_2")

In [None]:
# Shrink the feature frame back to the record key before building the next batch.
vars = pd.DataFrame(vars['record'])
vars

In [None]:
#count by entities third time (About 1100 variables)
# For each (entity, field) pair: for every application, count the distinct `field`
# values seen on this and earlier applications that share its `entity` value
# within the trailing window of `offset_t` days.
begin = time.time()
count_count_entities_vars = 0
for (entity,field) in combDict['comb3']:
    st = time.time()
    df_c = data1[['record','date',entity]]
    df_d = data1[['check_record','check_date',entity,field]]
    # Self-join on the entity: one row per pair of applications sharing the value.
    temp = pd.merge(df_c,df_d,on = entity)
    # Loop-invariant work hoisted out of the window loop: the record filter and
    # the date gap do not depend on the window size.
    temp = temp[temp.record >= temp.check_record]
    elapsed = temp.date - temp.check_date
    for offset_t in [1,3,7,14,30,60]:
        # Same condition as check_date >= date - offset_t days.
        count_dat_df = temp[elapsed <= dt.timedelta(offset_t)]
        col_name = f'{entity}_unique_count_for_{field}_{offset_t}'
        mapper = count_dat_df.groupby(['record'])[field].nunique()
        vars[col_name] = vars.record.map(mapper)
        count_count_entities_vars = count_count_entities_vars + 1
    
    print(f'Run time for entity {entity} in field {field}-----------------{time.time() - st}s')

print(f'Total run time{(time.time()-begin)/60} min')
print(f'Total count by entities variables {count_count_entities_vars}')

In [None]:
# Checkpoint: persist the third batch of unique-count features.
vars.to_pickle("vars_count_entities_3")

In [None]:
#Age variables

In [None]:
# Rebuild the working frame from `data`, which by now includes the 'age' column.
vars = data.copy()

In [None]:
st = time.time()
#ensure vars is sorted
vars = vars.sort_values(by = ['record','date'])

# For each entity key: the applicant's age minus the age on the previous
# application (in sorted order) sharing the same key; the first application
# for a key has no predecessor and gets 0.
cols_age_vars = ['ssn','ssn_homephone','ssn_name','ssn_zip5','name_fulladdress','ssn_address','name_homephone']
for key in cols_age_vars:
    diff_col = 'previous_age_diff_' + key
    previous_age = vars.groupby([key])['age'].shift()
    vars[diff_col] = (vars['age'] - previous_age).fillna(0)
print(f"Time to create these variables {(time.time() - st)/60} min")

In [None]:
# Inspect the columns after adding the age-difference features.
vars.columns

In [None]:
# Persist only the record key, the age, and the age-difference features.
age_feature_columns = [
    'record',
    'age',
    'previous_age_diff_ssn',
    'previous_age_diff_ssn_homephone',
    'previous_age_diff_ssn_name',
    'previous_age_diff_ssn_zip5',
    'previous_age_diff_name_fulladdress',
    'previous_age_diff_ssn_address',
    'previous_age_diff_name_homephone',
]
age_features = vars.filter(items=age_feature_columns)
age_features.to_pickle("vars_age_entities")

#### End

The following commented-out cells are kept for my reference only; they show how `combinations.pickle` was built.

In [None]:
# comb1 = []
# for (entity,field) in permutations(attributesFull,2):
#     for offset_t in [1,3,7,14,30,60]:
#         col_name = f'{entity}_unique_count_for_{field}_{offset_t}'
#         if col_name in cols1:
#             comb1.append((entity,field))

In [None]:
# comb2 = []
# for (entity,field) in permutations(attributesFull,2):
#     for offset_t in [1,3,7,14,30,60]:
#         col_name = f'{entity}_unique_count_for_{field}_{offset_t}'
#         if col_name in cols2:
#             comb2.append((entity,field))

In [None]:
# test3 = pd.read_pickle("vars_count_entities_3")

In [None]:
# cols3 = test3.columns

In [None]:
# comb3 = []
# for (entity,field) in permutations(attributesFull,2):
#     for offset_t in [1,3,7,14,30,60]:
#         col_name = f'{entity}_unique_count_for_{field}_{offset_t}'
#         if col_name in cols3:
#             comb3.append((entity,field))

In [None]:
# import pickle
# combDict = {'comb1':list(set(comb1)),'comb2':list(set(comb2)),'comb3':[i for i in list(set(comb3)) if i!=('ssn_homephone', 'ssn_fulladdress')]}
# file_to_write = open("combinations.pickle", "wb")
# pickle.dump(combDict, file_to_write)

In [None]:
# with open("combinations.pickle","rb") as file_to_read:
#     combDict2 = pickle.load(file_to_read)