# Introduction: Data Creation

In this notebook we create an example dataset to be used for automated feature engineering. I have included this code in the repository for posterity and because at some point it may come in use for generating additional example datasets.

In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import random

rand_dates = []
 
for _ in range(1000):
  
  year = random.choice(range(2000, 2015))
  month = random.choice(range(1, 13))
  day = random.choice(range(1, 29))
  rdate = datetime(year, month, day)
  rand_dates.append(rdate)

In [2]:
clients = pd.DataFrame(columns = ['client_id', 'joined', 'income', 'credit_score'])
for _ in range(25):
  clients = clients.append(pd.DataFrame({'client_id': np.random.randint(25000, 50000, size = 1)[0], 'joined': random.choice(rand_dates),
                           'income': np.random.randint(30500, 240000, size = 1)[0], 'credit_score': np.random.randint(500, 850, size = 1)[0]},
                                        index = [0]), ignore_index = True)

clients.head()

Unnamed: 0,client_id,joined,income,credit_score
0,26579,2014-10-01,94457,597
1,36238,2010-04-04,135768,795
2,46134,2006-02-05,143127,791
3,32364,2004-12-05,186151,769
4,25116,2011-03-12,169048,655


In [8]:
loans = pd.DataFrame(columns = ['client_id', 'loan_type', 'loan_amount', 'repaid',
                                         'loan_id', 'loan_start', 'loan_end', 'rate'])

for client in clients['client_id'].unique():
    for _ in range(20):
        time_created = pd.datetime(np.random.randint(2000, 2015, size = 1)[0],
                               np.random.randint(1, 13, size = 1)[0],
                               np.random.randint(1, 30, size = 1)[0])

    time_ended = time_created + pd.Timedelta(days = np.random.randint(500, 1000, size = 1)[0])

    loans = loans.append(pd.DataFrame({'client_id': client, 'loan_type': random.choice(['cash', 'credit', 'home', 'other']),
                                                         'loan_amount': np.random.randint(500, 15000, size = 1)[0],
                                                         'repaid': random.choice([0, 1]), 
                                                         'loan_id': np.random.randint(10000, 12000, size = 1)[0],
                                                         'loan_start': time_created,
                                                         'loan_end': time_ended,
                                                          'rate': round(abs(4 * np.random.randn(1)[0]), 2)}, index = [0]), ignore_index = True)


In [9]:
loans.head()

Unnamed: 0,client_id,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate
0,26579,other,12869,0,11009,2000-09-14,2002-08-15,3.43
1,36238,credit,2852,1,11962,2006-07-03,2009-01-02,3.77
2,46134,home,1967,1,11985,2012-12-19,2014-11-26,0.55
3,32364,cash,14380,0,10721,2002-02-15,2003-11-13,0.22
4,25116,cash,12833,1,11000,2014-11-20,2016-10-03,0.13


In [10]:
payments = pd.DataFrame(columns = ['loan_id', 'payment_amount', 
                                    'payment_date', 'missed'])

for _, row in loans.iterrows():
  time_created = row['loan_start']
  payment_date = time_created + pd.Timedelta(days = 30)
  loan_amount = row['loan_amount']
  loan_id = row['loan_id']
  payment_id = np.random.randint(10000, 12000, size = 1)[0]
  for _ in range(np.random.randint(5, 10, size = 1)[0]):
    payment_id += 1
    payment_date += pd.Timedelta(days = np.random.randint(10, 50, size = 1)[0])
    payments = payments.append(pd.DataFrame({'loan_id': loan_id, 
                                                               'payment_amount': np.random.randint(int(loan_amount / 10), int(loan_amount / 5), size = 1)[0],
                                                               'payment_date': payment_date, 'missed': random.choice([0, 1])}, index = [0]), ignore_index = True)
    

In [11]:
payments.head()

Unnamed: 0,loan_id,payment_amount,payment_date,missed
0,11009,2189,2000-11-29,1
1,11009,1979,2001-01-08,0
2,11009,2048,2001-02-18,0
3,11009,2050,2001-04-07,0
4,11009,1852,2001-04-25,1


In [12]:
clients = clients.drop_duplicates(subset = 'client_id')
loans = loans.drop_duplicates(subset = 'loan_id')


clients.to_csv('clients.csv', index = False)
loans.to_csv('loans.csv', index = False)
payments.to_csv('payments.csv', index = False)