# Generating an artificial dataset
## Jose M Albornoz
### December 2018

This notebook generates an artificial dataset for future experimentation; the idea is to have greater control over the features and their correlations.

In [1]:
import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt
# from matplotlib import style
# from matplotlib import cm
# style.use('fivethirtyeight')
# %matplotlib inline

import random
import time

from operator import add, sub

# maximum number of dataframe rows and columns displayed
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# disable pandas' chained-assignment (SettingWithCopy) warning
pd.options.mode.chained_assignment = None

# Seed BOTH random generators used in this notebook: Python's `random`
# drives the blood-type shuffle, while NumPy's global generator drives every
# np.random.uniform / np.random.normal draw. Seeding only `random` would
# leave the numeric features different on every run.
RANDOM_STATE = 801
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# 1.- Number of data items

In [2]:
# Number of rows (data items) to generate; the feature arrays below are all sized with N.
N = 1000

# 2.- Define blood types

In [3]:
# Weighted pool of blood types: 'A' and 'O' appear three times each, 'AB' and
# 'B' once each, so repeating this list gives a 3:3:1:1 mix.
bloodTypes = ['A', 'A', 'A', 'O', 'O', 'O', 'AB', 'B']

In [4]:
# Repeat the weighted pool often enough to cover N items (ceiling division),
# then trim to exactly N so this column always has the same length as the
# other features, even when N is not a multiple of len(bloodTypes).
# For N = 1000 the resulting list is identical to a plain 125-fold repeat.
bloodType = (bloodTypes * -(-N // len(bloodTypes)))[:N]

In [5]:
from random import shuffle

# Randomise the order of the blood-type list in place (two passes).
for _ in range(2):
    shuffle(bloodType)

# 3.- Define age

In [6]:
# Lower bound passed as `low` to the uniform age draw.
age_min = 18

In [7]:
# Upper bound passed as `high` to the uniform age draw.
age_max = 65

In [8]:
# N ages drawn uniformly between age_min and age_max (continuous values).
age = np.random.uniform(age_min, age_max, N)

# 3.- Define healthy eating variable

In [9]:
# Raw healthy-eating scores: N draws from a normal distribution (mean 5, std 2).
healthy_eating = np.random.normal(5, 2, N)

In [10]:
# Round every score to the nearest whole number; the result is a plain list.
healthy_eating = list(map(round, healthy_eating))

In [11]:
# Clamp every score to the 0..9 scale, updating the list in place.
healthy_eating[:] = [min(max(score, 0), 9) for score in healthy_eating]

# 3.- Define active lifestyle variable

In [12]:
# Raw active-lifestyle scores: N draws from a normal distribution (mean 5, std 2).
active_lifestyle = np.random.normal(5, 2, N)

In [13]:
# Round every score to the nearest whole number; the result is a plain list.
active_lifestyle = list(map(round, active_lifestyle))

In [14]:
# introduce non-linearity: rows whose age is below 30 get one extra
# active-lifestyle point
under_30 = [idx for idx in range(len(active_lifestyle)) if age[idx] < 30]
for idx in under_30:
    active_lifestyle[idx] += 1

In [15]:
# Clamp every score to the 0..9 scale (the age bump above can push a value
# past 9), updating the list in place.
active_lifestyle[:] = [min(max(level, 0), 9) for level in active_lifestyle]

# 4.- Define salary variable

In [16]:
# Base salary: a fixed 20000 plus a quadratic age term, (3 * age) squared.
salary = [20000 + (3.0 * years) ** 2 for years in age]

In [17]:
# Healthy eating raises salary: 500 per point on the healthy_eating scale.
bonus = [500 * score for score in healthy_eating]
salary = [base + extra for base, extra in zip(salary, bonus)]

In [18]:
# Active lifestyle lowers salary: 300 per point on the active_lifestyle scale.
penalty = [300 * level for level in active_lifestyle]
salary = [base - cut for base, cut in zip(salary, penalty)]

In [19]:
# add noise: a uniform random amount between 0 and 5000 on top of each salary
noise = np.random.uniform(0, 5000, N)
salary = [amount + jitter for amount, jitter in zip(salary, noise)]

# 5.- Create dataframe and save data

In [20]:
# Assemble all generated features into one dataframe; `id` is a running
# row number from 0 to N-1.
df = pd.DataFrame(dict(
    id=range(N),
    bloodType=bloodType,
    age=age,
    healthy_eating=healthy_eating,
    active_lifestyle=active_lifestyle,
    salary=salary,
))

In [21]:
# Preview the first rows of the generated dataframe.
df.head()

Unnamed: 0,id,bloodType,age,healthy_eating,active_lifestyle,salary
0,0,A,33.741012,5.0,7.0,34099.525572
1,1,O,46.110061,4.0,6.0,43770.999909
2,2,B,37.318695,4.0,3.0,37496.334083
3,3,O,19.069138,9.0,3.0,28278.922197
4,4,O,47.405131,5.0,5.0,45005.237724


In [22]:
# Write the dataset to disk; index=False leaves the dataframe index out of
# the file (the `id` column already numbers the rows).
df.to_csv(path_or_buf='artificial_data.csv', index=False)