## Import Data

In [1]:
import pandas as pd
import numpy as np
import random

## Read the sample dataset to work on

In [2]:
# Load the sample prospects file and rename two columns in the same chain
# so the frame reads like bank account information.
df = (
    pd.read_csv("bank_prospects.csv")
    .rename(columns={'Purchased': 'Premium', 'Salary': 'Balance'})
)

df.head()

Unnamed: 0,Age,Balance,Gender,Country,Premium
0,18.0,20000.0,Male,Germany,N
1,19.0,22000.0,Female,France,N
2,20.0,24000.0,Female,England,N
3,21.0,,Male,England,N
4,22.0,50000.0,Male,France,Y


In [3]:
# Show the dtype of each column in the loaded frame
df.dtypes

Age        float64
Balance    float64
Gender      object
Country     object
Premium     object
dtype: object

## Generate random values to increase the size of the data

We re-seed the random number generator for every generated row so that the output is reproducible.

In [None]:
country_array = ["France","Germany","Belgium","England","Spain","Italy","Denmark","Austria","Holland"]
random_data_size = 10000000

# Accumulate each column in a plain list and build the DataFrame once at the end.
# Calling pd.concat inside the loop copies the whole frame on every iteration,
# which is quadratic in random_data_size.
ages, balances, genders, countries, premiums = [], [], [], [], []

for i in range(random_data_size):
    # Re-seeding per row makes row i depend only on i.
    random.seed(52 + i)
    # The order of the draws below determines the generated values; keep it
    # as balance, age, gender, country, premium.
    balances.append(random.randrange(5000, 55000))
    ages.append(random.randrange(18, 75))
    genders.append(random.choice(["Male", "Female"]))
    countries.append(random.choice(country_array))
    premiums.append(random.choice(["N", "Y"]))

# Skip the concat when nothing was generated so df is left untouched.
if ages:
    random_rows = pd.DataFrame({
        'Age': ages,
        'Balance': balances,
        'Gender': genders,
        'Country': countries,
        'Premium': premiums,
    })
    df = pd.concat([df, random_rows], ignore_index=True)
    
    

In [None]:
# Call describe(): the bare attribute only displays the bound method's repr,
# not the summary statistics.
df.describe()

## Insert NaN balance values and unknown country information

In [None]:
country_array = ["France","Germany","Belgium","England","Spain","Italy","Denmark","Austria","Holland"]

# Collect all dirty rows first and append them with a single concat instead of
# copying the whole frame once per inserted row.
dirty_rows = []

# add 10 rows whose balance is missing (NaN)
for i in range(10):
    random.seed(10 + i)
    # Draw order (age, gender, country, premium) determines the values each seed produces.
    age = random.randrange(18, 75)
    gender = random.choice(["Male", "Female"])
    country = random.choice(country_array)
    premium = random.choice(["N", "Y"])
    dirty_rows.append({'Age': age, 'Balance': np.nan, 'Gender': gender, 'Country': country, 'Premium': premium})

# add 10 rows whose country is the placeholder value "unknown"
for i in range(10):
    random.seed(25 + i)
    # Draw order (balance, age, gender, premium) determines the values each seed produces.
    bal = random.randrange(5000, 55000)
    age = random.randrange(18, 75)
    gender = random.choice(["Male", "Female"])
    premium = random.choice(["N", "Y"])
    dirty_rows.append({'Age': age, 'Balance': bal, 'Gender': gender, 'Country': "unknown", 'Premium': premium})

# NaN-balance rows are appended first, then the unknown-country rows.
df = pd.concat([df, pd.DataFrame(dirty_rows)], ignore_index=True)

# Call describe(): the bare attribute only displays the bound method's repr,
# not the summary statistics.
df.describe()

In [None]:
# Shuffle the dataset.
# A fixed random_state keeps the row order, and therefore the written file,
# identical across runs, like the seeded row generation.
df = df.sample(frac=1, random_state=52)

# Call describe(): the bare attribute only displays the bound method's repr,
# not the summary statistics.
df.describe()

## Write the data into "bank_data_{random_data_size}.csv"

In [None]:
# The output file name embeds the number of generated rows; the index is not written.
output_path = f"bank_data_{random_data_size}.csv"
df.to_csv(output_path, index=False)