# Tabular data examples

In [None]:
import numpy as np
import pandas as pd

## Generate data: `from one to many`

In [None]:
def random_dates(start, end, n=10):
    """Draw ``n`` random timestamps from the half-open interval ``[start, end)``.

    The timestamps are sampled uniformly at one-second resolution.

    Parameters
    ----------
    start, end : pandas.Timestamp
        Interval bounds. Only their ``.value`` attribute (nanoseconds since
        the Unix epoch) is read. After truncation to whole seconds ``start``
        must be strictly earlier than ``end``; otherwise
        ``np.random.randint`` raises ``ValueError``.
    n : int, default 10
        Number of timestamps to draw.

    Returns
    -------
    pandas.DatetimeIndex
        ``n`` timestamps, in the order they were drawn (not sorted).
    """
    # ``Timestamp.value`` is expressed in nanoseconds; work in whole seconds.
    start_u = start.value // 10**9
    end_u = end.value // 10**9

    # Ask for 64-bit integers explicitly: where the platform default integer
    # is 32-bit (e.g. Windows with NumPy < 2) epoch seconds beyond the year
    # 2038 would otherwise be rejected as out of bounds.
    seconds = np.random.randint(start_u, end_u, n, dtype=np.int64)
    return pd.to_datetime(seconds, unit='s')

In [None]:
# Every calendar year from 1986 through 2020 (np.arange excludes the stop value 2021).
date_range = np.arange(1986, 2021)
date_range

In [None]:
# Prototype of the segmentation logic: draw a random number of segments
# (lower bound inclusive, upper bound exclusive), spread the segment
# boundaries evenly over 1986-2020 and print each (first year, last year) pair.
min_max_break_points=(3,9)
break_points = np.random.randint(*min_max_break_points)
# linspace yields floats; int() truncates them to whole years.
date_range = list(map(int, np.linspace(1986, 2020, break_points+1)))
for break_point in range(break_points):
    # Two neighbouring boundaries delimit one segment.
    print(*date_range[break_point:break_point+2])

In [None]:
# Display the year boundaries currently stored in date_range.
date_range

In [None]:
# Display date_range once more (same inspection as the previous cell).
date_range

In [None]:
# Display the number of segments currently stored in break_points.
break_points

In [None]:
# Sanity check: 3 segments need 3 + 1 evenly spaced boundaries between 1986
# and 2021; int() truncates each boundary to a whole year.
[int(x) for x in np.linspace(1986, 2021, 3+1)]

In [None]:
def random_dataset(min_max_rows = (100, 5000), min_max_break_points=(3,9)):
    """Build a random table stitched together from several time segments.

    The years 1986-2020 are split by evenly spaced boundary years into a
    random number of segments. Each segment gets its own random row count and
    its own random class probabilities for the two categorical columns, so
    the column distributions change from one segment to the next.

    Parameters
    ----------
    min_max_rows : tuple of int, default (100, 5000)
        Bounds handed to ``np.random.randint`` for the number of rows of each
        segment (lower bound inclusive, upper bound exclusive).
    min_max_break_points : tuple of int, default (3, 9)
        Bounds handed to ``np.random.randint`` for the number of segments
        (lower bound inclusive, upper bound exclusive).

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``Estimate``, ``Status`` and ``Success`` with a
        fresh ``0..len-1`` index; rows are grouped segment by segment.
    """
    n_segments = np.random.randint(min_max_break_points[0], min_max_break_points[1])
    # n_segments + 1 boundary years, truncated to integers.
    years = [int(year) for year in np.linspace(1986, 2020, n_segments+1)]

    segments = []
    for first_year, last_year in zip(years[:-1], years[1:]):
        n_rows = np.random.randint(min_max_rows[0], min_max_rows[1])

        # A segment runs from Jan 1 of its first boundary year to Dec 31 of
        # its last one, so neighbouring segments share one calendar year.
        window_start = pd.to_datetime(f'{first_year}-01-01')
        window_end = pd.to_datetime(f'{last_year}-12-31')
        dates = random_dates(window_start, window_end, n=n_rows)

        estimates = np.random.random(n_rows)

        # Each categorical column uses its own freshly drawn probability.
        p_status = np.random.random()
        statuses = np.random.choice(
            ['in progress', 'completed'], size=n_rows, p=(p_status, 1-p_status)
        )
        p_success = np.random.random()
        successes = np.random.choice(
            [True, False], size=n_rows, p=(p_success, 1-p_success)
        )

        segments.append(
            pd.DataFrame({
                'Date': dates,
                'Estimate': estimates,
                'Status': statuses,
                'Success': successes,
            })
        )

    combined = pd.concat(segments)
    return combined.reset_index(drop=True)

In [None]:
# Smoke test for random_dates: draw its default number of timestamps
# between the two dates below.
start = pd.to_datetime('2015-01-01')
end = pd.to_datetime('2021-06-10')
random_dates(start, end)

In [None]:
# Generate one random table with the default settings and display it.
rand_table = random_dataset()
rand_table

In [None]:
# Master data for the company sites. The lists are parallel: position i of
# every list describes the same site.
name = ['Beiersdorf AG', 'Beiersdorf Customer Supply GmbH', 'Beiersdorf Manufacturing Berlin GmbH',
       'Beiersdorf Manufacturing Hamburg GmbH', 'Beiersdorf Manufacturing Waldheim GmbH',"Beiersdorf Shared Services GmbH",
       "La Prairie Group Deutschland GmbH", "WINGMAN-STUDIOS GmbH"]
address = ['Unnastrasse 48', 'Unnastrasse 48', 'Franklinstrasse 1', 'Troplowitzstrasse 10', 'Am Eichberg',
          'Quickbornstrasse 24', 'Lange Straße 65', 'Troplowitzstrasse 10']
zip_code = ['20253', '20253', '10587', '22529', '04736', '20253', '76530', '22529']
# Every entry needs its own comma: adjacent string literals are silently
# concatenated by Python, which would shorten the list by one element.
city = ['Hamburg', 'Hamburg', 'Berlin', 'Hamburg', 'Waldheim', 'Hamburg', 'Baden-Baden', 'Hamburg']
country = ['Germany'] * len(city)

# zip() stops at the shortest input, so a list that is one entry short would
# drop a site without any error. Fail loudly instead.
if len({len(name), len(address), len(zip_code), len(city), len(country)}) != 1:
    raise ValueError('name, address, zip_code, city and country must have the same length')

# One (name, address, zip code, city, country) tuple per site, plus the
# column labels under which these values are stored.
headquaters = list(zip(name, address, zip_code, city, country))
headquater_columns=['Name', 'Address', 'Zip-code', 'City', 'Country']

In [None]:
# Build the "one to many" data set: one independent random table per site,
# each row tagged with the (constant) master data of its site.
dfs = []
for headquater in headquaters:
    rand_table = random_dataset()
    # Add one constant column per master-data field.
    for column_name, column_value in zip(headquater_columns, headquater):
        rand_table[column_name] = column_value
    dfs.append(rand_table)

# Every per-site table is indexed from 0, so sorting the stacked frame by
# index interleaves the rows of the different sites before renumbering.
stacked = pd.concat(dfs)
data = stacked.sort_index().reset_index(drop=True)
data.shape

In [None]:
# Display the first ten rows of the combined table.
data.head(10)

In [None]:
# Display ten randomly sampled rows of the combined table.
data.sample(10)

In [None]:
# Export the combined table twice, without the row index: once with ';' as
# the field separator and once with the default ','.
csv_exports = {
    '../data/fake_beiersdorf_data_german.csv': {'sep': ';'},
    '../data/fake_beiersdorf_data.csv': {},
}
for csv_path, csv_options in csv_exports.items():
    data.to_csv(csv_path, index=False, **csv_options)

> Note: The data set is generated randomly; running the notebook again may alter downstream analysis.