# From one dataset to many

## Load the data

In [1]:
import pandas as pd
# Read the raw CSV into a DataFrame, then show its (rows, columns) shape
# as a quick check that the load worked.
raw_csv = '../data/fake_beiersdorf_data.csv'
data = pd.read_csv(raw_csv)
data.shape

(5000, 9)

In [2]:
# Preview the first five rows to check column names and value formats.
data.head(5)

Unnamed: 0,Date,Var1,Var2,Var3,Name,Address,Zip-code,City,Country
0,2006-06-24,0.56453,completed,False,La Prairie Group Deutschland GmbH,Lange Straße 65,76530,Baden-BadenHamburg,Germany
1,2007-06-24,0.114768,completed,False,Beiersdorf Manufacturing Hamburg GmbH,Troplowitzstrasse 10,22529,Hamburg,Germany
2,2014-05-27,0.793343,in progress,False,Beiersdorf AG,Unnastrasse 48,20253,Hamburg,Germany
3,2009-12-10,0.785397,completed,True,Beiersdorf Manufacturing Waldheim GmbH,Am Eichberg,4736,Waldheim,Germany
4,2016-01-15,0.360336,in progress,True,La Prairie Group Deutschland GmbH,Lange Straße 65,76530,Baden-BadenHamburg,Germany


## Split the large dataset into many small ones based on the variable `Name`

In [3]:
# The column whose values decide which subset each row ends up in.
data['Name']

0            La Prairie Group Deutschland GmbH
1        Beiersdorf Manufacturing Hamburg GmbH
2                                Beiersdorf AG
3       Beiersdorf Manufacturing Waldheim GmbH
4            La Prairie Group Deutschland GmbH
                         ...                  
4995         La Prairie Group Deutschland GmbH
4996         La Prairie Group Deutschland GmbH
4997           Beiersdorf Customer Supply GmbH
4998           Beiersdorf Shared Services GmbH
4999                             Beiersdorf AG
Name: Name, Length: 5000, dtype: object

In [4]:
# Distinct values of the Name column.
data['Name'].unique()

array(['La Prairie Group Deutschland GmbH',
       'Beiersdorf Manufacturing Hamburg GmbH', 'Beiersdorf AG',
       'Beiersdorf Manufacturing Waldheim GmbH',
       'Beiersdorf Shared Services GmbH',
       'Beiersdorf Manufacturing Berlin GmbH',
       'Beiersdorf Customer Supply GmbH'], dtype=object)

In [5]:
# Count of distinct values in the Name column.
data['Name'].nunique()

7

In [6]:
# Walk over every company name and select its rows. Nothing is stored yet:
# once the loop finishes, `subset` holds only the rows of the last name visited.
company_names = data['Name'].unique()
for name in company_names:
    print(f'Computing {name} ...')
    is_company = data['Name'] == name
    subset = data[is_company]

Computing La Prairie Group Deutschland GmbH ...
Computing Beiersdorf Manufacturing Hamburg GmbH ...
Computing Beiersdorf AG ...
Computing Beiersdorf Manufacturing Waldheim GmbH ...
Computing Beiersdorf Shared Services GmbH ...
Computing Beiersdorf Manufacturing Berlin GmbH ...
Computing Beiersdorf Customer Supply GmbH ...


In [7]:
# Spot-check up to ten random rows of `subset`.
# random_state pins the draw so the preview is identical on every re-run;
# min() keeps the call valid when the subset has fewer than ten rows.
subset.sample(min(10, len(subset)), random_state=42)

Unnamed: 0,Date,Var1,Var2,Var3,Name,Address,Zip-code,City,Country
4059,2015-08-27,0.428914,in progress,False,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
145,2007-05-08,0.154952,in progress,False,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
26,2012-02-06,0.520463,in progress,True,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
3095,2006-07-12,0.571239,completed,True,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
2149,2017-11-11,0.18807,in progress,False,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
4296,2011-12-09,0.873076,in progress,True,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
4994,2018-06-09,0.868741,completed,True,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
3474,2011-03-25,0.928925,in progress,False,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
2749,2018-05-04,0.237094,completed,False,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
549,2016-11-10,0.426424,in progress,True,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany


In [None]:
import os

# Write one CSV per company into ../data/interim/.
out_dir = os.path.join('..', 'data', 'interim')
# to_csv does not create missing folders, so make sure the target exists
# (exist_ok=True makes re-running the cell harmless).
os.makedirs(out_dir, exist_ok=True)
for name in data.Name.unique():
    print(f'Computing {name} ...')
    subset = data.loc[data.Name == name]
    # Spaces in the company name become underscores in the file name.
    fname = f'subset_{name.replace(" ", "_")}.csv'
    filepath = os.path.join(out_dir, fname)
    # index=False keeps the row labels out of the file.
    subset.to_csv(filepath, index=False)

***

# From many to one

In [None]:
# List the files in the interim folder. os.listdir replaces the shell's `ls`
# so the cell does not depend on that command being available on the host OS;
# sorting gives a stable listing.
sorted(os.listdir(os.path.join('..', 'data', 'interim')))

In [None]:
import glob

In [None]:
# Collect the per-company files from the interim folder.
# The pattern is limited to `subset_*.csv` so unrelated CSVs in that folder
# are not picked up, and the result is sorted because glob returns matches
# in arbitrary order (which would make the combined row order vary by run).
filelist = sorted(glob.glob(os.path.join('..', 'data', 'interim', 'subset_*.csv')))
filelist

In [None]:
# Read each interim file into its own frame, then stack them into one table.
# The frames are not bound to the name `data`, so the full dataset loaded at
# the top of the notebook is not overwritten by the last file read.
if not filelist:
    # Without this guard pd.concat fails with "No objects to concatenate".
    raise ValueError('No interim CSV files found; run the split step first.')
dfs = [pd.read_csv(path) for path in filelist]
# ignore_index=True renumbers the rows 0..n-1; otherwise each file restarts
# at 0 and the combined frame carries duplicate index labels.
fulldata = pd.concat(dfs, ignore_index=True)
fulldata.shape

In [None]:
# Spot-check up to ten random rows of the recombined data.
# random_state pins the draw so the preview is identical on every re-run;
# min() keeps the call valid when fewer than ten rows are present.
fulldata.sample(min(10, len(fulldata)), random_state=42)

***