# From one dataset to many

**Load the data**

In [1]:
import pandas as pd
data = pd.read_csv('../data/fake_beiersdorf_data.csv')
data.shape

(5000, 9)

In [2]:
data.head()

Unnamed: 0,Date,Var1,Var2,Var3,Name,Address,Zip-code,City,Country
0,2011-11-20,0.764554,completed,True,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
1,2006-11-27,0.380247,in progress,False,Beiersdorf Shared Services GmbH,Quickbornstrasse 24,20253,Hamburg,Germany
2,2007-09-29,0.807108,completed,False,Beiersdorf Customer Supply GmbH,Unnastrasse 48,20253,Hamburg,Germany
3,2006-09-05,0.649908,in progress,True,Beiersdorf Manufacturing Hamburg GmbH,Troplowitzstrasse 10,22529,Hamburg,Germany
4,2014-12-12,0.951206,in progress,True,Beiersdorf Shared Services GmbH,Quickbornstrasse 24,20253,Hamburg,Germany


## Split large dataset into many small ones based on the variable `Name`

In [3]:
data.Name

0              Beiersdorf Customer Supply GmbH
1              Beiersdorf Shared Services GmbH
2              Beiersdorf Customer Supply GmbH
3        Beiersdorf Manufacturing Hamburg GmbH
4              Beiersdorf Shared Services GmbH
5              Beiersdorf Customer Supply GmbH
6              Beiersdorf Customer Supply GmbH
7              Beiersdorf Shared Services GmbH
8            La Prairie Group Deutschland GmbH
9              Beiersdorf Shared Services GmbH
10                               Beiersdorf AG
11      Beiersdorf Manufacturing Waldheim GmbH
12                               Beiersdorf AG
13      Beiersdorf Manufacturing Waldheim GmbH
14           La Prairie Group Deutschland GmbH
15       Beiersdorf Manufacturing Hamburg GmbH
16             Beiersdorf Customer Supply GmbH
17      Beiersdorf Manufacturing Waldheim GmbH
18        Beiersdorf Manufacturing Berlin GmbH
19        Beiersdorf Manufacturing Berlin GmbH
20             Beiersdorf Customer Supply GmbH
21           

In [4]:
data.Name.unique()

array(['Beiersdorf Customer Supply GmbH',
       'Beiersdorf Shared Services GmbH',
       'Beiersdorf Manufacturing Hamburg GmbH',
       'La Prairie Group Deutschland GmbH', 'Beiersdorf AG',
       'Beiersdorf Manufacturing Waldheim GmbH',
       'Beiersdorf Manufacturing Berlin GmbH'], dtype=object)

In [5]:
data.Name.nunique()

7

In [6]:
for name in data.Name.unique():
    print(f'Computing {name} ...')
    subset = data.loc[data.Name == name]

Computing Beiersdorf Customer Supply GmbH ...
Computing Beiersdorf Shared Services GmbH ...
Computing Beiersdorf Manufacturing Hamburg GmbH ...
Computing La Prairie Group Deutschland GmbH ...
Computing Beiersdorf AG ...
Computing Beiersdorf Manufacturing Waldheim GmbH ...
Computing Beiersdorf Manufacturing Berlin GmbH ...


In [7]:
subset.sample(10)

Unnamed: 0,Date,Var1,Var2,Var3,Name,Address,Zip-code,City,Country
2806,2010-05-28,0.843268,in progress,True,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
3729,2010-03-19,0.319698,in progress,False,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
1292,2010-10-06,0.06506,in progress,False,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
113,2016-01-01,0.806292,in progress,False,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
2991,2011-01-15,0.410043,in progress,False,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
4466,2017-04-08,0.464796,completed,False,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
3466,2011-12-18,0.655877,completed,False,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
581,2018-08-26,0.473276,completed,False,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
1239,2014-02-08,0.879735,completed,True,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany
4425,2009-08-11,0.135796,completed,False,Beiersdorf Manufacturing Berlin GmbH,Franklinstrasse 1,10587,Berlin,Germany


In [8]:
import os
for name in data.Name.unique():
    print(f'Computing {name} ...')
    subset = data.loc[data.Name == name]
    fname = f'subset_{name.replace(" ", "_")}.csv'
    filepath = os.path.join('..', 'data', 'interim', fname)
    subset.to_csv(filepath, index=False)

Computing Beiersdorf Customer Supply GmbH ...
Computing Beiersdorf Shared Services GmbH ...
Computing Beiersdorf Manufacturing Hamburg GmbH ...
Computing La Prairie Group Deutschland GmbH ...
Computing Beiersdorf AG ...
Computing Beiersdorf Manufacturing Waldheim GmbH ...
Computing Beiersdorf Manufacturing Berlin GmbH ...


***

# From many to one

In [9]:
!ls ../data/interim/

subset_Beiersdorf_AG.csv
subset_Beiersdorf_Customer_Supply_GmbH.csv
subset_Beiersdorf_Manufacturing_Berlin_GmbH.csv
subset_Beiersdorf_Manufacturing_Hamburg_GmbH.csv
subset_Beiersdorf_Manufacturing_Waldheim_GmbH.csv
subset_Beiersdorf_Shared_Services_GmbH.csv
subset_La_Prairie_Group_Deutschland_GmbH.csv


In [10]:
import glob

In [11]:
filelist = glob.glob(os.path.join('..', 'data', 'interim', '*.csv'))
filelist

['../data/interim/subset_La_Prairie_Group_Deutschland_GmbH.csv',
 '../data/interim/subset_Beiersdorf_Manufacturing_Berlin_GmbH.csv',
 '../data/interim/subset_Beiersdorf_Customer_Supply_GmbH.csv',
 '../data/interim/subset_Beiersdorf_AG.csv',
 '../data/interim/subset_Beiersdorf_Manufacturing_Waldheim_GmbH.csv',
 '../data/interim/subset_Beiersdorf_Shared_Services_GmbH.csv',
 '../data/interim/subset_Beiersdorf_Manufacturing_Hamburg_GmbH.csv']

In [12]:
dfs = []
for f in filelist:
    data = pd.read_csv(f)
    dfs.append(data)
fulldata = pd.concat(dfs)
fulldata.shape

(5000, 9)

In [13]:
fulldata.sample(10)

Unnamed: 0,Date,Var1,Var2,Var3,Name,Address,Zip-code,City,Country
57,2017-08-23,0.497905,completed,False,Beiersdorf Manufacturing Hamburg GmbH,Troplowitzstrasse 10,22529,Hamburg,Germany
95,2016-06-24,0.929574,in progress,True,Beiersdorf Shared Services GmbH,Quickbornstrasse 24,20253,Hamburg,Germany
268,2013-04-05,0.96449,completed,True,Beiersdorf AG,Unnastrasse 48,20253,Hamburg,Germany
436,2018-07-22,0.034841,in progress,False,Beiersdorf AG,Unnastrasse 48,20253,Hamburg,Germany
453,2016-01-22,0.6,in progress,False,Beiersdorf Shared Services GmbH,Quickbornstrasse 24,20253,Hamburg,Germany
223,2016-12-25,0.128373,in progress,True,La Prairie Group Deutschland GmbH,Lange Straße 65,76530,Baden-BadenHamburg,Germany
52,2015-07-04,0.454127,completed,False,Beiersdorf Manufacturing Hamburg GmbH,Troplowitzstrasse 10,22529,Hamburg,Germany
3,2017-02-15,0.628261,completed,True,Beiersdorf Shared Services GmbH,Quickbornstrasse 24,20253,Hamburg,Germany
328,2018-06-06,0.488463,completed,False,Beiersdorf Shared Services GmbH,Quickbornstrasse 24,20253,Hamburg,Germany
532,2017-07-17,0.564967,in progress,False,Beiersdorf Manufacturing Hamburg GmbH,Troplowitzstrasse 10,22529,Hamburg,Germany


***