In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Combine and Merging Data Sets
Data contained in Pandas objects can be combined together in a number of built-in ways:
 - `pandas.merge` connects rows in `DataFrame` base on one or more keys. This will be familiar to users of SQL or other relational database, as it implement database `join` operations.
 
 
 - `pandas.concat` glues or stack together objects along an axis.
 
 
 - `combine_first` instance method enables splicing together overlapping data to fill in missing values in on object with values from another.

### Database-style DataFrame Merge

`Merge` or `join` operations combine data set by linking row using on or more *`keys`*.

In [12]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
print('df1 => \n', df1)
print('df2 => \n', df2)

df1 => 
    data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
df2 => 
    data2 key
0      0   a
1      1   b
2      2   d


**If not specified, `merge` uses the overlapping columns names as the keys.**

In [14]:
pd.merge(df1, df2, on = 'key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


**If the columns names are difference in each object, you can specify them separately:**

In [17]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 
                 'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})

By default merge does an `inner` join; the keys in he result are the intersection.

Other possible options are `left` and `right` and `outer`.

The `Outer` join takes the union of the keys.

In [18]:
pd.merge(left = df3, right = df4, left_on = 'lkey', right_on = 'rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


In [19]:
pd.merge(left = df3, right = df4, how = 'outer', 
         left_on = 'lkey', right_on = 'rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0.0,b,1.0,b
1,1.0,b,1.0,b
2,6.0,b,1.0,b
3,2.0,a,0.0,a
4,4.0,a,0.0,a
5,5.0,a,0.0,a
6,3.0,c,,
7,,,2.0,d


In [23]:
# Many-to-many
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'], 'data2': range(5)})
print('df1:\n', df1)
print('df2:\n', df2)

df1:
    data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
df2:
    data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d


In [22]:
# Many-to-many joins from the Cartesian product of the rows.
pd.merge(left = df1, right = df2, on = 'key', how = 'left')

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,0,b,3.0
2,1,b,1.0
3,1,b,3.0
4,2,a,0.0
5,2,a,2.0
6,3,c,
7,4,a,0.0
8,4,a,2.0
9,5,b,1.0


In [26]:
pd.merge(df1,df2, how = 'inner').sort_values(by = 'key')

Unnamed: 0,data1,key,data2
6,2,a,0
7,2,a,2
8,4,a,0
9,4,a,2
0,0,b,1
1,0,b,3
2,1,b,1
3,1,b,3
4,5,b,1
5,5,b,3


### To merge with multiple keys, pass a list of column names:

In [29]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})

right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one','one', 'two'], 
                   'rval': [4, 5, 6, 7]})
print('Left data :\n', left)
print('Right data :\n', right)

Left data :
   key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3
Right data :
   key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7


In [30]:
pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


`Merge` has a `suffixes` options for specifying string to append to overlapping names in the left and right DataFrame objects:

In [31]:
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [33]:
pd.merge(left, right, on = 'key1', suffixes = ('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


### Merging on Index

`left_index = True` or `right_index = True`(or both) to indicate that the index should be used as the merge key

In [3]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})

right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

In [5]:
print('left1:\n', left1)
print('right1:\n', right1)

left1:
   key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
right1:
    group_val
a        3.5
b        7.0


In [10]:
# intersect the join keys
print('inner:\n', pd.merge(left1, right1, left_on = 'key', right_index = True))
print('outer:\n', pd.merge(left1, right1, left_on = 'key', 
                           right_index = True, how = 'outer'))

inner:
   key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
outer:
   key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
5   c      5        NaN


In [12]:
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})

righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])

print('lefth:\n', lefth)
print('right:\n', righth)

lefth:
    data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002
3   3.0  Nevada  2001
4   4.0  Nevada  2002
right:
              event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11


In [16]:
# set columns to DataFrame's index 
# `reset_index`, does the opposite of `set_index`
lefth.set_index(['key1', 'key2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,data
key1,key2,Unnamed: 2_level_1
Ohio,2000,0.0
Ohio,2001,1.0
Ohio,2002,2.0
Nevada,2001,3.0
Nevada,2002,4.0


In [17]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index = True)

Unnamed: 0,data,key1,key2,event1,event2
0,0.0,Ohio,2000,4,5
0,0.0,Ohio,2000,6,7
1,1.0,Ohio,2001,8,9
2,2.0,Ohio,2002,10,11
3,3.0,Nevada,2001,0,1


### Use the indexes of both side of the merge

In [19]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], 
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], 
                   columns=['Missouri', 'Alabama'])
print('left2:\n', left2)
print('right2:\n', right2)

left2:
    Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
right2:
    Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0


In [21]:
pd.merge(left2, right2, how = 'outer', 
         left_index = True, right_index = True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


### `DataFrame` has a more convenient `join` for merging by index.

In [23]:
# join by index
left2.join(other = right2, how = 'outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [24]:
left1.join(right1, on = 'key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [26]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]], 
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])

In [27]:
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


### Concatenating Along an Axis

Another kind of data combination operation is alternatively referred to as concatenation, binding, or stacking.

In [31]:
arr = np.arange(12).reshape((3, 4))
print('row bind:\n', np.concatenate([arr, arr], axis = 0))
print('column bind:\n', np.concatenate([arr, arr], axis = 1))

row bind:
 [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
column bind:
 [[ 0  1  2  3  0  1  2  3]
 [ 4  5  6  7  4  5  6  7]
 [ 8  9 10 11  8  9 10 11]]


In [8]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
print('s1:\n', s1)
print('s2:\n', s2)
print('s3:\n', s3)

s1:
 a    0
b    1
dtype: int64
s2:
 c    2
d    3
e    4
dtype: int64
s3:
 f    5
g    6
dtype: int64


By default `concat` works along `axis = 0` producing another Series. If you pass `axis = 1`, the result will instead be a `DataFrame`(axis = 1 is the columns)

In [9]:
print('axis = 0 \n', pd.concat([s1, s2, s3]))
print('axis = 1 \n', pd.concat([s1, s2, s3], axis = 1))

axis = 0 
 a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
axis = 1 
      0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0


In [10]:
s4 = pd.concat([s1 * 5, s3])
pd.concat([s1, s4], axis =1) # default join method is outer

Unnamed: 0,0,1
a,0.0,0
b,1.0,5
f,,5
g,,6


You can even specify the axes to be used on the other axes woth `join_axis`

In [11]:
pd.concat([s1, s4], axis = 1, join_axes = [['a', 'c', 'b', 'e']])

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,5.0
e,,


**One issue is that the concatenated pieces aren't identidiable in the result.**

**Suppose instead you wanted to create a hierarchical index on the concatentation axis. To do this, use `keys` argument:**

In [12]:
result = pd.concat([s1, s1, s3], keys = ['one', 'two', 'three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [13]:
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [14]:
# if you don't pass the `key` argument, the column name is range(ncol).
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [2]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])

In [6]:
pd.concat([df1, df2], axis = 1, keys=['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


If you pass a dict og object instead of a list, the dict's key will be used for the `keys` option.

In [17]:
pd.concat({'level1': df1, 'level2': df2}, axis = 1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [27]:
new_df = pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'], 
                   names = ['upper', 'lower'])
new_df

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [25]:
print(new_df['level1']['one'])
new_df.columns.levels # check the column's level 

a    0
b    2
c    4
Name: one, dtype: int32


FrozenList([['level1', 'level2'], ['four', 'one', 'three', 'two']])

A last consideration concerns DataFrames in which the row index is not meaningful in the context of the analysis

In [29]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns = ['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns = ['b', 'd', 'a'])
print('df1:\n', df1)
print('df2:\n', df2)

df1:
           a         b         c         d
0  1.014075 -0.350096  1.061990  0.796465
1  0.610368  0.201695 -1.153115 -1.531511
2  1.182639  0.677872 -0.167538 -0.773363
df2:
           b         d         a
0 -0.614755 -0.983946 -0.578164
1 -0.960212 -0.397929 -0.439282


In [30]:
pd.concat([df1, df2], ignore_index = True)

Unnamed: 0,a,b,c,d
0,1.014075,-0.350096,1.06199,0.796465
1,0.610368,0.201695,-1.153115,-1.531511
2,1.182639,0.677872,-0.167538,-0.773363
3,-0.578164,-0.614755,,-0.983946
4,-0.439282,-0.960212,,-0.397929


### Combining Data with Overlap

In [36]:
a = pd.Series(data = [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
             index = ['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype = np.float64), 
             index = ['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan
print('a:\n', a)
print('b:\n', b)

a:
 f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
b:
 f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64


`Series` has a `combine_first` method, which perform the equivalent of this operation plus data alignment

**Docstring: Combine two DataFrame objects and default to non-null values in frame calling the method. Result index columns will be the union of the respective indexes and columns.**

In [38]:
b[:-2].combine_first(a[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [45]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan], 
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': list(range(2, 16, 4))})

df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})

In [48]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


### Reshaping and Pivoting

Hierarchical indexing provides a consistent way to rerrange data in a DataFrame.

There are two primary action:

- `stack`: this "rotates" or privots __*from the columns in the data to the rows.*__

- `unstack`: this privots __*from the rows into the columns*.__

In [52]:
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index = pd.Index(['Ohio', 'Colorado'], name = 'state'),
                    columns = pd.Index(['one', 'two', 'three'], name='number'))
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


Using the `stack` method on this data pivots the columns into the rows, producing a Series

In [67]:
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [71]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [60]:
result.unstack('state') # index to columns

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [3]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

Unstacking might introduce missing data if all the values in the level aren't found in each subgroups

In [75]:
# default level 1 index to be columns
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


Stacking filter out missing data by default, so the operation is easily invertible

In [78]:
data2.unstack().stack(dropna = False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

When unstacking in a DataFrame, the level unstacked becomes the lowest level in the result:

In [84]:
df = pd.DataFrame({'left': result, 'right': result + 5},
                    columns = pd.Index(['left', 'right'], name = 'side'))
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [85]:
df.unstack('state') # `state` as the column names

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [86]:
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


### Data Transformation

#### Remove duplicates

In [6]:
data = pd.DataFrame({'k1': ['one']*3 + ['two']*4,
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


_**The DataFrame method `duplicated` returns a boolean Series indicating whether each row is a duplicate or not.**_

__The method `drop_duplicates` returns a DataFrame where the duplicated array is True.__

In [18]:
print(data.duplicated())
print(data.drop_duplicates())

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4


***Both of these method by default consider all of the columns; alternatively you can specify any subset of them to detect duplicates.***

In [19]:
data['v1'] = range(7)
data.drop_duplicates(subset = ['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


`duplicated` and `drop_duplicates` by default keep the first observed value combination.

In [20]:
data.drop_duplicates(subset = ['k1', 'k2'], keep = 'last')

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping

In [41]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


Suppose you wanted to add column indicating the type of animal that each food come from.Let's write down a mapping of each distinct meat type to the kind of animal.

_***Using map is a convenient way to perform element-wise transformations and other data cleaning-related opeartion.**_

In [42]:
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

In [43]:
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


Could also have passed a function that does all the work

In [26]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### Replacing Value

Filling in missing data with `fillna` method can be thought of as a special case of more general value replacement.

While map, as you have seen above, can be used to modify a subset of values in an object, `replace` provides a simpler and more flexible way to do.

In [48]:
data = pd.Series(data = [1., -999., 2., -999, -1000, 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [49]:
print(data.replace(to_replace = [-999., -1000], value = np.nan))
print("-"*3 + ' Use different replacement  for each value.' + '-'*3)
print(data.replace(to_replace = [-999., -1000], value = [np.nan, 0]))
print("-"*3 + ' The argument also can be a dict' + '-'*3)
print(data.replace({-999.: np.nan, -1000: 0}))

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
--- Use different replacement  for each value.---
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
--- The argument also can be a dict---
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64


### Renaming Axis Indexes

Like values in a Series, axis labels can be similarly transformed by a function or mapping of some form to produce new, differently labeled objects.

In [50]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


If you want to creat a transformed version of a data set without modifying the original, a useful methd is `rename`

In [55]:
data.rename(index = str.title, columns = str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [58]:
data.rename(index = {"Ohio": "INDIANA"}, 
            columns = {'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


### Discretization and Binning

In [2]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

Let's divide these into bins of 18 to 25, 26 to 35, 35 to 60, and finally 60 and order.

Consistent with mathematical notation for intervals, a parenthesis means hat the side is `open` while the square braket means it is `closed`(inclusive).

In [24]:
# "()"開區間: 不包含等號, "[]"閉區間:包含等號
bins = [18, 25, 35, 60, 100]
cats = pd.cut(x = ages, bins = bins, right = False)
cats

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

The object pandas returns is a special `Categorical` object.

In [25]:
cats.codes

array([0, 0, 1, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [26]:
cats.categories

IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)]
              closed='left',
              dtype='interval[int64]')

In [27]:
cats.value_counts()

[18, 25)     4
[25, 35)     4
[35, 60)     3
[60, 100)    1
dtype: int64

_**You can also pass your own bin names by passing a list or array to the `labels` option.**_

In [29]:
group_names = ["Youth", 'YounthAdult', 'MiddleAged', 'Senior']
pd.cut(x = ages, bins = bins, labels = group_names, right = False)

[Youth, Youth, YounthAdult, YounthAdult, Youth, ..., YounthAdult, Senior, MiddleAged, MiddleAged, YounthAdult]
Length: 12
Categories (4, object): [Youth < YounthAdult < MiddleAged < Senior]

_**If you pass `cut` a integer number of bins instead of explicit bin edges, it will compute equal-length bins based on the minmum and maximum values in the data.**_

In [35]:
data = np.random.rand(20)
pd.cut(x = data, bins = 4, precision =2, right = False)

[[0.71, 0.94), [0.48, 0.71), [0.48, 0.71), [0.24, 0.48), [0.24, 0.48), ..., [0.48, 0.71), [0.24, 0.48), [0.011, 0.24), [0.48, 0.71), [0.011, 0.24)]
Length: 20
Categories (4, interval[float64]): [[0.011, 0.24) < [0.24, 0.48) < [0.48, 0.71) < [0.71, 0.94)]

A closely related function `qct`, bins the data based on sample quantiles. 

Depending on the distribution of the data, using `cut` will not result in each bin having the same number of data points.

Scince `qcut` uses sample quantiles instead, by definition you will obtain roughly equal-size bins.

In [36]:
data = np.random.randn(1000)
cats = pd.qcut(x = data, q = 4)
cats

[(-2.872, -0.701], (0.048, 0.709], (-2.872, -0.701], (0.048, 0.709], (0.048, 0.709], ..., (-2.872, -0.701], (-2.872, -0.701], (0.709, 3.63], (0.048, 0.709], (0.048, 0.709]]
Length: 1000
Categories (4, interval[float64]): [(-2.872, -0.701] < (-0.701, 0.048] < (0.048, 0.709] < (0.709, 3.63]]

In [37]:
pd.value_counts(values = cats)

(0.709, 3.63]       250
(0.048, 0.709]      250
(-0.701, 0.048]     250
(-2.872, -0.701]    250
dtype: int64

Similar to cut you can pass your own quantiles(number between 0 and 1, inclusive.)

In [40]:
pd.qcut(x = data, q = [0, 0.1, 0.5, 0.9, 1.])

[(-2.872, -1.349], (0.048, 1.265], (-1.349, 0.048], (0.048, 1.265], (0.048, 1.265], ..., (-1.349, 0.048], (-1.349, 0.048], (1.265, 3.63], (0.048, 1.265], (0.048, 1.265]]
Length: 1000
Categories (4, interval[float64]): [(-2.872, -1.349] < (-1.349, 0.048] < (0.048, 1.265] < (1.265, 3.63]]

### Detecting and Filtering Outliers

In [41]:
np.random.seed(12345)
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [45]:
col = data[3]
col[np.abs(col) > 3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

To select all rows having a value exceeding 3 or -3, you can use the any method on a boolean DataFrame 

In [51]:
# 只要有一筆資料絕對值大於三，則五筆資料就會回傳True
data[(np.abs(data) > 3).any(axis = 1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


### Permutation and Random Sampling

In [2]:
df = pd.DataFrame(data = np.arange(5*4).reshape(5, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


Calling `permutation` with the length of the axis you want to permute produces an array of integers indicating the new ordering.

In [10]:
sampler = np.random.permutation(5)
sampler

array([3, 4, 0, 2, 1])

In [11]:
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11
1,4,5,6,7


To select a random subset without replacement, one way is slice off the first k element of the array returned by `permutation`, where k is the desired subset size.

There are much more efficient sampling-without-replacement algorithms, but this is an easy strategy that uses readily available tools.

In [14]:
df.take(np.random.permutation(len(df))[:3])

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
1,4,5,6,7


To generate a sample with replacement, the faster way is to use `np.random.radint` to draw random integers.

In [22]:
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(low = 0, high = len(bag), size = 10)
print(sampler) 

[0 2 2 4 4 3 0 1 1 4]


In [23]:
bag.take(sampler)

array([ 5, -1, -1,  4,  4,  6,  5,  7,  7,  4])

### Computing Indicator/Dummy Variable

Another type of transformation for statistical modeling or machine learning application is converting a categorical variable into a `dummy` or `indicator` matrix.

In [34]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [43]:
dummies = pd.get_dummies(df['key'], prefix = 'key')

In [44]:
pd.concat(objs = [df, dummies], axis = 1).drop(columns = ['key'], axis = 1)

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [45]:
df.join(other = dummies).drop(columns = 'key', axis = 1)

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


### Using movies data as an example to implement dummy variable.

In [73]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('.\\pydata-book\\datasets\\movielens\\movies.dat', 
                       sep = "::", header = None, names = mnames)
movies.head(5)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Adding `indicator variables` (or `dummy variable`) for each `genre` requires a little bit of wrangling.

First, we extract the list of unique genres in the dataset(using a nice `set.union trick`)

In [74]:
genre_iter = (set(x.split('|')) for x in movies.genres)
print(type(genre_iter))
genres = sorted(set.union(*genre_iter))
genres

<class 'generator'>


['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

One way to construct the indicator DataFrame is to star with a DataFrame of all zeros.

In [62]:
dummies = pd.DataFrame(np.zeros((len(movies), len(genres))), columns = genres)
dummies.head(5)

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now, iterate through each movie and set entries in each row of dummies to 1.

In [71]:
for i, gen in enumerate(movies.genres):
    dummies.ix[i, gen.split('|')] = 1

dummies.head(3)

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [84]:
movies_windic = movies.join(other = dummies.add_prefix('Genre_'))
movies_windic[:3]

Unnamed: 0,movie_id,title,genres,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Crime,Genre_Documentary,...,Genre_Fantasy,Genre_Film-Noir,Genre_Horror,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


***A useful recipe of statistocal applications is to combine `get_dummies` with a discretization function like `cut`.***

In [91]:
np.random.seed(123)
values = np.random.rand(10)
print(values)

bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
label_names = ['g1', 'g2', 'g3', 'g4', 'g5']
pd.get_dummies(pd.cut(x = values, bins = bins, labels = label_names))

[0.69646919 0.28613933 0.22685145 0.55131477 0.71946897 0.42310646
 0.9807642  0.68482974 0.4809319  0.39211752]


Unnamed: 0,g1,g2,g3,g4,g5
0,0,0,0,1,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,0,1,0,0
4,0,0,0,1,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,1,0,0
9,0,1,0,0,0


### String Maniplation
#### String Object Methods

In [92]:
val = 'a,b, guido'
val.split(",")

['a', 'b', ' guido']

**`split`** is often combined with **`strip`** to trim whitespace(including newlines).

In [93]:
pieces = [x.strip() for x in val.split(",")]
pieces

['a', 'b', 'guido']

These substrings colud be concatented together with a two-colon delimiter using addition

In [94]:
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

But, this isn't a partical generic method. A faster and more Pythonic way is to pass a list or tuple to `join` method on the string `'::'`.

In [95]:
"::".join(pieces)

'a::b::guido'

***Other methods are concerned with locating substring. Using Python's keyword is the best way to detect a substring, though `index` and `find` can also be used.***

***Note the difference between `find` and `index` is that `index` raise an expection if the string isn't found(versus returning -1).***

In [99]:
print('val:', val)
print('guido' in val)
print(val.index(','))
print(val.find(":"))

val: a,b, guido
True
1
-1


In [100]:
val.count(',')

2

### Regular expression

***`Regular Expression`*** provide a flexible way to search or match string pattern in text.

The ***`re`*** module functions fall into three categories: 
 1. pattern
 2. substitution 
 3. splitting.

In [2]:
import re

In [3]:
text = "foo bar\t baz \tqux"
text

'foo bar\t baz \tqux'

When you call `re.split('\s+', text)`, the regular expression is frist complied, then its `split` method is called on the passed text.

You can complie the regex yourself with `re.complie`, forming a reusable regex object.

In [4]:
print(re.split(pattern = '\s+', string = text))
split_space = re.compile(pattern = '\s+')
print(split_space.split(text))

['foo', 'bar', 'baz', 'qux']
['foo', 'bar', 'baz', 'qux']


**You wanted to get a list of all patterns matching the regex, you can use the `findall` method.**

In [5]:
split_space.findall(text)

[' ', '\t ', ' \t']

To avoid unwanted escaping with \ in regular expression, use raw string literals like `r"C:\x"` instead of the equivalent `"C:\\x"`.

***`match` and `search` are closely related to `findall`.***

*While `findall` returns all matches in a string, `search` retruns only the first match.* 

*More rigidly, `match` only matches at the beginning of the string.*

In [6]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [7]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags = re.IGNORECASE)

In [8]:
m = regex.findall(text)
m

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

`search` returns a special match object for the first email address in the text.

**For the above regex, the match object can only tell us the start and end position of the pattern in the string.**

In [9]:
m = regex.search(text)
m

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [10]:
text[m.start():m.end()]

'dave@google.com'

`regex.match` returns `None`, as it only will match if the pattern occurs at the start of the string.

In [11]:
print(regex.match(text))

None


Relatedly, `sub` will return a new string with occurrences of the pattern replaced by the a new string.

In [12]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



***Suppose you wanted to find email address and simultaneously segment each address into its 3 components: `username`, `domain name`, and `domain suffix`.***

***To do this, put parentheses around the part of the pattern to segment.***

In [49]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [50]:
regex = re.compile(pattern, flags = re.IGNORECASE)

In [55]:
m = regex.match(string ='wesm@bright.net' )
m.groups()

('wesm', 'bright', 'net')

In [16]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

***`sub` also has access to groups in each match using special symbols like `\1`, `\2`, etc.***

***sub, subn:***

***Replace all(sub) or first n occurances(subn) of pattern in string woth replacement expression.***

***Use symbols `\1`, `\2`, ... to refer to match group elements in the replacment string.***

In [17]:
regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text)

'Dave Username: dave, Domain: google, Suffix: com\nSteve Username: steve, Domain: gmail, Suffix: com\nRob Username: rob, Domain: gmail, Suffix: com\nRyan Username: ryan, Domain: yahoo, Suffix: com\n'

In [18]:
regex = re.compile(r"""
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE|re.VERBOSE)

In [19]:
m = regex.match('wesm@bright.net')

In [20]:
m.groupdict()

{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

### Vectorized string functions in Pandas

In [24]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 
        'Rob': 'rob@gmail.com', 'Wes': np.nan}

In [25]:
data = pd.Series(data)
data

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

***String and regular expression methods can be applied (passing a `lambda` or other function) to each value using `data.map`, but it will fail on the NA.***

***To cope with this, Series has concise methods for string operations that skip NA values.***

***These are accessed through Series's `str` attribute; for example, we could check email address has `"gmail"` in it with `str.contains`.***

In [26]:
data.str.contains('gmail')

Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

In [56]:
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
data.str.findall(pat = pattern, flags = re.IGNORECASE)

Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

In [57]:
matches = data.str.extract(pattern, flags = re.IGNORECASE, expand = True)
matches

Unnamed: 0,0,1,2
Dave,dave,google,com
Rob,rob,gmail,com
Steve,steve,gmail,com
Wes,,,


In [58]:
matches.get(0)

Dave      dave
Rob        rob
Steve    steve
Wes        NaN
Name: 0, dtype: object

You can similarly slice string using this stntax.

In [59]:
data.str[:5]

Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object

### Example : USDA Food Database

In [3]:
import json

In [8]:
db = json.load(open('.\\pydata-book\\datasets\\usda_food\\database.json'))
len(db)

6636

In [9]:
db[0].keys()

dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])

In [10]:
nutrients = pd.DataFrame(data = db[0]["nutrients"])
nutrients.head(3)

Unnamed: 0,description,group,units,value
0,Protein,Composition,g,25.18
1,Total lipid (fat),Composition,g,29.2
2,"Carbohydrate, by difference",Composition,g,3.06


In [11]:
info_keys = ['description', 'group', 'id', 'manufacture']
info = pd.DataFrame(db, columns = info_keys)
info.head(3)

Unnamed: 0,description,group,id,manufacture
0,"Cheese, caraway",Dairy and Egg Products,1008,
1,"Cheese, cheddar",Dairy and Egg Products,1009,
2,"Cheese, edam",Dairy and Egg Products,1018,


In [12]:
pd.value_counts(info.group)[:10]

Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64

Now, to do some analysis on all of the nutrient data, it's easiest tot assemble the nutrients for each food into a single large table.

First, I will convert each list of food nutrients to a DataFrame, add a column for the food `id`, and append the DataFrame to list.

Then, these can be concatented together with `concat`.

In [13]:
nutrients = []
for rec in db:
    fnuts = pd.DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

In [15]:
nutrients_df = pd.concat(nutrients, ignore_index = True)

In [16]:
nutrients_df.head(3)

Unnamed: 0,description,group,units,value,id
0,Protein,Composition,g,25.18,1008
1,Total lipid (fat),Composition,g,29.2,1008
2,"Carbohydrate, by difference",Composition,g,3.06,1008


In [18]:
nutrients_df.duplicated().sum()

14179

In [19]:
nutrients_df = nutrients_df.drop_duplicates()

In [20]:
col_mapping = {'description': 'food', 'group': 'fgroup'}

In [21]:
info = info.rename(columns=col_mapping, copy=False)

In [None]:
col_mapping = {'description' : 'nutrient', 'group' : 'nutgroup'}

In [None]:
nutrients = nutrients.rename(columns=col_mapping, copy=False)

In [None]:
ndata = pd.merge(nutrients, info, on='id', how='outer')

In [None]:
ndata.head(5)

In [None]:
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)

In [None]:
result['Zinc, Zn'].order().plot(kind='barh')