# Data Wrangling

## Join, Combine and Reshape

## Importance of Hierarchical Indexing

In [1]:
# pd.Series.index?
# pd.Series.unstack?
# pd.names?
# pd.MultiIndex?

In [2]:
import pandas as pd
import numpy as np
data_hi = pd.Series(np.random.randn(9),
          index=[['A', 'A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
                 [1, 2, 3, 1, 4, 1, 2, 2, 4]])
data_hi

A  1    1.829514
   2    0.038414
   3   -0.295545
B  1    0.875844
   4    0.598066
C  1   -1.710379
   2    0.389891
D  2    1.452537
   4   -0.299041
dtype: float64

In [3]:
data_hi.index

MultiIndex([('A', 1),
            ('A', 2),
            ('A', 3),
            ('B', 1),
            ('B', 4),
            ('C', 1),
            ('C', 2),
            ('D', 2),
            ('D', 4)],
           )

In [4]:
data_hi['A']
# data_hi['A':'C']
# data_hi[['A', 'C']]
# data_hi.loc[:, 1]

1    1.829514
2    0.038414
3   -0.295545
dtype: float64

In [5]:
data_hi.unstack()
# data_hi.unstack(fill_value=0)
# data_hi.unstack().stack()

Unnamed: 0,1,2,3,4
A,1.829514,0.038414,-0.295545,
B,0.875844,,,0.598066
C,-1.710379,0.389891,,
D,,1.452537,,-0.299041


In [6]:
df_hi = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['one', 'one', 'three'],['Green', 'Red', 'Green']])
df_hi.index.names = ['val1', 'val2']
df_hi.columns.names = ['number', 'color']
df_hi['one']
df_hi

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### How Reordering and Sorting of Index Levels Takes Place?

In [7]:
# pd.DataFrame.swaplevel?
# pd.DataFrame.sort_index?

In [8]:
import pandas as pd
import numpy as np

df_hi

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [9]:
df_hi.swaplevel('val1', 'val2', axis=0)
df_hi.swaplevel('number', 'color', axis=1)
df_hi.swaplevel(0, 1).sort_index(level=0) 

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val2,val1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [10]:
df_hi.sort_index(level=0)
df_hi.sort_index(level=1)

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


### How To Get The Summary Statistics By Level?

In [11]:
# pd.DataFrame.sum?

In [12]:
import pandas as pd
import numpy as np
df_hi

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [13]:
df_hi.sum(level='val1') # all 'a' and 'b's are grouped first and then sum is applied
df_hi.sum(level='val2') # all '1' and '2's are grouped first and then sum is applied
df_hi.sum(level='color', axis=1) # in color index level 'Green' are grouped together first and then sum is applied.

  df_hi.sum(level='val1') # all 'a' and 'b's are grouped first and then sum is applied
  df_hi.sum(level='val2') # all '1' and '2's are grouped first and then sum is applied
  df_hi.sum(level='color', axis=1) # in color index level 'Green' are grouped together first and then sum is applied.


Unnamed: 0_level_0,color,Green,Red
val1,val2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### How To Index With DF's columns?

In [14]:
pd.DataFrame.set_index?

In [15]:
import pandas as pd
import numpy as np
df_c = pd.DataFrame({'a': range(7), 'b': range(14, 7, -1),
                     'c': ['one', 'one', 'one', 'two', 'two','two', 'two'],
                     'd': [0, 1, 2, 0, 1, 2, 3]})
df_c

Unnamed: 0,a,b,c,d
0,0,14,one,0
1,1,13,one,1
2,2,12,one,2
3,3,11,two,0
4,4,10,two,1
5,5,9,two,2
6,6,8,two,3


In [16]:
df_si = df_c.set_index(['c', 'd'])
df_si

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,14
one,1,1,13
one,2,2,12
two,0,3,11
two,1,4,10
two,2,5,9
two,3,6,8


In [17]:
df_c.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,14,one,0
one,1,1,13,one,1
one,2,2,12,one,2
two,0,3,11,two,0
two,1,4,10,two,1
two,2,5,9,two,2
two,3,6,8,two,3


In [18]:
df_si.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,14
1,one,1,1,13
2,one,2,2,12
3,two,0,3,11
4,two,1,4,10
5,two,2,5,9
6,two,3,6,8


### How To Combine and Merge Datasets?

In [19]:
# pd.DataFrame.merge?

In [20]:
import pandas as pd
import numpy as np
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                   'data2': range(3)})

df1.merge(df2)
pd.merge(df1, df2)
pd.merge(df1, df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,5,1
3,a,2,0
4,a,4,0


In [21]:
df_l = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df_r = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                   'data2': range(3)})

print('df_l', df_l, end='\n'), print('df_r', df_r, end='\n')

pd.merge(df_l, df_r, left_on='lkey', right_on='rkey')
pd.merge(df_l, df_r,  left_on='lkey', right_on='rkey', how='outer')

df_l   lkey  data1
0    b      0
1    b      1
2    a      2
3    c      3
4    a      4
5    b      5
df_r   rkey  data2
0    a      0
1    b      1
2    d      2


Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,5.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,c,3.0,,
6,,,d,2.0


In [22]:
df_m1 = pd.DataFrame({'1key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})
df_m2 = pd.DataFrame({'2key': ['a', 'b', 'a', 'b', 'd'],'data2': range(5)})

# let's ommit "how='outer'" or "how='inner'"
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key')                  

# let's include "how='outer'" or "how='inner'"
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='outer')  
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='left')    
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='right')

Unnamed: 0,1key,data1,2key,data2
0,a,2.0,a,0
1,a,4.0,a,0
2,b,0.0,b,1
3,b,1.0,b,1
4,b,5.0,b,1
5,a,2.0,a,2
6,a,4.0,a,2
7,b,0.0,b,3
8,b,1.0,b,3
9,b,5.0,b,3


In [23]:
dfleft = pd.DataFrame({'key1': ['raga', 'raga', 'anuraga'],
                      'key2': ['one', 'two', 'one'],
                      'lval': [1, 2, 3]})

dfright = pd.DataFrame({'key1': ['raga', 'raga', 'anuraga', 'anuraga'],
                       'key2': ['one', 'one', 'one', 'two'],
                       'rval': [4, 5, 6, 7]})

# 'outer' includes all the values and associated data
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='outer') 

# 'inner' includes only the  common values and associated data
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='inner') 
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='left') 
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='right')
pd.merge(dfleft, dfright, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,raga,one,1,one,4
1,raga,one,1,one,5
2,raga,two,2,one,4
3,raga,two,2,one,5
4,anuraga,one,3,one,6
5,anuraga,one,3,two,7


In [24]:
pd.merge(dfleft, dfright, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,raga,one,1,one,4
1,raga,one,1,one,5
2,raga,two,2,one,4
3,raga,two,2,one,5
4,anuraga,one,3,one,6
5,anuraga,one,3,two,7


### Merging on Row Index

In [25]:
# pd.DataFrame.join?

In [26]:
import pandas as pd
import numpy as np
dfril = pd.DataFrame({'key1': ['raga', 'raga', 'anuraga', 'raga', 'adiraga'],
                       'lval': range(5)})

dfrir = pd.DataFrame({'rval': [1, 2]},
                         index = ['raga', 'anuraga'])

pd.merge(dfril, dfrir, left_on='key1', right_index=True)
pd.merge(dfril, dfrir, left_on='key1', right_index=True, how='outer')

Unnamed: 0,key1,lval,rval
0,raga,0,1.0
1,raga,1,1.0
3,raga,3,1.0
2,anuraga,2,2.0
4,adiraga,4,


#### Let's work with hierarchically indexed DataFrames
In hierachically indexed data the joining is implicitely a multiple key merge 

In [27]:
df_l = pd.DataFrame({'key1': ['raga', 'raga', 'raga', 'anuraga', 'anuraga'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})

df_r = pd.DataFrame(np.arange(12).reshape((6, 2)),
                       index=[['anuraga', 'anuraga', 'raga', 'raga', 'raga', 'raga'],
                              [2001, 2000, 2000, 2000, 2001, 2002]],
                       columns=['prog1', 'prog2'])

pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='outer')
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='left')
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='right')

Unnamed: 0,key1,key2,data,prog1,prog2
3,anuraga,2001,3.0,0,1
4,anuraga,2000,,2,3
0,raga,2000,0.0,4,5
0,raga,2000,0.0,6,7
1,raga,2001,1.0,8,9
2,raga,2002,2.0,10,11


In [28]:
df_li = pd.DataFrame([[10, 20], [30, 40], [50, 60]],
                      index=['a', 'c', 'e'],
                      columns=['raga', 'anuraga'])

df_ri = pd.DataFrame([[70, 80], [90, 100], [110, 120], [130, 140]],
                       index=['b', 'c', 'd', 'e'],
                       columns=['braga', 'sraga'])

pd.merge(df_li, df_ri, how='outer', left_index=True, right_index=True)
pd.merge(df_li, df_ri, how='inner', left_index=True, right_index=True)

Unnamed: 0,raga,anuraga,braga,sraga
c,30,40,90,100
e,50,60,130,140


In [29]:
df_li.join(df_ri, how='outer')

Unnamed: 0,raga,anuraga,braga,sraga
a,10.0,20.0,,
b,,,70.0,80.0
c,30.0,40.0,90.0,100.0
d,,,110.0,120.0
e,50.0,60.0,130.0,140.0


In [30]:
caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                        'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                       'B': ['B0', 'B1', 'B2']})

caller.join(other, lsuffix='_caller', rsuffix='_other') 
caller.join(other, lsuffix='_caller', rsuffix='_other', how='right')

Unnamed: 0,key_caller,A,key_other,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K2,A2,K2,B2


In [31]:
caller.set_index('key').join(other.set_index('key'))

Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


### How To Concatenate DataFrame's Along The Row or Column Axis?

In [32]:
# pd.concat?

In [33]:
import pandas as pd
import numpy as np

ser1 = pd.Series([0, 1], index=['A', 'B'])
ser2 = pd.Series([2, 3, 4], index=['C', 'D', 'E'])
ser3 = pd.Series([5, 6], index=['F', 'G'])

pd.concat([ser1, ser2, ser3])
pd.concat([ser1, ser2, ser3], sort=True) 
pd.concat([ser1, ser2, ser3], axis=1, sort=True) 
pd.concat([ser1, ser2, ser3], axis=1, sort=True, join='inner') 

Unnamed: 0,0,1,2


In [34]:
ser4 = pd.concat([ser1, ser3]) 
pd.concat([ser1, ser4], axis=1, sort=True) 
# pd.concat([ser1, ser4], axis=1, join='inner') 
# pd.concat([ser1, ser4], axis=1, join_axes=[['A', 'B', 'F', 'G']])

Unnamed: 0,0,1
A,0.0,0
B,1.0,1
F,,5
G,,6


In [35]:
idc = pd.concat([ser1, ser2, ser3], axis=0, keys=['1', '2', '3'])
idc

1  A    0
   B    1
2  C    2
   D    3
   E    4
3  F    5
   G    6
dtype: int64

In [36]:
idc.unstack()

Unnamed: 0,A,B,C,D,E,F,G
1,0.0,1.0,,,,,
2,,,2.0,3.0,4.0,,
3,,,,,,5.0,6.0


In [37]:
pd.concat([ser1, ser2, ser3], axis=1, keys=['1', '2', '3'], sort=True)

Unnamed: 0,1,2,3
A,0.0,,
B,1.0,,
C,,2.0,
D,,3.0,
E,,4.0,
F,,,5.0
G,,,6.0


#### Let's see the same logic on DataFrame objects

In [38]:
df_li = pd.DataFrame([[10, 20], [30, 40], [50, 60]],
                      index=['a', 'c', 'e'],
                      columns=['raga', 'anuraga'])

df_ri = pd.DataFrame([[70, 80], [90, 100], [110, 120], [130, 140]],
                       index=['b', 'c', 'd', 'e'],
                       columns=['braga', 'sraga'])

pd.concat([df_li, df_ri], axis=1, keys=['one', 'two'], sort=True, join='inner') 
pd.concat([df_li, df_ri], axis=1, keys=['1', '2', '3'], sort=True, join='outer')     
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer')
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer', axis=1) 
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer', axis=1, names=['first', 'second'])

first,level1,level1,level2,level2
second,raga,anuraga,braga,sraga
a,10.0,20.0,,
b,,,70.0,80.0
c,30.0,40.0,90.0,100.0
d,,,110.0,120.0
e,50.0,60.0,130.0,140.0


In [39]:
df_li = pd.DataFrame([[10, 20], [30, 40], [50, 60]],
                      columns=['raga', 'anuraga'])

df_ri = pd.DataFrame([[70, 80], [90, 100], [110, 120], [130, 140]],
                       columns=['braga', 'sraga'])

pd.concat([df_li, df_ri], axis=0, join='outer', ignore_index=True, sort=True)
pd.concat([df_li, df_ri], axis=1, join='outer', ignore_index=True, sort=True)

Unnamed: 0,0,1,2,3
0,10.0,20.0,70,80
1,30.0,40.0,90,100
2,50.0,60.0,110,120
3,,,130,140


### How To Combine Data With Overlap?

In [40]:
# pd.DataFrame.combine_first?
# pd.DataFrame.combine?

In [41]:
import pandas as pd
import numpy as np
df1 = pd.DataFrame([[1, np.nan]])
df2 = pd.DataFrame([[3, 4]])

In [42]:
df1.combine_first(df2)

Unnamed: 0,0,1
0,1,4.0


In [43]:
df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
print('df1:\n', df1)
print('df2:\n', df2)

df1:
    A  B
0  0  4
1  0  4
df2:
    A  B
0  1  3
1  1  3


In [44]:
df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2)

Unnamed: 0,A,B
0,0,3
1,0,3


### How To Reshape and Pivot Pandas Data?
Pandas provides many ways to rearrange the Tabular Data and is known as 'reshape or pivot' operation. Reshaping The Hierarchically indexed DataFrame's

In [45]:
# pd.DataFrame.stack?
# pd.DataFrame.unstack?

In [46]:
import pandas as pd
import numpy as np
df_s = data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                     index=pd.Index(['raga', 'mmraga'], name='state'),
                     columns=pd.Index(['one', 'two', 'three'], name='number'))
df_s

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
raga,0,1,2
mmraga,3,4,5


In [47]:
df_s.stack()
df_s.stack().unstack() # the default level=-1
df_s.stack().unstack(level=0) # the column index is considered to unstack the data
df_s.stack().unstack('state')

state,raga,mmraga
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [48]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'], name='one')
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'], name='two')
data = pd.concat([s1, s2], keys= ['one', 'two'])
data

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [49]:
data.unstack()
data.unstack().stack()
data.unstack().stack(dropna=False)
df_s.stack().unstack('state')

state,raga,mmraga
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5
