# Data Wrangling

## Join, Combine and Reshape

## Importance of Hierarchical Indexing

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_hi = pd.Series(np.random.randn(9),
          index=[['A', 'A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
                 [1, 2, 3, 1, 4, 1, 2, 2, 4]])
data_hi

A  1    0.106573
   2   -0.886613
   3    0.054015
B  1   -0.239520
   4    0.492786
C  1   -1.660171
   2   -0.698756
D  2    0.153799
   4   -0.866109
dtype: float64

In [3]:
pd.Series.index?

In [4]:
data_hi.index

MultiIndex([('A', 1),
            ('A', 2),
            ('A', 3),
            ('B', 1),
            ('B', 4),
            ('C', 1),
            ('C', 2),
            ('D', 2),
            ('D', 4)],
           )

In [5]:
data_hi['A']

1    0.106573
2   -0.886613
3    0.054015
dtype: float64

In [6]:
data_hi['A':'C']

A  1    0.106573
   2   -0.886613
   3    0.054015
B  1   -0.239520
   4    0.492786
C  1   -1.660171
   2   -0.698756
dtype: float64

In [7]:
data_hi[['A', 'C']]

A  1    0.106573
   2   -0.886613
   3    0.054015
C  1   -1.660171
   2   -0.698756
dtype: float64

In [8]:
data_hi.loc[:, 1]

A    0.106573
B   -0.239520
C   -1.660171
dtype: float64

In [9]:
pd.Series.unstack?

In [10]:
data_hi.unstack()

Unnamed: 0,1,2,3,4
A,0.106573,-0.886613,0.054015,
B,-0.23952,,,0.492786
C,-1.660171,-0.698756,,
D,,0.153799,,-0.866109


In [11]:
data_hi.unstack(fill_value=0)

Unnamed: 0,1,2,3,4
A,0.106573,-0.886613,0.054015,0.0
B,-0.23952,0.0,0.0,0.492786
C,-1.660171,-0.698756,0.0,0.0
D,0.0,0.153799,0.0,-0.866109


In [12]:
data_hi.unstack().stack()

A  1    0.106573
   2   -0.886613
   3    0.054015
B  1   -0.239520
   4    0.492786
C  1   -1.660171
   2   -0.698756
D  2    0.153799
   4   -0.866109
dtype: float64

In [13]:
df_hi = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['one', 'one', 'three'],
                              ['Green', 'Red', 'Green']])
df_hi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,one,three
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [14]:
pd.names?

Object `pd.names` not found.


In [15]:
#help(pd.MultiIndex)
pd.MultiIndex?

In [16]:
df_hi.index.names = ['val1', 'val2']

In [17]:
df_hi.columns.names = ['number', 'color']

In [18]:
df_hi

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [19]:
df_hi['one']

Unnamed: 0_level_0,color,Green,Red
val1,val2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


### How Reordering and Sorting of Index Levels Takes Place?

In [20]:
import pandas as pd
import numpy as np

In [21]:
pd.DataFrame.swaplevel?

In [22]:
df_hi

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [23]:
df_hi.swaplevel('val1', 'val2', axis=0)

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val2,val1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [24]:
df_hi.swaplevel('number', 'color', axis=1)

Unnamed: 0_level_0,color,Green,Red,Green
Unnamed: 0_level_1,number,one,one,three
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [25]:
pd.DataFrame.sort_index?

In [26]:
df_hi.sort_index(level=1)

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [27]:
df_hi.sort_index(level=0)

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val1,val2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [28]:
df_hi.swaplevel(0, 1).sort_index(level=0) 

Unnamed: 0_level_0,number,one,one,three
Unnamed: 0_level_1,color,Green,Red,Green
val2,val1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### How To Get The Summary Statistics By Level?

In [29]:
import pandas as pd
import numpy as np

In [30]:
pd.DataFrame.sum?

In [31]:
print(df_hi)
df_hi.sum(level='val1') # all 'a' and 'b's are grouped first and then sum is applied

number      one     three
color     Green Red Green
val1 val2                
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11


  df_hi.sum(level='val1') # all 'a' and 'b's are grouped first and then sum is applied


number,one,one,three
color,Green,Red,Green
val1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [32]:
print(df_hi)
df_hi.sum(level='val2') # all '1' and '2's are grouped first and then sum is applied

number      one     three
color     Green Red Green
val1 val2                
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11


  df_hi.sum(level='val2') # all '1' and '2's are grouped first and then sum is applied


number,one,one,three
color,Green,Red,Green
val2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [33]:
print(df_hi)
df_hi.sum(level='color', axis=1) # in color index level 'Green' are grouped together first and then sum is applied.

number      one     three
color     Green Red Green
val1 val2                
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11


  df_hi.sum(level='color', axis=1) # in color index level 'Green' are grouped together first and then sum is applied.


Unnamed: 0_level_0,color,Green,Red
val1,val2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### How To Index With DF's columns?

In [34]:
import pandas as pd
import numpy as np

In [35]:
df_c = pd.DataFrame({'a': range(7), 'b': range(14, 7, -1),
                     'c': ['one', 'one', 'one', 'two', 'two','two', 'two'],
                     'd': [0, 1, 2, 0, 1, 2, 3]})
df_c

Unnamed: 0,a,b,c,d
0,0,14,one,0
1,1,13,one,1
2,2,12,one,2
3,3,11,two,0
4,4,10,two,1
5,5,9,two,2
6,6,8,two,3


In [36]:
pd.DataFrame.set_index?

In [37]:
df_si = df_c.set_index(['c', 'd'])
df_si

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,14
one,1,1,13
one,2,2,12
two,0,3,11
two,1,4,10
two,2,5,9
two,3,6,8


In [38]:
df_c.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,14,one,0
one,1,1,13,one,1
one,2,2,12,one,2
two,0,3,11,two,0
two,1,4,10,two,1
two,2,5,9,two,2
two,3,6,8,two,3


In [39]:
df_si.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,14
1,one,1,1,13
2,one,2,2,12
3,two,0,3,11
4,two,1,4,10
5,two,2,5,9
6,two,3,6,8


### How To Combine and Merge Datasets?

In [40]:
import pandas as pd
import numpy as np

In [41]:
pd.DataFrame.merge?

In [42]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [43]:
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                   'data2': range(3)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


#### Let's see Database-Style DataFrame Joins

In [44]:
# 'many to one join' 
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,5,1
3,a,2,0
4,a,4,0


In [45]:
df1.merge(df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,5,1
3,a,2,0
4,a,4,0


In [46]:
pd.merge(df1, df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,5,1
3,a,2,0
4,a,4,0


In [47]:
df_l = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df_l

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [48]:
df_r = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                   'data2': range(3)})
df_r

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [49]:
print('df_l', df_l, end='\n'), print('df_r', df_r, end='\n')
pd.merge(df_l, df_r, left_on='lkey', right_on='rkey')

df_l   lkey  data1
0    b      0
1    b      1
2    a      2
3    c      3
4    a      4
5    b      5
df_r   rkey  data2
0    a      0
1    b      1
2    d      2


Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,5,b,1
3,a,2,a,0
4,a,4,a,0


In [50]:
pd.merge(df_l, df_r,  left_on='lkey', right_on='rkey', how='outer')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,5.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,c,3.0,,
6,,,d,2.0


In [51]:
# Let's see  'many to many merge' operation

In [52]:
df_m1 = pd.DataFrame({'1key': ['b', 'b', 'a', 'c', 'a', 'b'],
              'data1': range(6)})
df_m1          # 'd' is not present

Unnamed: 0,1key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [53]:
df_m2 = pd.DataFrame({'2key': ['a', 'b', 'a', 'b', 'd'],
               'data2': range(5)})
df_m2           # 'c' is not present

Unnamed: 0,2key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [54]:
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key')                  # let's ommit "how='outer'" or "how='inner'"

Unnamed: 0,1key,data1,2key,data2
0,b,0,b,1
1,b,0,b,3
2,b,1,b,1
3,b,1,b,3
4,b,5,b,1
5,b,5,b,3
6,a,2,a,0
7,a,2,a,2
8,a,4,a,0
9,a,4,a,2


In [55]:
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='outer')     # let's include "how='outer'" or "how='inner'"

Unnamed: 0,1key,data1,2key,data2
0,b,0.0,b,1.0
1,b,0.0,b,3.0
2,b,1.0,b,1.0
3,b,1.0,b,3.0
4,b,5.0,b,1.0
5,b,5.0,b,3.0
6,a,2.0,a,0.0
7,a,2.0,a,2.0
8,a,4.0,a,0.0
9,a,4.0,a,2.0


In [56]:
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='left')      # let's include "how='left'

Unnamed: 0,1key,data1,2key,data2
0,b,0,b,1.0
1,b,0,b,3.0
2,b,1,b,1.0
3,b,1,b,3.0
4,a,2,a,0.0
5,a,2,a,2.0
6,c,3,,
7,a,4,a,0.0
8,a,4,a,2.0
9,b,5,b,1.0


In [57]:
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='right')      # let's include "how='right'

Unnamed: 0,1key,data1,2key,data2
0,a,2.0,a,0
1,a,4.0,a,0
2,b,0.0,b,1
3,b,1.0,b,1
4,b,5.0,b,1
5,a,2.0,a,2
6,a,4.0,a,2
7,b,0.0,b,3
8,b,1.0,b,3
9,b,5.0,b,3


#### Let's see How we can merge with Multiple column 'keys' as names

In [58]:
dfleft = pd.DataFrame({'key1': ['raga', 'raga', 'anuraga'],
                      'key2': ['one', 'two', 'one'],
                      'lval': [1, 2, 3]})

dfright = pd.DataFrame({'key1': ['raga', 'raga', 'anuraga', 'anuraga'],
                       'key2': ['one', 'one', 'one', 'two'],
                       'rval': [4, 5, 6, 7]})
print('dfleft')
print(dfleft)
print()
print('dfright')
print(dfright)

dfleft
      key1 key2  lval
0     raga  one     1
1     raga  two     2
2  anuraga  one     3

dfright
      key1 key2  rval
0     raga  one     4
1     raga  one     5
2  anuraga  one     6
3  anuraga  two     7


In [59]:
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='outer') # 'outer' includes all the values and associated data

Unnamed: 0,key1,key2,lval,rval
0,raga,one,1.0,4.0
1,raga,one,1.0,5.0
2,raga,two,2.0,
3,anuraga,one,3.0,6.0
4,anuraga,two,,7.0


In [60]:
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='inner') # 'inner' includes only the  common values and associated data

Unnamed: 0,key1,key2,lval,rval
0,raga,one,1,4
1,raga,one,1,5
2,anuraga,one,3,6


In [61]:
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='left') # 'left' includes priority for left values and associated data

Unnamed: 0,key1,key2,lval,rval
0,raga,one,1,4.0
1,raga,one,1,5.0
2,raga,two,2,
3,anuraga,one,3,6.0


In [62]:
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='right') # 'right' includes priority for right values and associated data

Unnamed: 0,key1,key2,lval,rval
0,raga,one,1.0,4
1,raga,one,1.0,5
2,anuraga,one,3.0,6
3,anuraga,two,,7


In [63]:
print('dfleft')
print(dfleft)
print()
print('dfright')
print(dfright)
pd.merge(dfleft, dfright, on='key1')

dfleft
      key1 key2  lval
0     raga  one     1
1     raga  two     2
2  anuraga  one     3

dfright
      key1 key2  rval
0     raga  one     4
1     raga  one     5
2  anuraga  one     6
3  anuraga  two     7


Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,raga,one,1,one,4
1,raga,one,1,one,5
2,raga,two,2,one,4
3,raga,two,2,one,5
4,anuraga,one,3,one,6
5,anuraga,one,3,two,7


In [64]:
pd.merge(dfleft, dfright, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,raga,one,1,one,4
1,raga,one,1,one,5
2,raga,two,2,one,4
3,raga,two,2,one,5
4,anuraga,one,3,one,6
5,anuraga,one,3,two,7


### Merging on Row Index

In [65]:
import pandas as pd
import numpy as np

In [66]:
dfril = pd.DataFrame({'key1': ['raga', 'raga', 'anuraga', 'raga', 'adiraga'],
                       'lval': range(5)})

dfrir = pd.DataFrame({'rval': [1, 2]},
                         index = ['raga', 'anuraga'])
print('dfril')
print(dfril)   
print()
print('dfrir')
print(dfrir)

dfril
      key1  lval
0     raga     0
1     raga     1
2  anuraga     2
3     raga     3
4  adiraga     4

dfrir
         rval
raga        1
anuraga     2


In [67]:
pd.merge(dfril, dfrir, left_on='key1', right_index=True)

Unnamed: 0,key1,lval,rval
0,raga,0,1
1,raga,1,1
3,raga,3,1
2,anuraga,2,2


In [68]:
# Let's see what happens if swich on left_index=True
#pd.merge(dfril, dfrir, left_on='key1', left_index=True) 

In [69]:
pd.merge(dfril, dfrir, left_on='key1', right_index=True, how='outer')

Unnamed: 0,key1,lval,rval
0,raga,0,1.0
1,raga,1,1.0
3,raga,3,1.0
2,anuraga,2,2.0
4,adiraga,4,


#### Let's work with hierarchically indexed DataFrames

In [70]:
# In hierachically indexed data the joining is implicitely a multiple key merge 

In [71]:
df_l = pd.DataFrame({'key1': ['raga', 'raga', 'raga', 'anuraga', 'anuraga'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})

df_r = pd.DataFrame(np.arange(12).reshape((6, 2)),
                       index=[['anuraga', 'anuraga', 'raga', 'raga', 'raga', 'raga'],
                              [2001, 2000, 2000, 2000, 2001, 2002]],
                       columns=['prog1', 'prog2'])
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r) # use '\n' at the end of string to print the object or data variable in the next line

df_l:
       key1  key2  data
0     raga  2000   0.0
1     raga  2001   1.0
2     raga  2002   2.0
3  anuraga  2001   3.0
4  anuraga  2002   4.0

df_r:
               prog1  prog2
anuraga 2001      0      1
        2000      2      3
raga    2000      4      5
        2000      6      7
        2001      8      9
        2002     10     11


In [72]:
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True)

df_l:
       key1  key2  data
0     raga  2000   0.0
1     raga  2001   1.0
2     raga  2002   2.0
3  anuraga  2001   3.0
4  anuraga  2002   4.0

df_r:
               prog1  prog2
anuraga 2001      0      1
        2000      2      3
raga    2000      4      5
        2000      6      7
        2001      8      9
        2002     10     11


Unnamed: 0,key1,key2,data,prog1,prog2
0,raga,2000,0.0,4,5
0,raga,2000,0.0,6,7
1,raga,2001,1.0,8,9
2,raga,2002,2.0,10,11
3,anuraga,2001,3.0,0,1


In [73]:
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='outer')

df_l:
       key1  key2  data
0     raga  2000   0.0
1     raga  2001   1.0
2     raga  2002   2.0
3  anuraga  2001   3.0
4  anuraga  2002   4.0

df_r:
               prog1  prog2
anuraga 2001      0      1
        2000      2      3
raga    2000      4      5
        2000      6      7
        2001      8      9
        2002     10     11


Unnamed: 0,key1,key2,data,prog1,prog2
0,raga,2000,0.0,4.0,5.0
0,raga,2000,0.0,6.0,7.0
1,raga,2001,1.0,8.0,9.0
2,raga,2002,2.0,10.0,11.0
3,anuraga,2001,3.0,0.0,1.0
4,anuraga,2002,4.0,,
4,anuraga,2000,,2.0,3.0


In [74]:
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='left')

df_l:
       key1  key2  data
0     raga  2000   0.0
1     raga  2001   1.0
2     raga  2002   2.0
3  anuraga  2001   3.0
4  anuraga  2002   4.0

df_r:
               prog1  prog2
anuraga 2001      0      1
        2000      2      3
raga    2000      4      5
        2000      6      7
        2001      8      9
        2002     10     11


Unnamed: 0,key1,key2,data,prog1,prog2
0,raga,2000,0.0,4.0,5.0
0,raga,2000,0.0,6.0,7.0
1,raga,2001,1.0,8.0,9.0
2,raga,2002,2.0,10.0,11.0
3,anuraga,2001,3.0,0.0,1.0
4,anuraga,2002,4.0,,


In [75]:
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='right')

df_l:
       key1  key2  data
0     raga  2000   0.0
1     raga  2001   1.0
2     raga  2002   2.0
3  anuraga  2001   3.0
4  anuraga  2002   4.0

df_r:
               prog1  prog2
anuraga 2001      0      1
        2000      2      3
raga    2000      4      5
        2000      6      7
        2001      8      9
        2002     10     11


Unnamed: 0,key1,key2,data,prog1,prog2
3,anuraga,2001,3.0,0,1
4,anuraga,2000,,2,3
0,raga,2000,0.0,4,5
0,raga,2000,0.0,6,7
1,raga,2001,1.0,8,9
2,raga,2002,2.0,10,11


In [76]:
df_li = pd.DataFrame([[10, 20], [30, 40], [50, 60]],
                      index=['a', 'c', 'e'],
                      columns=['raga', 'anuraga'])

df_ri = pd.DataFrame([[70, 80], [90, 100], [110, 120], [130, 140]],
                       index=['b', 'c', 'd', 'e'],
                       columns=['braga', 'sraga'])
print('df_li\n', df_li)
print()
print('df_ri\n', df_ri)

df_li
    raga  anuraga
a    10       20
c    30       40
e    50       60

df_ri
    braga  sraga
b     70     80
c     90    100
d    110    120
e    130    140


In [77]:
pd.merge(df_li, df_ri, how='outer', left_index=True, right_index=True)

Unnamed: 0,raga,anuraga,braga,sraga
a,10.0,20.0,,
b,,,70.0,80.0
c,30.0,40.0,90.0,100.0
d,,,110.0,120.0
e,50.0,60.0,130.0,140.0


In [78]:
pd.merge(df_li, df_ri, how='inner', left_index=True, right_index=True)

Unnamed: 0,raga,anuraga,braga,sraga
c,30,40,90,100
e,50,60,130,140


In [79]:
pd.DataFrame.join?

In [80]:
df_li.join(df_ri, how='outer')

Unnamed: 0,raga,anuraga,braga,sraga
a,10.0,20.0,,
b,,,70.0,80.0
c,30.0,40.0,90.0,100.0
d,,,110.0,120.0
e,50.0,60.0,130.0,140.0


In [81]:
caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                        'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                       'B': ['B0', 'B1', 'B2']})
print('caller:\n', caller)
print()
print('other:\n', other)

caller:
   key   A
0  K0  A0
1  K1  A1
2  K2  A2
3  K3  A3
4  K4  A4
5  K5  A5

other:
   key   B
0  K0  B0
1  K1  B1
2  K2  B2


In [82]:
caller.join(other, lsuffix='_caller', rsuffix='_other') 

Unnamed: 0,key_caller,A,key_other,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K2,A2,K2,B2
3,K3,A3,,
4,K4,A4,,
5,K5,A5,,


In [83]:
caller.join(other, lsuffix='_caller', rsuffix='_other', how='right')

Unnamed: 0,key_caller,A,key_other,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K2,A2,K2,B2


In [84]:
caller.set_index('key').join(other.set_index('key'))

Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


In [85]:
#caller.join(other.set_index('key'), on='key')

### How To Concatenate DataFrame's Along The Row or Column Axis?

In [86]:
import pandas as pd
import numpy as np

In [87]:
pd.concat?

In [88]:
ser1 = pd.Series([0, 1], index=['A', 'B'])

ser2 = pd.Series([2, 3, 4], index=['C', 'D', 'E'])

ser3 = pd.Series([5, 6], index=['F', 'G'])
print(ser1); print(ser2); print(ser3)

A    0
B    1
dtype: int64
C    2
D    3
E    4
dtype: int64
F    5
G    6
dtype: int64


In [89]:
pd.concat([ser1, ser2, ser3])

A    0
B    1
C    2
D    3
E    4
F    5
G    6
dtype: int64

In [90]:
pd.concat([ser1, ser2, ser3], sort=True) 

A    0
B    1
C    2
D    3
E    4
F    5
G    6
dtype: int64

In [91]:
print(ser1); print(ser2); print(ser3)
pd.concat([ser1, ser2, ser3], axis=1, sort=True) 

A    0
B    1
dtype: int64
C    2
D    3
E    4
dtype: int64
F    5
G    6
dtype: int64


Unnamed: 0,0,1,2
A,0.0,,
B,1.0,,
C,,2.0,
D,,3.0,
E,,4.0,
F,,,5.0
G,,,6.0


In [92]:
pd.concat([ser1, ser2, ser3], axis=1, sort=True, join='inner') 

Unnamed: 0,0,1,2


In [93]:
ser4 = pd.concat([ser1, ser3]) 
ser4

A    0
B    1
F    5
G    6
dtype: int64

In [94]:
print(ser1); print(ser4)
pd.concat([ser1, ser4], axis=1, sort=True) 

A    0
B    1
dtype: int64
A    0
B    1
F    5
G    6
dtype: int64


Unnamed: 0,0,1
A,0.0,0
B,1.0,1
F,,5
G,,6


In [95]:
print(ser1); print(ser4)
pd.concat([ser1, ser4], axis=1, join='inner') 

A    0
B    1
dtype: int64
A    0
B    1
F    5
G    6
dtype: int64


Unnamed: 0,0,1
A,0,0
B,1,1


In [96]:
pd.concat([ser1, ser4], axis=1, join_axes=[['A', 'B', 'F', 'G']])

TypeError: concat() got an unexpected keyword argument 'join_axes'

In [None]:
idc = pd.concat([ser1, ser2, ser3], axis=0, keys=['1', '2', '3'])
idc

In [None]:
idc.unstack()

In [None]:
pd.concat([ser1, ser2, ser3], axis=1, keys=['1', '2', '3'], sort=True)

#### Let's see the same logic on DataFrame objects

In [None]:
df_li = pd.DataFrame([[10, 20], [30, 40], [50, 60]],
                      index=['a', 'c', 'e'],
                      columns=['raga', 'anuraga'])

df_ri = pd.DataFrame([[70, 80], [90, 100], [110, 120], [130, 140]],
                       index=['b', 'c', 'd', 'e'],
                       columns=['braga', 'sraga'])
print('df_li\n', df_li)
print()
print('df_ri\n', df_ri)

In [None]:
pd.concat([df_li, df_ri], axis=1, keys=['one', 'two'], sort=True, join='inner') 

In [None]:
pd.concat([df_li, df_ri], axis=1, keys=['1', '2', '3'], sort=True, join='outer')            

In [None]:
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer')

In [None]:
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer', axis=1) 

In [None]:
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer', axis=1, names=['first', 'second'])

In [None]:
df_li = pd.DataFrame([[10, 20], [30, 40], [50, 60]],
                      columns=['raga', 'anuraga'])

df_ri = pd.DataFrame([[70, 80], [90, 100], [110, 120], [130, 140]],
                       columns=['braga', 'sraga'])
print('df_li\n', df_li)
print()
print('df_ri\n', df_ri)

In [None]:
pd.concat([df_li, df_ri], axis=0, join='outer', ignore_index=True, sort=True)

In [None]:
pd.concat([df_li, df_ri], axis=1, join='outer', ignore_index=True, sort=True)

### How To Combine Data With Overlap?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.combine_first?

In [None]:
df1 = pd.DataFrame([[1, np.nan]])
df2 = pd.DataFrame([[3, 4]])
print('df1:\n', df1)
print('df2:\n', df2)

In [None]:
df1.combine_first(df2)

In [None]:
pd.DataFrame.combine?

In [None]:
df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
print('df1:\n', df1)
print('df2:\n', df2)

In [None]:
df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2)

### How To Reshape and Pivot Pandas Data?

In [None]:
# Pandas provides many ways to rearrange the Tabular Data and is known as 'reshape or pivot' operation

#### Reshaping The Hierarchically indexed DataFrame's

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.stack?

In [None]:
df_s = data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                     index=pd.Index(['raga', 'mmraga'], name='state'),
                     columns=pd.Index(['one', 'two', 'three'], name='number'))
df_s

In [None]:
df_s.stack()

In [None]:
pd.DataFrame.unstack?

In [None]:
df_s.stack().unstack() # the default level=-1

In [None]:
df_s.stack().unstack(level=0) # the column index is considered to unstack the data

In [None]:
df_s.stack().unstack('state')

In [None]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'], name='one')
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'], name='two')
data = pd.concat([s1, s2], keys= ['one', 'two'])
data

In [None]:
data.unstack()

In [None]:
data.unstack().stack()

In [None]:
data.unstack().stack(dropna=False)

In [None]:
df_s.stack().unstack('state')