This exercise works through examples from the pandas API reference, starting with `DataFrame.filter`: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.filter.html

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Toy 2x3 frame: rows are labelled by animal, columns by number words.
df = pd.DataFrame(
    data=np.arange(1, 7).reshape(2, 3),
    index=['mouse', 'rabbit'],
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
mouse,1,2,3
rabbit,4,5,6


In [5]:
# Select columns by name: indexing with a list of labels returns a
# DataFrame containing just those columns, in the order listed.
df[['one','two']]

Unnamed: 0,one,two
mouse,1,2
rabbit,4,5


df.filter(items=['one','three'])

In [9]:
# axis=0 filters rows: keep those whose index label contains the substring 'bbi'.
df.filter(like='bbi',axis=0)

Unnamed: 0,one,two,three
rabbit,4,5,6


In [11]:
# axis=1 filters columns: keep those whose name contains the letter 't'.
df.filter(like='t', axis=1)

Unnamed: 0,two,three
mouse,2,3
rabbit,5,6


In [13]:
# NOTE: a slice inside plain [] selects ROWS by index label, not columns.
# The bounds 'one'..'three' are matched against the row labels, so only
# 'rabbit' falls inside the range. For a range of columns use
# df.loc[:, 'one':'three'] instead.
df['one':'three']

Unnamed: 0,one,two,three
rabbit,4,5,6


In [14]:
# A list of column names (rather than a slice) is what selects columns with plain [].
df[['one','two']]

Unnamed: 0,one,two
mouse,1,2
rabbit,4,5


### Pandas idxmax and idxmin

`idxmax` returns the index label of the first occurrence of the maximum along the requested axis; `idxmin` does the same for the minimum.

In [17]:
# Per-food figures used for the idxmax / idxmin examples; rows are labelled by food.
df = pd.DataFrame(
    index=['Pork', 'Wheat Products', 'Beef'],
    data={
        'consumption': [10.51, 103.11, 55.48],
        'co2_emissions': [37.2, 19.66, 1712],
    },
)

In [18]:
df

Unnamed: 0,consumption,co2_emissions
Pork,10.51,37.2
Wheat Products,103.11,19.66
Beef,55.48,1712.0


In [20]:
# Index label of the row holding the largest 'consumption' value.
df['consumption'].idxmax()

'Wheat Products'

In [22]:
# With no axis given: for each column, the row label where that column peaks.
df.idxmax()

consumption      Wheat Products
co2_emissions              Beef
dtype: object

In [23]:
# axis=1: for each row, the name of the column holding that row's maximum.
df.idxmax(axis=1)

Pork              co2_emissions
Wheat Products      consumption
Beef              co2_emissions
dtype: object

In [24]:
# axis=0 spelled out explicitly; gives the same result as df.idxmax() with no axis.
df.idxmax(axis=0)

consumption      Wheat Products
co2_emissions              Beef
dtype: object

In [25]:
df.idxmin(axis=1) # for each row: the column name holding that row's minimum

Pork                consumption
Wheat Products    co2_emissions
Beef                consumption
dtype: object

In [26]:
df.idxmin(axis=0) # for each column: the row label holding that column's minimum

consumption                Pork
co2_emissions    Wheat Products
dtype: object

In [36]:
# The string alias "columns" behaves like axis=1 here (one result per row).
df.idxmin(axis="columns")

Pork                consumption
Wheat Products    co2_emissions
Beef                consumption
dtype: object

In [33]:
df.idxmin(axis=1) 

Pork                consumption
Wheat Products    co2_emissions
Beef                consumption
dtype: object

In [32]:
# Row-wise total of the two measured columns. Selecting the source columns
# explicitly keeps this cell idempotent: a bare df.sum(axis=1) would also add
# the previously stored 'sum' column if the cell were executed a second time.
df['sum'] = df[['consumption', 'co2_emissions']].sum(axis=1)

In [30]:
df

Unnamed: 0,consumption,co2_emissions,sum
Pork,10.51,37.2,47.71
Wheat Products,103.11,19.66,122.77
Beef,55.48,1712.0,1767.48


### Set index


In [42]:
# Small sales table used for the set_index examples.
df = pd.DataFrame(
    data=dict(
        month=[1, 4, 5, 10],
        year=[2021, 2020, 2023, 2022],
        sale=[20, 40, 50, 70],
    )
)

In [43]:
df

Unnamed: 0,month,year,sale
0,1,2021,20
1,4,2020,40
2,5,2023,50
3,10,2022,70


In [44]:
# Use the 'month' column as the row index. The result is a new frame;
# df keeps its 'month' column, as the following cells still rely on it.
df.set_index('month')

Unnamed: 0_level_0,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2021,20
4,2020,40
5,2023,50
10,2022,70


In [46]:
# A list of column names builds a two-level index: year first, then month.
df.set_index(['year','month'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2021,1,20
2020,4,40
2023,5,50
2022,10,70


In [48]:
# Index levels may also come from array-likes that are not columns of df:
# here the two levels are s and s**2, and every original column is kept.
s = pd.Series([1,2,3,4])
df.set_index([s,s**2])

Unnamed: 0,Unnamed: 1,month,year,sale
1,1,1,2021,20
2,4,4,2020,40
3,9,5,2023,50
4,16,10,2022,70


In [50]:
# A Series and a plain list can be mixed as the index levels.
s = pd.Series(['a','b','c','d'])
df.set_index([s,[1,2,3,4]])

Unnamed: 0,Unnamed: 1,month,year,sale
a,1,1,2021,20
b,2,4,2020,40
c,3,5,2023,50
d,4,10,2022,70


### Replace

In [54]:
# Integer Series in which the value 1 appears twice, for the replace example.
s = pd.Series(data=[1, 2, 1, 4, 5])
s

0    1
1    2
2    1
3    4
4    5
dtype: int64

In [55]:
# Swap every occurrence of the value 1 for 5.
s.replace(1,5)

0    5
1    2
2    5
3    4
4    5
dtype: int64

In [56]:
# Two integer columns and one letter column for the DataFrame.replace examples.
df = pd.DataFrame(
    {
        'A': list(range(5)),
        'B': list(range(5, 10)),
        'C': list('abcde'),
    }
)
df

Unnamed: 0,A,B,C
0,0,5,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,9,e


In [58]:
# Replace the value 0 with 5 wherever it appears in the frame (here only in column A).
df.replace(0,5)

Unnamed: 0,A,B,C
0,5,5,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,9,e


In [59]:
# Several values mapped to one replacement: 0, 1, 2 and 3 all become 4.
df.replace([0,1,2,3],4)

Unnamed: 0,A,B,C
0,4,5,a
1,4,6,b
2,4,7,c
3,4,8,d
4,4,9,e


In [60]:
# Pairwise mapping by position: 0->4, 1->3, 2->2, 3->1.
# Replacements are not chained (the 3 that becomes 1 is not turned into 3 again).
df.replace([0,1,2,3],[4,3,2,1])

Unnamed: 0,A,B,C
0,4,5,a
1,3,6,b
2,2,7,c
3,1,8,d
4,4,9,e


### Sort values

In [61]:
# Frame for the sort_values examples: col1 contains a missing value and
# col4 mixes upper- and lower-case letters.
df = pd.DataFrame(
    dict(
        col1=['A', 'A', 'B', np.nan, 'D', 'C'],
        col2=[2, 1, 9, 8, 7, 4],
        col3=[0, 1, 9, 4, 2, 3],
        col4=['a', 'B', 'c', 'D', 'e', 'F'],
    )
)
df

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F


In [62]:
# Order the rows by col2, smallest first; the original index labels travel with the rows.
df.sort_values(by='col2')

Unnamed: 0,col1,col2,col3,col4
1,A,1,1,B
0,A,2,0,a
5,C,4,3,F
4,D,7,2,e
3,,8,4,D
2,B,9,9,c


In [64]:
# Order the rows by col3, largest first.
df.sort_values(by='col3', ascending=False)

Unnamed: 0,col1,col2,col3,col4
2,B,9,9,c
3,,8,4,D
5,C,4,3,F
4,D,7,2,e
1,A,1,1,B
0,A,2,0,a


In [66]:
# Sort by col2, using col1 only to break ties. col2 has no repeated values
# here, so the result matches sorting by col2 alone.
df.sort_values(by=['col2','col1'])

Unnamed: 0,col1,col2,col3,col4
1,A,1,1,B
0,A,2,0,a
5,C,4,3,F
4,D,7,2,e
3,,8,4,D
2,B,9,9,c


In [67]:
# Sort by col1 then col2, both descending; na_position='first' places the
# row with the missing col1 value at the top instead of the bottom.
df.sort_values(by=['col1','col2'],ascending=False,na_position='first')

Unnamed: 0,col1,col2,col3,col4
3,,8,4,D
4,D,7,2,e
5,C,4,3,F
2,B,9,9,c
0,A,2,0,a
1,A,1,1,B


In [68]:
# key is applied to the column before comparing: lower-casing col4 makes the sort case-insensitive.
df.sort_values(by='col4', key=lambda x: x.str.lower())

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F


In [71]:
# Ascending sort on upper-cased col1 values, with the missing value listed first.
df.sort_values(by=['col1'], key=lambda x: x.str.upper(), ascending=True, na_position='first')

Unnamed: 0,col1,col2,col3,col4
3,,8,4,D
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
5,C,4,3,F
4,D,7,2,e


### Sort index

In [77]:
# Single-column frame whose integer index is deliberately out of order.
df = pd.DataFrame({'A': [1, 2, 3, 4, 5]}, index=[100, 29, 234, 1, 150])
df

Unnamed: 0,A
100,1
29,2
234,3
1,4
150,5


In [78]:
# Re-order the rows by index label, smallest label first.
df.sort_index()

Unnamed: 0,A
1,4
29,2
100,1
150,5
234,3


In [79]:
# Same, but with the largest index label first.
df.sort_index(ascending=False)

Unnamed: 0,A
234,3
150,5
100,1
29,2
1,4


In [80]:
# Frame whose string index mixes upper- and lower-case labels.
df = pd.DataFrame(data=dict(a=[1, 2, 3, 4]), index=list('AbCd'))
df

Unnamed: 0,a
A,1
b,2
C,3
d,4


In [81]:
# key is applied to the index before comparing: lower-casing the labels sorts them case-insensitively.
df.sort_index(key=lambda x: x.str.lower())

Unnamed: 0,a
A,1
b,2
C,3
d,4


### nlargest

In [82]:
# Country table for the nlargest / nsmallest examples. The population column
# deliberately contains repeated values so that tie handling can be shown.
df = pd.DataFrame(
    {
        'population': [59_000_000, 65_000_000] + [434_000] * 3 + [337_000] + [11_300] * 3,
        'GDP': [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
        'alpha-2': ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
    },
    index=[
        "Italy", "France", "Malta",
        "Maldives", "Brunei", "Iceland",
        "Nauru", "Tuvalu", "Anguilla",
    ],
)

In [83]:
df

Unnamed: 0,population,GDP,alpha-2
Italy,59000000,1937894,IT
France,65000000,2583560,FR
Malta,434000,12011,MT
Maldives,434000,4520,MV
Brunei,434000,12128,BN
Iceland,337000,17036,IS
Nauru,11300,182,NR
Tuvalu,11300,38,TV
Anguilla,11300,311,AI


In [86]:
# Re-order the rows alphabetically by country name and keep that order in df;
# the tie-handling examples that follow depend on this row order.
df = df.sort_index()
df

Unnamed: 0,population,GDP,alpha-2
Anguilla,11300,311,AI
Brunei,434000,12128,BN
France,65000000,2583560,FR
Iceland,337000,17036,IS
Italy,59000000,1937894,IT
Maldives,434000,4520,MV
Malta,434000,12011,MT
Nauru,11300,182,NR
Tuvalu,11300,38,TV


In [87]:
# Top 3 rows by population. Three countries tie at 434000; by default the
# one that comes first in the current row order (Brunei) is kept.
df.nlargest(3,'population')

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Brunei,434000,12128,BN


In [88]:
# Full descending sort by population, for comparison with the nlargest result.
df.sort_values(by='population',ascending=False)

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Brunei,434000,12128,BN
Maldives,434000,4520,MV
Malta,434000,12011,MT
Iceland,337000,17036,IS
Anguilla,11300,311,AI
Nauru,11300,182,NR
Tuvalu,11300,38,TV


In [90]:
# On a tie, keep='last' keeps the tied row that comes last in the current
# row order (Malta) instead of the first.
df.nlargest(3,'population',keep='last')

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Malta,434000,12011,MT


In [91]:
# keep='all' keeps every row tied for the last place, so more than the
# requested 3 rows can come back (5 here).
df.nlargest(3,'population',keep='all')

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Brunei,434000,12128,BN
Maldives,434000,4520,MV
Malta,434000,12011,MT


In [92]:
# Rank by population and break population ties with GDP: Brunei has the
# highest GDP among the three countries tied at 434000.
df.nlargest(3,['population','GDP'])

Unnamed: 0,population,GDP,alpha-2
France,65000000,2583560,FR
Italy,59000000,1937894,IT
Brunei,434000,12128,BN


### nsmallest

In [93]:
# The 4 rows with the lowest GDP. Every GDP value is distinct, so keep='last'
# has no visible effect on this data.
df.nsmallest(4,'GDP',keep='last')

Unnamed: 0,population,GDP,alpha-2
Tuvalu,11300,38,TV
Nauru,11300,182,NR
Anguilla,11300,311,AI
Maldives,434000,4520,MV


In [94]:
# Rank by GDP with population as a tie-breaker; there are no GDP ties here,
# so the result matches ranking by GDP alone.
df.nsmallest(4,['GDP','population'])

Unnamed: 0,population,GDP,alpha-2
Tuvalu,11300,38,TV
Nauru,11300,182,NR
Anguilla,11300,311,AI
Maldives,434000,4520,MV


### Transpose

In [97]:
# Plain dict mapping column name -> list of values.
d1 = dict(col1=[1, 2], col2=[3, 4])
d1

{'col1': [1, 2], 'col2': [3, 4]}

In [99]:
# Each dict key becomes a column; the rows receive a default 0, 1, ... index.
df = pd.DataFrame(data=d1)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [102]:
# .T swaps rows and columns: the column names become the index and the
# old row labels (0, 1) become the columns.
d_transposed = df.T
d_transposed

Unnamed: 0,0,1
col1,1,2
col2,3,4


### Append

In [104]:
# First of two small frames sharing the same columns and row labels.
df = pd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['x', 'y'])
df

Unnamed: 0,A,B
x,1,2
y,3,4


In [105]:
# Second frame with the same columns and row labels, holding different values.
df2 = pd.DataFrame({'A': [5, 7], 'B': [6, 8]}, index=['x', 'y'])
df2

Unnamed: 0,A,B
x,5,6
y,7,8


In [106]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat stacks the rows of both frames and keeps their original labels,
# so the duplicated 'x'/'y' index labels are preserved.
pd.concat([df, df2])

Unnamed: 0,A,B
x,1,2
y,3,4
x,5,6
y,7,8


In [109]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True discards the original row labels and renumbers the
# stacked rows 0..n-1.
new_df = pd.concat([df, df2], ignore_index=True)

In [111]:
# Derive a new column C holding twice the value of column A.
new_df['C'] = new_df['A']*2
new_df

Unnamed: 0,A,B,C
0,1,2,2
1,3,4,6
2,5,6,10
3,7,8,14


### Compare the difference between 2 dataframes

In [112]:
# Baseline frame for DataFrame.compare; col2 contains one missing value.
df = pd.DataFrame(
    dict(
        col1=list("aabba"),
        col2=[1.0, 2.0, 3.0, np.nan, 5.0],
        col3=[float(v) for v in range(1, 6)],
    ),
    columns=["col1", "col2", "col3"],
)
df

Unnamed: 0,col1,col2,col3
0,a,1.0,1.0
1,a,2.0,2.0
2,b,3.0,3.0
3,b,,4.0
4,a,5.0,5.0


In [113]:
# Work on a copy so that the edits made to df2 below leave df unchanged.
df2 = df.copy()

In [114]:
# Introduce two differences relative to df: one in col1 (row 0) and one in col2 (row 2).
df2.loc[0,'col1'] = 'c'
df2.loc[2,'col2'] = 4
df2

Unnamed: 0,col1,col2,col3
0,c,1.0,1.0
1,a,2.0,2.0
2,b,4.0,3.0
3,b,,4.0
4,a,5.0,5.0


In [115]:
# Show only the rows and columns that differ; 'self' is df, 'other' is df2.
# Cells that are equal in both frames are shown as NaN.
df.compare(df2)

Unnamed: 0_level_0,col1,col1,col2,col2
Unnamed: 0_level_1,self,other,self,other
0,a,c,,
2,,,3.0,4.0


In [120]:
# keep_equal=True shows the actual values for equal cells instead of NaN.
df.compare(df2, keep_equal=True)

Unnamed: 0_level_0,col1,col1,col2,col2
Unnamed: 0_level_1,self,other,self,other
0,a,c,1.0,1.0
2,b,b,3.0,4.0


In [121]:
# keep_shape=True keeps every row and column, including those with no differences.
df.compare(df2,keep_shape=True)

Unnamed: 0_level_0,col1,col1,col2,col2,col3,col3
Unnamed: 0_level_1,self,other,self,other,self,other
0,a,c,,,,
1,,,,,,
2,,,3.0,4.0,,
3,,,,,,
4,,,,,,


In [123]:
# Both options together: the full side-by-side view with all values filled in.
df.compare(df2,keep_shape=True,keep_equal=True)

Unnamed: 0_level_0,col1,col1,col2,col2,col3,col3
Unnamed: 0_level_1,self,other,self,other,self,other
0,a,c,1.0,1.0,1.0,1.0
1,a,a,2.0,2.0,2.0,2.0
2,b,b,3.0,4.0,3.0,3.0
3,b,b,,,4.0,4.0
4,a,a,5.0,5.0,5.0,5.0


In [116]:
df

Unnamed: 0,col1,col2,col3
0,a,1.0,1.0
1,a,2.0,2.0
2,b,3.0,3.0
3,b,,4.0
4,a,5.0,5.0


In [117]:
# Position-based lookup: the scalar at row position 0, column position 2 (col3).
df.iloc[0,2]

1.0

In [118]:
# Position slices exclude the end point: rows 0-1 and only the first column.
df.iloc[0:2,0:1]

Unnamed: 0,col1
0,a
1,a


In [119]:
# Label slices include both end points: rows labelled 0 through 3 and
# columns col1 through col3.
df.loc[0:3,'col1':'col3']

Unnamed: 0,col1,col2,col3
0,a,1.0,1.0
1,a,2.0,2.0
2,b,3.0,3.0
3,b,,4.0


### Join

In [157]:
# Left-hand frame for the join examples: six keys K0..K5 with matching A values.
df = pd.DataFrame(
    {
        'key': [f'K{i}' for i in range(6)],
        'A': [f'A{i}' for i in range(6)],
    }
)
df

Unnamed: 0,key,A
0,K0,A0
1,K1,A1
2,K2,A2
3,K3,A3
4,K4,A4
5,K5,A5


In [158]:
# Right-hand frame for the join examples: only the first three keys are present.
other = pd.DataFrame(
    {
        'key': [f'K{i}' for i in range(3)],
        'B': [f'B{i}' for i in range(3)],
    }
)
other

Unnamed: 0,key,B
0,K0,B0
1,K1,B1
2,K2,B2


In [140]:
# join aligns the two frames on their row index (0..5 vs 0..2), keeping every
# row of df. Both frames have a 'key' column, so the suffixes tell them apart;
# rows with no partner in `other` get NaN.
df.join(other, lsuffix='_caller',rsuffix='_other')

Unnamed: 0,key_caller,A,key_other,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K2,A2,K2,B2
3,K3,A3,,
4,K4,A4,,
5,K5,A5,,


In [141]:
# Make 'key' the index of both frames first, so the join matches rows by key
# value rather than by row position; no suffixes are needed any more.
df.set_index('key').join(other.set_index('key'))

Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


### Merge

In [142]:
# Two frames whose key columns have different names (lkey / rkey) and which
# both carry a 'value' column; 'foo' appears twice on each side.
df1 = pd.DataFrame(dict(lkey=['foo', 'bar', 'baz', 'foo'], value=[1, 2, 3, 5]))
df2 = pd.DataFrame(dict(rkey=['foo', 'bar', 'baz', 'foo'], value=[5, 6, 7, 8]))

In [143]:
df1

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5


In [144]:
df2

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [145]:
# Match df1.lkey against df2.rkey. 'foo' occurs twice on each side, giving
# 2 x 2 = 4 'foo' rows; the shared 'value' column is split into value_x / value_y.
df1.merge(df2,left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,value_x,rkey,value_y
0,foo,1,foo,5
1,foo,1,foo,8
2,foo,5,foo,5
3,foo,5,foo,8
4,bar,2,bar,6
5,baz,3,baz,7


In [146]:
# Two frames sharing the key column 'a'; only 'foo' is present in both.
df1 = pd.DataFrame(dict(a=['foo', 'bar'], b=[1, 2]))
df2 = pd.DataFrame(dict(a=['foo', 'baz'], c=[3, 4]))
df1

Unnamed: 0,a,b
0,foo,1
1,bar,2


In [147]:
df2

Unnamed: 0,a,c
0,foo,3
1,baz,4


In [148]:
# Inner merge on 'a': only keys present in both frames survive ('foo').
df1.merge(df2, how='inner', on='a')

Unnamed: 0,a,b,c
0,foo,1,3


In [151]:
# Left merge on 'a': every row of df1 is kept; 'bar' has no match in df2,
# so its c value is missing and column c is shown as float.
df1.merge(df2,how='left',on='a')

Unnamed: 0,a,b,c
0,foo,1,3.0
1,bar,2,


In [152]:
# Cross merge: every row of df1 paired with every row of df2 (2 x 2 = 4 rows).
# No key is used, so the shared column 'a' appears as a_x / a_y.
df1.merge(df2,how='cross')

Unnamed: 0,a_x,b,a_y,c
0,foo,1,foo,3
1,foo,1,baz,4
2,bar,2,foo,3
3,bar,2,baz,4
