Exercises based on instruction at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html

In [1]:
import pandas as pd

In [53]:
import numpy as np
import matplotlib.pyplot as plt

## Set_index

In [29]:
df = pd.DataFrame({'month':[1,2,4,7],
                 'year':[2020,2021,2022,2022],
                  'sale':[100,200,220,330]})

In [30]:
df

Unnamed: 0,month,year,sale
0,1,2020,100
1,2,2021,200
2,4,2022,220
3,7,2022,330


In [31]:
df.describe()

Unnamed: 0,month,year,sale
count,4.0,4.0,4.0
mean,3.5,2021.25,212.5
std,2.645751,0.957427,94.295634
min,1.0,2020.0,100.0
25%,1.75,2020.75,175.0
50%,3.0,2021.5,210.0
75%,4.75,2022.0,247.5
max,7.0,2022.0,330.0


In [32]:
# Set index to be the month column
df.set_index('month')

Unnamed: 0_level_0,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2020,100
2,2021,200
4,2022,220
7,2022,330


In [33]:
df.set_index('sale')

Unnamed: 0_level_0,month,year
sale,Unnamed: 1_level_1,Unnamed: 2_level_1
100,1,2020
200,2,2021
220,4,2022
330,7,2022


In [34]:
df.set_index('year')

Unnamed: 0_level_0,month,sale
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2020,1,100
2021,2,200
2022,4,220
2022,7,330


In [35]:
# create a multiindex using column year and month
df.set_index(['year','month'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2020,1,100
2021,2,200
2022,4,220
2022,7,330


In [36]:
# create a multiindex using column year and sale
df.set_index(['year','sale'])

Unnamed: 0_level_0,Unnamed: 1_level_0,month
year,sale,Unnamed: 2_level_1
2020,100,1
2021,200,2
2022,220,4
2022,330,7


In [42]:
# Create an multiindex using an index and a column
df.set_index([pd.Index([1,2,3,4]),'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,month,sale
Unnamed: 0_level_1,year,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2020,1,100
2,2021,2,200
3,2022,4,220
4,2022,7,330


In [43]:
df.set_index([pd.Index([3,4,5,6]), 'month'])

Unnamed: 0_level_0,Unnamed: 1_level_0,year,sale
Unnamed: 0_level_1,month,Unnamed: 2_level_1,Unnamed: 3_level_1
3,1,2020,100
4,2,2021,200
5,4,2022,220
6,7,2022,330


In [44]:
# Create a multiindex using 2 series
s = pd.Series([1,2,3,4])

In [45]:
df.set_index([s,s*2])

Unnamed: 0,Unnamed: 1,month,year,sale
1,2,1,2020,100
2,4,2,2021,200
3,6,4,2022,220
4,8,7,2022,330


## DataFrame

In [47]:
pd.DataFrame({'col1':[1,2], 'col2':[3,4]})

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [49]:
d = {'col1':[1,2], 'col2':[3,4]}
df = pd.DataFrame(data=d)

In [50]:
df.dtypes

col1    int64
col2    int64
dtype: object

In [58]:
# To enforce the data types to int8
df = pd.DataFrame(data=d, dtype=np.int8)

In [59]:
df.dtypes

col1    int8
col2    int8
dtype: object

In [60]:
pd.Series([2,3], index=[2,3])

2    2
3    3
dtype: int64

In [62]:
d = {'col1':[0,1,2,3], 'col2':pd.Series([2,3], index=[2,3])}
df = pd.DataFrame(data=d, index=[1,2,3,4])

In [64]:
df

Unnamed: 0,col1,col2
1,0,
2,1,2.0
3,2,3.0
4,3,


In [70]:
# Constructing dataframe from numpy array
df2 = pd.DataFrame(data = np.array([[1,2,3],[4,5,6],[7,8,9]]), columns = ['x','y','z'], index=['a','b','c'])
df2

Unnamed: 0,x,y,z
a,1,2,3
b,4,5,6
c,7,8,9


In [71]:
df3 = pd.DataFrame(np.array([[1,2,3],[4,5,6],[0,9,2]]), columns=['march','april','may'])

In [72]:
df3

Unnamed: 0,march,april,may
0,1,2,3
1,4,5,6
2,0,9,2


In [80]:
# Constructing dataframe from numpy array that has labelled columns
data = np.array([(1,2,3),(4,5,6),(7,8,9)], dtype=[('a','i4'),('b','i4'),('c','i4')])
df4 = pd.DataFrame(data, columns = ['a','b'])

In [81]:
df4

Unnamed: 0,a,b
0,1,2
1,4,5
2,7,8


In [78]:
df5 = pd.DataFrame(np.array([(4,4,5),(5,6,7),(6,7,8)]), columns=['a','b','c'])

In [79]:
df5

Unnamed: 0,a,b,c
0,4,4,5
1,5,6,7
2,6,7,8


In [83]:
df6 = pd.DataFrame(np.array([[1.2,3.4],[2.3,4.5]]), dtype=np.int8)

In [84]:
df6

Unnamed: 0,0,1
0,1,3
1,2,4


### DataFrame.at

In [85]:
df = pd.DataFrame([[0,2,3],[0,4,1],[10,20,30]], index=[4,5,6], columns=['sale1','sale2','sale3'])

In [86]:
df

Unnamed: 0,sale1,sale2,sale3
4,0,2,3
5,0,4,1
6,10,20,30


In [87]:
df.at[4,'sale2']

2

In [88]:
df.at[4,'sale2'] = 14

In [89]:
df

Unnamed: 0,sale1,sale2,sale3
4,0,14,3
5,0,4,1
6,10,20,30


In [92]:
df.loc[5].at['sale1']

0

### DataFrame.head

In [96]:
df = pd.DataFrame({'animals':['lion','zebra','squirel','chicken','monkey','fish','shark','octopus','pig','cow']})

In [97]:
df.head()

Unnamed: 0,animals
0,lion
1,zebra
2,squirel
3,chicken
4,monkey


In [98]:
df.head(3)

Unnamed: 0,animals
0,lion
1,zebra
2,squirel


In [99]:
df.head(-2)

Unnamed: 0,animals
0,lion
1,zebra
2,squirel
3,chicken
4,monkey
5,fish
6,shark
7,octopus


### DataFrame.loc

In [100]:
df = pd.DataFrame([[1,2],[3,4],[7,8]], index=['cobra','viper','sidewinder'], columns=['max_speed','shield'])

In [101]:
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,3,4
sidewinder,7,8


In [102]:
df.loc['viper']

max_speed    3
shield       4
Name: viper, dtype: int64

In [103]:
df.loc[['viper','sidewinder']]

Unnamed: 0,max_speed,shield
viper,3,4
sidewinder,7,8


In [104]:
df.loc['viper','shield']

4

In [105]:
df.loc['cobra':'sidewinder','max_speed']

cobra         1
viper         3
sidewinder    7
Name: max_speed, dtype: int64

In [106]:
df.loc[['cobra','viper'],'shield']

cobra    2
viper    4
Name: shield, dtype: int64

In [110]:
df.loc[[True, False, False], [True, False]]

Unnamed: 0,max_speed
cobra,1


In [111]:
df.loc[df['shield'] > 6]

Unnamed: 0,max_speed,shield
sidewinder,7,8


In [112]:
df.loc[df['shield'] > 7, 'shield']

sidewinder    8
Name: shield, dtype: int64

In [113]:
df.loc[df['shield'] > 7, ['shield']]

Unnamed: 0,shield
sidewinder,8


In [116]:
df.loc[df['shield'] == 8, ['max_speed']]

Unnamed: 0,max_speed
sidewinder,7


In [117]:
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,3,4
sidewinder,7,8


In [118]:
# Setting values
df.loc[['cobra','viper'],['shield']] = 50

In [119]:
df

Unnamed: 0,max_speed,shield
cobra,1,50
viper,3,50
sidewinder,7,8


In [120]:
df.loc[['cobra','viper'],'shield'] = 40

In [121]:
df

Unnamed: 0,max_speed,shield
cobra,1,40
viper,3,40
sidewinder,7,8


In [122]:
df.loc['cobra'] = 10
df

Unnamed: 0,max_speed,shield
cobra,10,10
viper,3,40
sidewinder,7,8


In [123]:
df.loc[['cobra','viper']] = 10

In [124]:
df

Unnamed: 0,max_speed,shield
cobra,10,10
viper,10,10
sidewinder,7,8


In [125]:
# Set values for entire row
df.loc['cobra','shield'] = 5

In [126]:
df

Unnamed: 0,max_speed,shield
cobra,10,5
viper,10,10
sidewinder,7,8


In [127]:
# Set values for entire column
df.loc[:,'shield'] = 1
df

Unnamed: 0,max_speed,shield
cobra,10,1
viper,10,1
sidewinder,7,1


In [133]:
# Set values for rows that matched a condition
df.loc[(df['max_speed'] > 9)| (df['shield'] > 5)] = 4

In [134]:
df

Unnamed: 0,max_speed,shield
cobra,4,4
viper,4,4
sidewinder,7,1


In [137]:
df.loc[(df['max_speed']== 7) & (df['shield'] == 0)] = 3
df

Unnamed: 0,max_speed,shield
cobra,4,4
viper,4,4
sidewinder,7,1


In [138]:
df = pd.DataFrame([[1,2],[3,4],[5,6]], index=[0,1,2],columns =['max_speed','shield'])

In [139]:
df

Unnamed: 0,max_speed,shield
0,1,2
1,3,4
2,5,6


In [140]:
df.iloc[0:1]

Unnamed: 0,max_speed,shield
0,1,2


In [145]:
# Getting values with MultiIndex
tuples = [('cobra','mark i'), ('cobra','mark ii'), ('sidewinder','mark i'), ('sidewinder','mark ii'),
         ('viper','mark i'), ('viper','mark ii')]
index = pd.MultiIndex.from_tuples(tuples)
values = [[12,2],[0,4],[10,20],[1,4],[7,1],[16,36]]
df = pd.DataFrame(values, columns = ['max_speed','shield'], index = index)

In [144]:
df

Unnamed: 0,Unnamed: 1,max_speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark ii,7,1
viper,mark ii,16,36


In [146]:
df.loc['cobra']

Unnamed: 0,max_speed,shield
mark i,12,2
mark ii,0,4


In [147]:
df.loc[('cobra','mark i')]

max_speed    12
shield        2
Name: (cobra, mark i), dtype: int64

In [148]:
df.loc['cobra','mark ii']

max_speed    0
shield       4
Name: (cobra, mark ii), dtype: int64

In [149]:
df.loc[[('cobra','mark i')]]

Unnamed: 0,Unnamed: 1,max_speed,shield
cobra,mark i,12,2


In [150]:
df.loc[('cobra','mark i'), 'shield']

2

In [151]:
# Slicing
df.loc['cobra':'viper']

Unnamed: 0,Unnamed: 1,max_speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark i,7,1
viper,mark ii,16,36


In [152]:
df.loc[('cobra','mark i'):('viper','mark i')]

Unnamed: 0,Unnamed: 1,max_speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark i,7,1


### Pandas iloc

In [153]:
df = pd.DataFrame({
    'a':[1,2,3,4],
    'b':[3,4,5,6],
    'c':[7,8,9,3],
})

In [154]:
df

Unnamed: 0,a,b,c
0,1,3,7
1,2,4,8
2,3,5,9
3,4,6,3


In [156]:
myDic = [{'a':1,'b':2,'c':3,'d':4},
         {'a':10,'b':20,'c':30,'d':40},
         {'a':100,'b':200,'c':300,'d':400}]

In [157]:
df = pd.DataFrame(myDic)

In [158]:
df

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,10,20,30,40
2,100,200,300,400


In [165]:
type(df.iloc[0])

pandas.core.series.Series

In [167]:
df.iloc[0]

a    1
b    2
c    3
d    4
Name: 0, dtype: int64

In [166]:
type(df.iloc[[0]])

pandas.core.frame.DataFrame

In [168]:
df.iloc[[0]]

Unnamed: 0,a,b,c,d
0,1,2,3,4


In [163]:
df.iloc[:,0]

0      1
1     10
2    100
Name: a, dtype: int64

In [173]:
df.iloc[[0,1],0:2]

Unnamed: 0,a,b
0,1,2
1,10,20


In [176]:
df.iloc[:,0:2]

Unnamed: 0,a,b
0,1,2
1,10,20
2,100,200


In [177]:
df.iloc[:3]

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,10,20,30,40
2,100,200,300,400


In [178]:
df.iloc[[True,False,True]]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,100,200,300,400


In [181]:
df.iloc[lambda x: x.index%2 ==0]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,100,200,300,400


In [182]:
df.iloc[lambda x: x.index % 2 != 0]

Unnamed: 0,a,b,c,d
1,10,20,30,40


In [183]:
# Index both rows and columns
df.iloc[0,1]

2

In [184]:
df.iloc[[0,1],[0,1]]

Unnamed: 0,a,b
0,1,2
1,10,20


In [186]:
df.iloc[1:3,0:3]

Unnamed: 0,a,b,c
1,10,20,30
2,100,200,300


In [188]:
df.iloc[:,[True,True,False,False]]

Unnamed: 0,a,b
0,1,2
1,10,20
2,100,200


In [189]:
df.insert(0,'col0',pd.Series([1,3,4]))

In [190]:
df

Unnamed: 0,col0,a,b,c,d
0,1,1,2,3,4
1,3,10,20,30,40
2,4,100,200,300,400


### items

In [191]:
df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
                  'population': [1864, 22000, 80000]},
                  index=['panda', 'polar', 'koala'])

In [192]:
df

Unnamed: 0,species,population
panda,bear,1864
polar,bear,22000
koala,marsupial,80000


In [199]:
for label, content in df.items():
    print(f'label:{label}')
    print(f'content:{content}', sep='\n')

label:species
content:panda         bear
polar         bear
koala    marsupial
Name: species, dtype: object
label:population
content:panda     1864
polar    22000
koala    80000
Name: population, dtype: int64


In [200]:
df.keys()

Index(['species', 'population'], dtype='object')

In [201]:
df.items()

<generator object DataFrame.items at 0x7f94be17a2d0>

In [205]:
dic = {1:2,2:3,3:4,4:5}
for k, v in dic.items():
    print(k,v)

1 2
2 3
3 4
4 5


In [206]:
df.iterrows()

<generator object DataFrame.iterrows at 0x7f94be17a150>

In [207]:
df

Unnamed: 0,species,population
panda,bear,1864
polar,bear,22000
koala,marsupial,80000


In [208]:
df.insert(2,'average',[23,34,56])

In [209]:
df

Unnamed: 0,species,population,average
panda,bear,1864,23
polar,bear,22000,34
koala,marsupial,80000,56


In [210]:
df[df['population'] > df['average']]

Unnamed: 0,species,population,average
panda,bear,1864,23
polar,bear,22000,34
koala,marsupial,80000,56


### agg function

In [215]:
df = pd.DataFrame([[1,2,3],
                  [4,5,6],
                  [7,8,9],
                  [np.nan,np.nan,np.nan]], columns = ['A','B',"C"], index=[1,2,3,4])

In [216]:
df

Unnamed: 0,A,B,C
1,1.0,2.0,3.0
2,4.0,5.0,6.0
3,7.0,8.0,9.0
4,,,


In [217]:
df.agg({'A':['sum','average'], 'B':['sum','min'], 'C':['max','median']})

Unnamed: 0,A,B,C
sum,12.0,15.0,
average,,,
min,,2.0,
max,,,9.0
median,,,6.0


In [218]:
df.agg(['min','max','sum'])

Unnamed: 0,A,B,C
min,1.0,2.0,3.0
max,7.0,8.0,9.0
sum,12.0,15.0,18.0


In [219]:
df.agg('sum', axis=0)

A    12.0
B    15.0
C    18.0
dtype: float64

In [220]:
df.agg('sum',axis=1)

1     6.0
2    15.0
3    24.0
4     0.0
dtype: float64

In [222]:
df.agg(x=('A','mean'), y=('B','min'), z=('C','sum'))

Unnamed: 0,A,B,C
x,4.0,,
y,,2.0,
z,,,18.0


In [223]:
df.agg({'A':['mean','max'], 'C':['sum','average']})

Unnamed: 0,A,C
mean,4.0,
max,7.0,
sum,,18.0
average,,


In [226]:
# Transfrom
df = pd.DataFrame({'col1':range(4), 'col2':range(3,7)})

In [227]:
df

Unnamed: 0,col1,col2
0,0,3
1,1,4
2,2,5
3,3,6


In [228]:
df.transform(lambda x: x+1)

Unnamed: 0,col1,col2
0,1,4
1,2,5
2,3,6
3,4,7


In [229]:
df.transform(lambda x: x//3)

Unnamed: 0,col1,col2
0,0,1
1,0,1
2,0,1
3,1,2


In [230]:
df.transform(lambda x: x*9 - 2)

Unnamed: 0,col1,col2
0,-2,25
1,7,34
2,16,43
3,25,52


In [231]:
s = pd.Series(range(10))
s.transform([np.sqrt, np.exp])

Unnamed: 0,sqrt,exp
0,0.0,1.0
1,1.0,2.718282
2,1.414214,7.389056
3,1.732051,20.085537
4,2.0,54.59815
5,2.236068,148.413159
6,2.44949,403.428793
7,2.645751,1096.633158
8,2.828427,2980.957987
9,3.0,8103.083928


In [233]:
df = pd.DataFrame({
    "Date": [
        "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05",
        "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"],
    "Data": [5, 8, 6, 1, 50, 100, 60, 120],
})

In [234]:
df.groupby('Date')['Data'].sum()

Date
2015-05-05    121
2015-05-06     66
2015-05-07    108
2015-05-08     55
Name: Data, dtype: int64

In [241]:
df.groupby('Date')['Data'].transform('sum')

0     55
1    108
2     66
3    121
4     55
5    108
6     66
7    121
Name: Data, dtype: int64

In [236]:
df.groupby('Date')['Data'].mean()

Date
2015-05-05    60.5
2015-05-06    33.0
2015-05-07    54.0
2015-05-08    27.5
Name: Data, dtype: float64

In [240]:
df.groupby('Date')['Data'].min()

Date
2015-05-05    1
2015-05-06    6
2015-05-07    8
2015-05-08    5
Name: Data, dtype: int64

In [242]:
df = pd.DataFrame({
    "c": [1, 1, 1, 2, 2, 2, 2],
    "type": ["m", "n", "o", "m", "m", "n", "n"]
})

In [243]:
df

Unnamed: 0,c,type
0,1,m
1,1,n
2,1,o
3,2,m
4,2,m
5,2,n
6,2,n


In [244]:
df.groupby('type')['c'].transform('sum')

0    5
1    5
2    1
3    5
4    5
5    5
6    5
Name: c, dtype: int64

In [248]:
df['Size'] = df.groupby('c')['type'].transform(len)
df

Unnamed: 0,c,type,Size
0,1,m,3
1,1,n,3
2,1,o,3
3,2,m,4
4,2,m,4
5,2,n,4
6,2,n,4


In [249]:
# Group by
df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
                              'Parrot', 'Parrot'],
                   'Max Speed': [380., 370., 24., 26.]})
df

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [251]:
df.groupby("Animal").mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,375.0
Parrot,25.0


In [253]:
arrays = [['Falcon','Falcon','Parrot','Parrot'],
         ['Captive','Wild','Captive','Wild']]
index = pd.MultiIndex.from_arrays(arrays, names=('Animal','Type'))
df = pd.DataFrame({'Max Speed':[390, 350, 30, 30]}, index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Max Speed
Animal,Type,Unnamed: 2_level_1
Falcon,Captive,390
Falcon,Wild,350
Parrot,Captive,30
Parrot,Wild,30


In [254]:
df.groupby(level=0).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,370
Parrot,30


In [255]:
df.groupby(level=1).mean()

Unnamed: 0_level_0,Max Speed
Type,Unnamed: 1_level_1
Captive,210
Wild,190


In [None]:
l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
df = pd.DataFrame(l, columns=["a", "b", "c"])

In [256]:
l = [[1,2,3],[1,None,4],[2,1,3],[1,2,2]]
df = pd.DataFrame(l, columns=['a','b','c'])

In [257]:
df

Unnamed: 0,a,b,c
0,1,2.0,3
1,1,,4
2,2,1.0,3
3,1,2.0,2


In [258]:
df.groupby(by='b', dropna=True).sum()

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5


In [259]:
df.groupby(by='a',dropna=False).sum()

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.0,9
2,1.0,3


In [260]:
df['a'].value_counts()

1    3
2    1
Name: a, dtype: int64

In [261]:
df['c'].max()

4

In [262]:
df['b'].min()

1.0

In [263]:
df['c'].unique()

array([3, 4, 2])

In [265]:
# Value counts
df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
                   'num_wings': [2, 0, 0, 0]},
                  index=['falcon', 'dog', 'cat', 'ant'])
df

Unnamed: 0,num_legs,num_wings
falcon,2,2
dog,4,0
cat,4,0
ant,6,0


In [266]:
df.value_counts()

num_legs  num_wings
4         0            2
2         2            1
6         0            1
dtype: int64

In [269]:
df.value_counts(ascending=False)

num_legs  num_wings
4         0            2
2         2            1
6         0            1
dtype: int64

In [270]:
df.value_counts(sort=False)

num_legs  num_wings
2         2            1
4         0            2
6         0            1
dtype: int64

### Drop

In [276]:
df = pd.DataFrame(np.arange(20).reshape(4,5), columns = ['A','B','C','D','E'])
df

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [277]:
df.drop(['B','C'], axis=1)

Unnamed: 0,A,D,E
0,0,3,4
1,5,8,9
2,10,13,14
3,15,18,19


In [278]:
df.drop(0, axis=0)

Unnamed: 0,A,B,C,D,E
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [279]:
df.drop(columns=['D','E'])

Unnamed: 0,A,B,C
0,0,1,2
1,5,6,7
2,10,11,12
3,15,16,17


In [281]:
df.drop([0,1])

Unnamed: 0,A,B,C,D,E
2,10,11,12,13,14
3,15,16,17,18,19


In [282]:
df.drop(index=2)

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
3,15,16,17,18,19


In [283]:
df.drop(index = [0,3])

Unnamed: 0,A,B,C,D,E
1,5,6,7,8,9
2,10,11,12,13,14


In [284]:
df.drop(index=[1,3])

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
2,10,11,12,13,14


In [286]:
df.drop(["B","E"],axis=1)

Unnamed: 0,A,C,D
0,0,2,3
1,5,7,8
2,10,12,13
3,15,17,18


### Drop duplicates

In [287]:
df = pd.DataFrame({
    'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5]
})
df

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [288]:
df.drop_duplicates()

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [289]:
# Remove duplicates based on specific columns, use 'subset'
df.drop_duplicates(subset='brand')

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5


In [291]:
df.drop_duplicates(subset='style')

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
3,Indomie,pack,15.0


In [292]:
# Remove duplicates but keep the last occurence
df.drop_duplicates(subset=['brand','style'], keep='last')

Unnamed: 0,brand,style,rating
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
4,Indomie,pack,5.0
