In [1]:
import pandas as pd

In [3]:
data = {
    'name':['Jack','Mary','Lily','Tom','Joe', None],
    'age':[18,None,21,25,24,None],
    'score':['A','B','A', None, None, None],
}

df = pd.DataFrame(data, columns=['name','age','score'])
df

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,
5,,,


In [4]:
df.isnull().sum()

name     1
age      2
score    3
dtype: int64

In [5]:
df.dropna()

Unnamed: 0,name,age,score
0,Jack,18.0,A
2,Lily,21.0,A


In [6]:
df.dropna(how='all') # any

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,


In [7]:
df

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,
5,,,


In [8]:
df.dropna(subset=['age'])

Unnamed: 0,name,age,score
0,Jack,18.0,A
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,


In [9]:
df.fillna(0)

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,0.0,B
2,Lily,21.0,A
3,Tom,25.0,0
4,Joe,24.0,0
5,0,0.0,0


In [10]:
df.fillna({'age':18,'score':'D'})

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,18.0,B
2,Lily,21.0,A
3,Tom,25.0,D
4,Joe,24.0,D
5,,18.0,D


In [11]:
df

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,
5,,,


In [12]:
df.fillna(method='ffill') # front

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,18.0,B
2,Lily,21.0,A
3,Tom,25.0,A
4,Joe,24.0,A
5,Joe,24.0,A


In [13]:
df.fillna(method='bfill') # back

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,21.0,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,
5,,,


In [15]:
df.fillna({'age':df.age.mean()})

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,22.0,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,
5,,22.0,


In [16]:
df.fillna({'age':df.age.median()})

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,22.5,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,
5,,22.5,


In [17]:
# 插值
df

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,
5,,,


In [18]:
df.interpolate()

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,19.5,B
2,Lily,21.0,A
3,Tom,25.0,
4,Joe,24.0,
5,,24.0,


In [19]:
# df.interpolate? 这里问号是帮助的意思

In [21]:
df.fillna({'score':'missing'}, limit=1)

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
5,,,


In [22]:
df2 = df.fillna({'score':'missing'}, limit=1)

In [24]:
df2

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
5,,,


In [25]:
df.fillna({'score':'missing'}, limit=1, inplace=True)

In [26]:
df

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
5,,,


# 异常值处理

In [27]:
df.loc[6] = {'name':'test','age':999,'score':'C'}

In [28]:
df

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
5,,,
6,test,999.0,C


In [34]:
q_upper = df['age'].quantile(0.75)
q_upper

25.0

In [35]:
q_lower = df['age'].quantile(0.25)
q_lower

21.0

In [36]:
val = q_upper - q_lower
val

4.0

In [37]:
k = 1.5

In [38]:
df[df['age'] > q_upper + k * val]

Unnamed: 0,name,age,score
6,test,999.0,C


In [39]:
df2 = df.drop(5)

In [40]:
df2

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
6,test,999.0,C


In [41]:
df[(df['age'] < q_upper + k * val) & (df['age'] > q_upper - k * val)]

Unnamed: 0,name,age,score
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,


# 重复值

In [42]:
df

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
5,,,
6,test,999.0,C


In [43]:
df.loc[7] = {'name':'test','age':999,'score':'C'}

In [44]:
df.loc[8] = {'name':'test','age':999,'score':'C'}

In [45]:
df

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
5,,,
6,test,999.0,C
7,test,999.0,C
8,test,999.0,C


In [46]:
df.drop_duplicates()

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
5,,,
6,test,999.0,C


In [47]:
df.drop_duplicates(['age','score'])

Unnamed: 0,name,age,score
0,Jack,18.0,A
1,Mary,,B
2,Lily,21.0,A
3,Tom,25.0,missing
4,Joe,24.0,
5,,,
6,test,999.0,C
