In [1]:
import numpy as np
import pandas as pd

In [3]:
# Check which dtype NumPy infers when the data contains None.
vals1 = np.array((1, None, 2, 3))
vals1

array([1, None, 2, 3], dtype=object)

In [4]:
# Summing this array is expected to fail with a TypeError (int + None is not
# defined). The error is the point of the demonstration, so catch it and print
# it instead of letting the cell abort a "Restart & Run All".
try:
    print(vals1.sum())
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [10]:
# Check which dtype NumPy infers when the data contains np.nan.
vals2 = np.array((1, np.nan, 2, 3))
print(vals2.dtype)

# See what arithmetic involving np.nan evaluates to.
print(1 + np.nan, 0 * np.nan, sep='\n')

# See what the regular aggregations return when np.nan is present.
print(np.sum(vals2), np.min(vals2), np.max(vals2))

# NaN-aware aggregations that skip np.nan.
print(*(nan_agg(vals2) for nan_agg in (np.nansum, np.nanmin, np.nanmax)))

float64
nan
nan
nan nan nan
6.0 1.0 3.0


In [13]:
# Check how pandas handles a Series that contains both NaN and None.
y = pd.Series(data=[1, np.nan, 2, None])
print(y)

# Watch how the dtype changes once a None is written into an integer Series.
x = pd.Series([0, 1], dtype=int)
print(x)

x[0] = None
print(x)

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64
0    0
1    1
dtype: int64
0    NaN
1    1.0
dtype: float64


In [14]:
# Show how a nullable integer dtype renders all three missing-value markers.
print(pd.Series(data=[1, np.nan, 2, None, pd.NA], dtype=pd.Int32Dtype()))

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32


In [16]:
data = pd.Series(data=[1, np.nan, 'hello', None])
print(data.isna())

# The boolean mask can be used directly as an index.
print(data[~data.isna()])

0    False
1     True
2    False
3     True
dtype: bool
0        1
2    hello
dtype: object


In [27]:
# Call dropna() directly: it returns a new Series without the missing entries.
print(data.dropna())

# Print the original as well: dropna() did not modify `data` in place, so the
# NaN / None entries are still there.
print(data)

0        1
2    hello
dtype: object
0        1
1      NaN
2    hello
3     None
dtype: object


In [29]:
df = pd.DataFrame(data=[
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
])
print(df)

# By default, every row that contains a null value is removed.
print(df.dropna(axis=0))

# A different axis can be given: axis=1 / axis='columns' removes every column
# that contains a null value.
print(df.dropna(axis=1))

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6
     0    1  2
1  2.0  3.0  5
   2
0  2
1  5
2  6


In [30]:
# Add an all-NaN column so the how='all' option has something to drop.
# Note: this assignment modifies `df` in place.
df[3] = np.nan
print(df)

# how='all' drops only the columns whose values are all null.
print(df.dropna(axis='columns', how='all'))

# thresh=3 keeps only the rows that have at least three non-null values.
# 'index' is the documented name for axis 0 (the original used the
# undocumented alias 'rows'); the result is the same.
print(df.dropna(axis='index', thresh=3))

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN
     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6
     0    1  2   3
1  2.0  3.0  5 NaN


In [35]:
data = pd.Series(
    data=[1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
    dtype=pd.Int32Dtype(),
)

print(data)

# Fill every missing entry with a single value.
print(data.fillna(value=0))

# Forward fill: copy the previous value.
print(data.ffill())

# Backward fill: copy the next value.
print(data.bfill())



a       1
b    <NA>
c       2
d    <NA>
e       3
dtype: Int32
a    1
b    0
c    2
d    0
e    3
dtype: Int32
a    1
b    1
c    2
d    2
e    3
dtype: Int32
a    1
b    2
c    2
d    3
e    3
dtype: Int32


In [37]:
print(df)

# Forward / backward fill; where there is no value to copy from, the null stays.
print(df.ffill(axis=0))
print(df.bfill(axis=0))

# The axis to fill along can also be specified.
print(df.ffill(axis='columns'))

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN
     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  2.0  4.0  6 NaN
     0    1  2   3
0  1.0  3.0  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN
     0    1    2    3
0  1.0  1.0  2.0  2.0
1  2.0  3.0  5.0  5.0
2  NaN  4.0  6.0  6.0
