# 欠損値

## 欠損値の基礎知識
### データ型

In [1]:
import numpy as np
import pandas as pd

# Seeded generator so the sample values are reproducible.
rng = np.random.default_rng(seed=1)
# Four random floats sit on the even labels 0, 2, 4, 6; reindexing onto
# 0..3 keeps labels 0 and 2 and leaves labels 1 and 3 as NaN.
float_ser = pd.Series(
    data=rng.random(size=4),
    index=range(0, 8, 2),
).reindex(index=range(4))
float_ser

0    0.511822
1         NaN
2    0.950464
3         NaN
dtype: float64

In [2]:
# Four consecutive days placed on the even labels 0, 2, 4, 6; reindexing
# onto 0..3 leaves labels 1 and 3 missing, which datetimes show as NaT.
dt_ser = pd.Series(
    data=pd.date_range(start="2023-01-01", periods=4),
    index=range(0, 8, 2),
).reindex(index=range(4))
dt_ser

0   2023-01-01
1          NaT
2   2023-01-02
3          NaT
dtype: datetime64[ns]

In [3]:
# Nullable integer dtype: labels left empty by the reindex become <NA>
# and the Series stays Int64 instead of being upcast to float.
int_ser = pd.Series(
    data=rng.integers(low=0, high=10, size=4),
    index=range(0, 8, 2),
    dtype="Int64",
).reindex(index=range(4))
int_ser

0       2
1    <NA>
2       3
3    <NA>
dtype: Int64

In [4]:
# The nullable integer dtype is reported as an Int64Dtype instance.
int_ser.dtype

Int64Dtype()

### データの型変換

In [5]:
# int input: None becomes NaN and the Series is upcast to float64.
pd.Series([1, None, 3])

0    1.0
1    NaN
2    3.0
dtype: float64

In [6]:
# bool input: None is kept as-is and the Series falls back to object dtype.
pd.Series([True, None, False])

0     True
1     None
2    False
dtype: object

In [7]:
# float input: None becomes NaN and the dtype stays float64.
pd.Series([1.0, None, 3.0])

0    1.0
1    NaN
2    3.0
dtype: float64

In [8]:
# object (string) input: None is kept as-is in an object-dtype Series.
pd.Series(["a", None, "c"])

0       a
1    None
2       c
dtype: object

### 欠損値を含むデータの評価

In [9]:
# NaN never compares equal, not even to itself.
np.nan == np.nan

False

In [10]:
# Ordering comparisons that involve NaN are False as well.
np.nan > np.nan

False

In [11]:
# NaT behaves like NaN here: it does not compare equal to itself.
pd.NaT == pd.NaT

False

In [12]:
# Comparing with pd.NA propagates the missing value: the result is <NA>, not a bool.
pd.NA == pd.NA

<NA>

In [13]:
# Four equivalent ways to build a boolean mask of missing values
# (isnull is an alias of isna). Only the last expression's value is displayed.
pd.isna(float_ser)
# or
pd.isnull(float_ser)
# or
float_ser.isna()
# or
float_ser.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [14]:
# isna() also detects <NA> in the nullable Int64 Series.
int_ser.isna()

0    False
1     True
2    False
3     True
dtype: bool

In [15]:
# isna() also detects NaT in the datetime64 Series.
dt_ser.isna()

0    False
1     True
2    False
3     True
dtype: bool

### 欠損値を含むデータの演算

In [16]:
# sum() skips missing values by default, so only the valid entries are added (2 + 3).
int_ser.sum()

np.int64(5)

In [17]:
# cumsum() leaves <NA> at the missing positions while the running total carries on past them.
int_ser.cumsum()

0       2
1    <NA>
2       5
3    <NA>
dtype: Int64

In [18]:
# With skipna=False a single missing value makes the whole sum <NA>.
int_ser.sum(skipna=False)

<NA>

## 欠損値の発生パターン（メカニズム）と対処方法

### 欠損値の確認

In [19]:
# Reading parquet needs an optional engine (pyarrow or fastparquet);
# pandas raises ImportError when neither is installed.
df = pd.read_parquet(path="data/penguins.parquet")

# Count of missing values in each column.
print(df.isna().sum(axis=0))

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

### 欠損値の発生が完全にランダム(MCAR)な場合

In [20]:
# (rows, columns) of the full dataset before dropping anything.
print(df.shape)

NameError: name 'df' is not defined

In [21]:
# Drop only the rows where Culmen_Length is missing.
print(df.dropna(subset=["Culmen_Length"]).shape)

NameError: name 'df' is not defined

In [22]:
# Drop rows where Culmen_Length or Sex is missing (dropna defaults to how="any").
print(df.dropna(subset=["Culmen_Length", "Sex"]).shape)

NameError: name 'df' is not defined

In [23]:
# Drop every row that has a missing value in any column.
print(df.dropna().shape)

NameError: name 'df' is not defined

In [24]:
# axis=1: drop the columns that contain any missing value instead of the rows.
print(df.dropna(axis=1).shape)

NameError: name 'df' is not defined

In [25]:
# how="all": a row is dropped only when both Culmen_Length and Sex are missing.
print(
    df.dropna(
        subset=["Culmen_Length", "Sex"],
        how="all",
    ).shape
)

NameError: name 'df' is not defined

In [26]:
# dropna() on a single column (a Series) removes just that column's missing entries.
df.loc[:, "Culmen_Length"].dropna()

NameError: name 'df' is not defined

### 欠損値の発生が何らかの原因による場合
#### 単変量補完と多変量補完

In [27]:
# Mean body mass per species
df.groupby("Species_short")["Body_Mass"].mean()

NameError: name 'df' is not defined

In [28]:
# Number of missing Body_Mass values per species
df.groupby("Species_short")["Body_Mass"].agg(lambda x: x.isna().sum())

NameError: name 'df' is not defined

In [29]:
# Fill missing Body_Mass values with the mean of the same species.
# transform() returns a result aligned with the original rows, so each
# row is filled from its own group's mean.
df.groupby("Species_short")["Body_Mass"].transform(
    lambda x: x.fillna(x.mean())
)

NameError: name 'df' is not defined

### 行どうしに順序がある場合の補間

In [30]:
# First rows of Body_Mass (head() shows 5 rows by default).
df.loc[:, "Body_Mass"].head()

NameError: name 'df' is not defined

In [31]:
# Forward fill: each missing value takes the last valid value above it.
# Series.ffill() replaces fillna(method="ffill"), whose `method` argument
# is deprecated since pandas 2.1.
df.loc[:, "Body_Mass"].ffill()

NameError: name 'df' is not defined

In [32]:
# Backward fill: each missing value takes the next valid value below it.
# Series.bfill() replaces fillna(method="bfill"), whose `method` argument
# is deprecated since pandas 2.1.
df.loc[:, "Body_Mass"].bfill()

NameError: name 'df' is not defined

In [33]:
# Linear interpolation: rows are treated as equally spaced and the index values are ignored.
df.loc[:, "Body_Mass"].interpolate(method="linear")

NameError: name 'df' is not defined

In [34]:
# method="linear": the index values are ignored, so the gap gets the midpoint of 1 and 10.
pd.Series([1.0, None, 10], index=[1, 10, 100]).interpolate(method="linear")

1       1.0
10      5.5
100    10.0
dtype: float64

In [35]:
# method="index" (method="values" gives the same result): the index values are
# used as positions, so label 10 is filled with a value close to the one at label 1.
pd.Series([1.0, None, 10], index=[1, 10, 100]).interpolate(method="index")

1       1.000000
10      1.818182
100    10.000000
dtype: float64

In [36]:
# method="time": the gap is filled in proportion to elapsed time between the
# timestamps (the missing point is 9 days into a 99-day span).
pd.Series(
    [1.0, None, 10.0],
    index=[
        pd.Timestamp("2023-01-01"),
        pd.Timestamp("2023-01-10"),
        pd.Timestamp("2023-04-10"),
    ],
).interpolate(method="time")

2023-01-01     1.000000
2023-01-10     1.818182
2023-04-10    10.000000
dtype: float64