# 値の重複

## 重複の有無の確認方法

In [1]:
import pandas as pd

# Load the penguins dataset, then keep only the rows whose Species_short is
# "Adelie"; every later cell works on this subset.
df = pd.read_parquet("data/penguins.parquet")
df_adelie = df.loc[df["Species_short"] == "Adelie"]
print(df_adelie.shape)  # (n_rows, n_columns) of the Adelie subset

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [2]:
# Number of distinct values in each column; dropna=False makes missing values
# count as a value of their own instead of being ignored.
df_adelie.nunique(dropna=False)

NameError: name 'df_adelie' is not defined

In [3]:
# Whole-row check: count the rows that repeat an earlier row in every column.
# If you only need to know whether any duplicate exists, use .any() in place
# of .sum().
(
    df_adelie
    .duplicated()
    .sum()
)

NameError: name 'df_adelie' is not defined

In [4]:
# Example with `subset`: only Individual_ID is compared when deciding whether
# a row is a duplicate.
(
    df_adelie
    .duplicated(subset=["Individual_ID"])
    .sum()
)

NameError: name 'df_adelie' is not defined

In [5]:
# Rows whose Individual_ID is duplicated. keep=False flags every occurrence,
# so the first one is included too. Sorted by ID; only the head is shown.
(
    df_adelie
    .loc[df_adelie["Individual_ID"].duplicated(keep=False)]
    .sort_values("Individual_ID")
    .head()
)

NameError: name 'df_adelie' is not defined

In [6]:
# Rows whose Individual_ID is duplicated, excluding the first occurrence:
# keep="first" leaves the first row of each ID unflagged.
(
    df_adelie
    .loc[df_adelie["Individual_ID"].duplicated(keep="first")]
    .sort_values("Individual_ID")
    .head()
)

NameError: name 'df_adelie' is not defined

## 重複の発生パターンと対処方法

### 行全体の重複への対処

In [7]:
# Build a DataFrame that is guaranteed to contain duplicated rows by stacking
# df_adelie on top of itself.
df_adelie_duplicated = pd.concat([df_adelie] * 2)
print(f"重複あり：{df_adelie_duplicated.shape}")

# After removing fully duplicated rows the shape matches df_adelie again
# (provided df_adelie itself has no fully duplicated rows).
print(f"重複除去：{df_adelie_duplicated.drop_duplicates().shape}")

NameError: name 'df_adelie' is not defined

### 一部の列の重複への対処

In [8]:
# One row per Individual_ID, keeping the first occurrence in the current
# row order.
(
    df_adelie
    .drop_duplicates(subset=["Individual_ID"], keep="first")
    .head()
)

NameError: name 'df_adelie' is not defined

In [9]:
# One row per Individual_ID, keeping the last occurrence in the current
# row order.
(
    df_adelie
    .drop_duplicates(subset=["Individual_ID"], keep="last")
    .head()
)

NameError: name 'df_adelie' is not defined

In [10]:
# Choose which duplicate survives by sorting first: order rows by Date_Egg
# descending, then keep the first row per Individual_ID, i.e. the one with the
# largest Date_Egg (the most recent one, assuming the column is date-typed).
(
    df_adelie
    .sort_values("Date_Egg", ascending=False)
    .drop_duplicates(subset=["Individual_ID"], keep="first")
    .head()
)

NameError: name 'df_adelie' is not defined

In [11]:
# Resolve duplicates by aggregation instead of dropping rows: take the mean
# Flipper_Length within each Individual_ID group.
df_adelie_agg = df_adelie.groupby("Individual_ID")["Flipper_Length"].agg("mean")
df_adelie_agg.head()

NameError: name 'df_adelie' is not defined

In [12]:
# Shape of the aggregated result: one entry per Individual_ID group.
df_adelie_agg.shape

NameError: name 'df_adelie_agg' is not defined