# 누락 데이터 처리

## 누락값 (NaN)

In [None]:
# NumPy 2.0 removed the `NaN` alias; import the canonical `nan` and keep the
# `NaN` name so the cells below continue to work unchanged.
from numpy import nan as NaN

In [None]:
# NaN is not equal to zero, so this prints False
print(NaN == 0)

In [None]:
# NaN is not equal to the empty string either, so this prints False
print(NaN == '')

In [None]:
# NaN does not even compare equal to itself, so this prints False;
# equality operators therefore cannot be used to detect missing values
print(NaN == NaN)

In [None]:
# NumPy 2.0 removed the `NaN` alias, so import `nan` under the old name to
# keep this cell (and the ones that reuse `NaN`) working.
from numpy import nan as NaN
import pandas as pd

# pd.isnull reports True only for the missing value, not for ordinary
# strings or numbers.
print(pd.isnull(NaN))
print(pd.isnull('abc'))
print(pd.isnull(123))

In [None]:
# pd.notnull is the inverse of pd.isnull: check the same three sample values
for value in (NaN, 'abc', 123):
    print(pd.notnull(value))

## 누락 데이터 확인

In [None]:
import seaborn as sns

# Load seaborn's bundled 'titanic' example dataset and preview the first rows
titanic = sns.load_dataset('titanic')
titanic.head()

In [None]:
# Summarise the columns: non-null counts and dtypes reveal where data is missing
titanic.info()

In [None]:
# Frequency of each value in two columns; dropna=False keeps NaN as its own
# bucket so the number of missing entries is visible in the counts.
column_deck = titanic['deck'].value_counts(dropna=False)
column_embark_town = titanic['embark_town'].value_counts(dropna=False)
print(column_deck)
print('\n')
print(column_embark_town)

In [None]:
# Boolean frame of the same shape: True wherever a value is missing
titanic.isnull()

In [None]:
# Summing the boolean mask down the rows gives the missing-value count per column
titanic.isnull().sum(axis=0)

## 누락 데이터 제거

In [None]:
# Drop every column that contains 300 or more NaN values.
# `thresh` is the minimum number of NON-missing values a column needs in
# order to be kept, so it must be derived from the row count rather than set
# to the NaN limit itself.
max_nan = 300
titanic_threshold = titanic.dropna(axis=1, thresh=len(titanic) - max_nan + 1)

In [None]:
# Display the frame that remains after the sparse columns were dropped
titanic_threshold

In [None]:
# Remove every row whose 'age' value is NaN
# (row count: 891 in total => 714 remaining)
rows_axis = 0
titanic_age = titanic.dropna(axis=rows_axis, how='any', subset=['age'])

In [None]:
# Display the frame that remains after rows with a missing age were dropped
titanic_age

## 누락 데이터 치환

In [None]:
import seaborn as sns

# Reload the dataset so the fill example starts from the original values,
# then show the first seven ages before any replacement
titanic = sns.load_dataset('titanic')
print(titanic['age'].head(7))

In [None]:
# Replace missing ages with 0. The result is assigned back to the column
# instead of calling fillna(inplace=True) on the selected Series: that chained
# in-place form is deprecated and does not update the frame when pandas
# Copy-on-Write is active.
titanic['age'] = titanic['age'].fillna(0)
print(titanic['age'].head(7))

In [None]:
import seaborn as sns

# Reload the dataset to undo the previous fill, then show the first seven
# ages before any replacement
titanic = sns.load_dataset('titanic')
print(titanic['age'].head(7))

In [None]:
# Replace missing ages with the mean age rounded to the nearest integer.
mean_age = round(titanic['age'].mean())
# Assign the filled column back rather than using fillna(inplace=True) on the
# selected Series; the chained in-place form is deprecated and does not update
# the frame when pandas Copy-on-Write is active.
titanic['age'] = titanic['age'].fillna(mean_age)
print(titanic['age'].head(7))

In [None]:
import seaborn as sns

# Start again from the unmodified dataset and look at a slice of
# 'embark_town' values before anything is filled in
titanic = sns.load_dataset('titanic')
towns = titanic['embark_town']
print(towns[60:65])
print('\n')

# Frequency of each town, ignoring missing entries
embark_town_counts = towns.value_counts(dropna=True)
print(embark_town_counts)

In [None]:
# The most frequent town is the label with the highest count
most_freq = embark_town_counts.idxmax()
print(most_freq)
print('\n')

# Fill missing towns with that value. Assign the result back instead of
# calling fillna(inplace=True) on the selected Series: the chained in-place
# form is deprecated and does not update the frame when pandas Copy-on-Write
# is active.
titanic['embark_town'] = titanic['embark_town'].fillna(most_freq)
print(titanic['embark_town'][60:65])

# 중복 데이터 처리

In [None]:
import pandas as pd

# Small sample frame that contains fully repeated rows
data = {
    'col1': ['a', 'a', 'b', 'b', 'a', 'a', 'a'],
    'col2': [1, 1, 2, 1, 1, 3, 1],
    'col3': [1, 1, 2, 2, 2, 1, 2],
}
df = pd.DataFrame(data)
print(df)
print('\n')
# Mark each row that exactly repeats an earlier row (first occurrence stays False)
df_duplicate = df.duplicated(keep='first')
print(df_duplicate)

In [None]:
# Rebuild the same sample frame
df = pd.DataFrame.from_dict(
    {
        'col1': ['a', 'a', 'b', 'b', 'a', 'a', 'a'],
        'col2': [1, 1, 2, 1, 1, 3, 1],
        'col3': [1, 1, 2, 2, 2, 1, 2],
    }
)
print(df)
print('\n')

# Duplicate check restricted to a single column: a value is flagged when it
# has already appeared earlier in 'col2'
col_duplicate = df.loc[:, 'col2'].duplicated()
print(col_duplicate)

## 중복 데이터 제거

In [None]:
# Rebuild the same sample frame
labels = ['col1', 'col2', 'col3']
values = [
    ['a', 'a', 'b', 'b', 'a', 'a', 'a'],
    [1, 1, 2, 1, 1, 3, 1],
    [1, 1, 2, 2, 2, 1, 2],
]
df = pd.DataFrame(dict(zip(labels, values)))
print(df)
print('\n')
# Keep only the first occurrence of each fully identical row
df_duplicate = df.drop_duplicates(keep='first')
print(df_duplicate)

In [None]:
# Rebuild the same sample frame
df = pd.DataFrame(
    data={
        'col1': ['a', 'a', 'b', 'b', 'a', 'a', 'a'],
        'col2': [1, 1, 2, 1, 1, 3, 1],
        'col3': [1, 1, 2, 2, 2, 1, 2],
    }
)
print(df)
print('\n')
# Rows count as duplicates when both 'col2' and 'col3' match an earlier row;
# 'col1' is ignored for the comparison
df_duplicate = df.drop_duplicates(subset=['col2', 'col3'])
print(df_duplicate)

# 판다스 자료형

## 자료형 다루기

In [None]:
import pandas as pd
import seaborn as sns

# Load seaborn's bundled 'tips' example dataset and preview the first rows
tips = sns.load_dataset('tips')
tips.head()

In [None]:
# Show the dtype of each column before any conversion
tips.info()

In [None]:
# Add a new column holding the 'sex' values converted to strings, then
# inspect the dtypes again to see the new column
tips['sex_string'] = tips['sex'].astype(str)
tips.info()

In [None]:
# Overwrite 'size' with its string representation and check the resulting dtype
tips['size'] = tips['size'].astype(str)
tips.info()

In [None]:
# Display the 'size' column after the conversion to strings
tips['size']

In [None]:
# Look at a single element (index label 0) of the converted 'size' column
tips['size'][0]

In [None]:
# Convert 'size' back to integers and check the resulting dtype
tips['size'] = tips['size'].astype(int)
tips.info()

In [None]:
# Display the 'size' column after the conversion back to integers
tips['size']

In [None]:
# Look at a single element (index label 0) of the integer 'size' column
tips['size'][0]

## 잘못 입력한 데이터 처리하기

In [None]:
# Work on an independent copy so the deliberately corrupted values written
# below do not leak into `tips` (plain assignment would only create an alias).
tips_new = tips.copy()
# Cast the column to object first: writing strings into a float column is
# deprecated in recent pandas versions and is slated to become an error.
tips_new['total_bill'] = tips_new['total_bill'].astype(object)
# Simulate badly entered data by putting the string 'None' into a few rows
tips_new.loc[[1, 3, 5, 7], 'total_bill'] = 'None'
tips_new.head(10)

In [None]:
# Check the dtypes after the 'None' strings were written into 'total_bill'
tips_new.info()

In [None]:
# Intentional failure for demonstration: the 'None' strings written into
# 'total_bill' above cannot be parsed as floats, so astype(float) is expected
# to raise a ValueError
tips_new['total_bill'].astype(float)

In [None]:
# Intentional failure for demonstration: without an `errors` argument,
# pd.to_numeric is also expected to raise on the unparseable 'None' strings
pd.to_numeric(tips_new['total_bill'])

In [None]:
# errors='coerce' turns every value that cannot be parsed into NaN instead of
# raising, so the column becomes numeric again
tips_new['total_bill'] = pd.to_numeric(tips_new['total_bill'], errors='coerce')
tips_new.head(10)

## 카테고리 자료형

In [None]:
# Cast 'sex' to the pandas categorical dtype and preview the frame
tips['sex'] = tips['sex'].astype('category')
tips.head()

In [None]:
# List the distinct category labels of the 'sex' column
tips['sex'].cat.categories

In [None]:
# Show the integer codes that represent the first five 'sex' values
tips['sex'].cat.codes[0:5]

### 데이터 크기 비교

In [None]:
# Size comparison, step 1: store 'sex' as plain strings and note the memory
# usage reported by info()
tips['sex'] = tips['sex'].astype('str')
tips.info()

In [None]:
# Preview the rows while 'sex' is stored as strings
tips.head()

In [None]:
# Size comparison, step 2: store 'sex' as a categorical again and compare the
# memory usage reported by info() with the string version above
tips['sex'] = tips['sex'].astype('category')
tips.info()

In [None]:
# Preview the rows while 'sex' is stored as a categorical
tips.head()