In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
# Titanic passenger data, hosted on GitHub (DA4BAM/dataset)
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic.0.csv'

data = pd.read_csv(TITANIC_CSV_URL)
data.head(n=3)  # preview the first three rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


#### 컬럼명 변경

In [45]:
# rename() returns a new frame, so the original `data` is left untouched
data_co = data.copy().rename(columns={'Age': 'aage'})

# Show the column labels to confirm that 'Age' is now 'aage'
data_co.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'aage', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

#### 데이터 타입 변환

In [39]:
# Work on a copy and print the column dtypes before the conversion
data_co = data.copy()
data_co.info()
print('\n')

# Swap in an object-typed version of Age, then print the dtypes again
age_as_object = data_co['Age'].astype(object)
data_co = data_co.assign(Age=age_as_object)
data_co.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Surviv

## 결측치 처리

#### 평균

In [31]:
data_co = data.copy()

# Summary statistics (count, mean, std, quartiles, min/max) of the Age column
data_co.loc[:, 'Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [25]:
data_co = data.copy()

# Summary statistics of Age
mean = data_co['Age'].mean()                   # mean
median = data_co['Age'].median()               # median
std = data_co['Age'].std()                     # standard deviation
var = data_co['Age'].var()                     # variance

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on data_co['Age']: the inplace form is chained assignment, which emits a
# FutureWarning in pandas 2.2 and no longer updates data_co under Copy-on-Write.
data_co['Age'] = data_co['Age'].fillna(mean)

# Remaining missing values per column (Age should now be 0)
data_co.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### 앞 값

In [32]:
data_co = data.copy()

# Fill each missing Age with the previous row's value.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the result
# is assigned back because an inplace call on data_co['Age'] is chained
# assignment that does not update data_co under Copy-on-Write.
data_co['Age'] = data_co['Age'].ffill()

# Remaining missing values per column
data_co.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### 최빈값

In [8]:
data_co = data.copy()

# Most frequent Age value: value_counts() tallies each value and idxmax()
# returns the value with the highest count.
more = data_co['Age'].value_counts().idxmax()

# Assign the result back; fillna(..., inplace=True) on data_co['Age'] is
# chained assignment and does not update data_co under Copy-on-Write.
data_co['Age'] = data_co['Age'].fillna(more)

# Remaining missing values per column
data_co.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### 선형보간법

In [14]:
data_co = data.copy()

# Linearly interpolate missing Age values from neighbouring rows.
# The result is assigned back because interpolate(..., inplace=True) on
# data_co['Age'] is chained assignment and does not update data_co under
# Copy-on-Write.
data_co['Age'] = data_co['Age'].interpolate(method='linear')

# Remaining missing values per column
data_co.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### 삭제

In [51]:
# Print the frame summary before dropping anything
data_co = data.copy()
data_co.info()
print('\n')

# Keep only the rows where Age is present, then print the summary again
data_co = data_co.dropna(axis=0, subset=['Age'])
data_co.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Surviv

## 데이터 삭제

In [13]:
# Columns to remove from the working copy
cols = ['Embarked', 'Fare', 'Parch']

# Missing-value counts before the drop
data_co = data.copy()
print(data_co.isna().sum())
print('\n')

# Missing-value counts after the listed columns are removed
data_co = data_co.drop(columns=cols)
print(data_co.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Ticket           0
Cabin          687
dtype: int64


## 데이터 집단화

#### 단순 계산

In [19]:
# Add Age_grp: Age floored to its decade (e.g. 22.0 -> 20.0, 38.0 -> 30.0)
data_co = data.copy().assign(
    Age_grp=lambda frame: (frame['Age'] // 10) * 10
)

# Number of passengers in each decade group
data_co['Age_grp'].value_counts()

20.0    220
30.0    167
10.0    102
40.0     89
0.0      62
50.0     48
60.0     19
70.0      6
80.0      1
Name: Age_grp, dtype: int64

#### cut

In [20]:
# Bin edges and one label per bin (three bins between the four edges)
b = [0, 30, 60, 90]
l = ['A', 'B', 'C']

# Replace the numeric Age column in a copy with its bin label
age_bins = pd.cut(data['Age'], bins=b, labels=l)
data_co = data.copy().assign(Age=age_bins)

# Number of passengers per bin
data_co['Age'].value_counts()

A    409
B    283
C     22
Name: Age, dtype: int64