# Data download: sns.load_dataset()

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
titanic = sns.load_dataset('titanic')  # seaborn's example dataset
# Report the frame's shape, its column index, and the first three rows.
print("데이터 크기:", titanic.shape)
print("\n컬럼 정보:\n", titanic.columns)
print("\n상위 3행:\n", titanic.head(3))

데이터 크기: (891, 15)

컬럼 정보:
 Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

상위 3행:
    survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  


# describe() 함수로 기초 통계량 출력

In [16]:
# Summary statistics for the numeric variables (count, mean, std, min, quartiles, max)
print(titanic.describe())

         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


# 결측치 확인: isnull() 메서드

In [17]:
# Count the missing values in each column.
print("\n결측치 개수:\n", titanic.isnull().sum())


결측치 개수:
 survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


# 결측치 처리 method 1: drop columns

In [18]:
# Missing-value handling, method 1: drop the column.
# 'deck' is missing in 688 of the 891 rows, so the whole column is removed.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second run no longer raises KeyError for 'deck'.
titanic = titanic.drop(columns='deck', errors='ignore')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [19]:
# List the remaining columns to confirm that 'deck' is gone.
print("\n컬럼 정보:\n", titanic.columns)


컬럼 정보:
 Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')


# 결측치 처리 method 2: fillna()

In [20]:
# Missing-value handling, method 2: fillna()
# The filled Series is assigned straight back to the column (the recommended form).
# Missing ages are replaced with the mean of the ages that are present.
age_mean = titanic['age'].mean()
titanic['age'] = titanic['age'].fillna(age_mean)

In [21]:
# Re-count the missing values per column; 'age' should now report 0.
print("\n결측치 개수:\n", titanic.isnull().sum())


결측치 개수:
 survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
embark_town    2
alive          0
alone          0
dtype: int64


# 결측치 처리 method 3: fillna()
# Replace missing values with the column's mode, i.e. its most frequent value (`most_frequent`)

In [22]:
# Fill the missing values of the 'embarked' column with its mode.
# mode() returns a Series because several values can tie for most frequent:
# for the data [1, 1, 2, 2, 3] it yields [1, 2], and [0] picks the first entry (1).
embarked_mode = titanic['embarked'].mode()[0]
titanic['embarked'] = titanic['embarked'].fillna(embarked_mode)
# Only 'embarked' is filled here; 'embark_town' is not touched by this cell.
print("\n결측치 개수:\n", titanic.isnull().sum())


결측치 개수:
 survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    2
alive          0
alone          0
dtype: int64


# Convert data types

In [23]:
# Convert data types
# Encode 'sex' as 0 (male) / 1 (female).
# Series.map() turns every value that is not a key of the mapping into NaN, so
# mapping an already-encoded column (this cell run twice) would blank it out.
# The mapping is therefore applied only while raw labels are still present.
sex_codes = {'male': 0, 'female': 1}
if titanic['sex'].isin(list(sex_codes)).any():
    titanic['sex'] = titanic['sex'].map(sex_codes)

# Store the discrete columns with the pandas 'category' dtype.
for categorical_column in ('pclass', 'embarked', 'survived'):
    titanic[categorical_column] = titanic[categorical_column].astype('category')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,0,22.000000,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,1,38.000000,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,1,26.000000,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,1,35.000000,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,0,35.000000,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,1,19.000000,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,1,29.699118,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,0,26.000000,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


# drop columns

In [25]:
# Drop 'class' and 'embark_town'; in the rows displayed above they carry the same
# information as 'pclass' and 'embarked'.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second run no longer raises KeyError for the missing labels.
titanic = titanic.drop(columns=['class', 'embark_town'], errors='ignore')
print(titanic.columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'who', 'adult_male', 'alive', 'alone'],
      dtype='object')


In [26]:
# Display the current state of the frame as the cell's output.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,adult_male,alive,alone
0,0,3,0,22.000000,1,0,7.2500,S,man,True,no,False
1,1,1,1,38.000000,1,0,71.2833,C,woman,False,yes,False
2,1,3,1,26.000000,0,0,7.9250,S,woman,False,yes,True
3,1,1,1,35.000000,1,0,53.1000,S,woman,False,yes,False
4,0,3,0,35.000000,0,0,8.0500,S,man,True,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,S,man,True,no,True
887,1,1,1,19.000000,0,0,30.0000,S,woman,False,yes,True
888,0,3,1,29.699118,1,2,23.4500,S,woman,False,no,False
889,1,1,0,26.000000,0,0,30.0000,C,man,True,yes,True


# IQR(Interquartile Range)

In [27]:
# IQR helper
def calculate_iqr(column):
    """Return the quartiles and the interquartile range of ``column``.

    Args:
        column: Object exposing ``quantile(q)``; called in this notebook
            with a single DataFrame column.

    Returns:
        Tuple ``(Q1, Q3, IQR)``: the 0.25 quantile, the 0.75 quantile,
        and their difference ``Q3 - Q1``.
    """
    lower_quartile, upper_quartile = (column.quantile(q) for q in (0.25, 0.75))
    return lower_quartile, upper_quartile, upper_quartile - lower_quartile

# Columns scanned for outliers
numeric_columns = ['age', 'sibsp', 'parch', 'fare']

# For every column, compute the IQR and flag values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] as outliers.
# When the IQR is 0 (see 'parch' in the output) both fences collapse onto the
# same value, so every value different from it is reported as an outlier.
for column in numeric_columns:
    Q1, Q3, IQR = calculate_iqr(titanic[column])
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    below_fence = titanic[column] < lower_bound
    above_fence = titanic[column] > upper_bound
    outliers = titanic[below_fence | above_fence]
    print(f"{column}의 IQR: {IQR}")
    print(f"{column}의 이상치 개수: {outliers.shape[0]}")
    print(outliers[[column]])
    print("\n")

age의 IQR: 13.0
age의 이상치 개수: 66
       age
7     2.00
11   58.00
15   55.00
16    2.00
33   66.00
..     ...
827   1.00
829  62.00
831   0.83
851  74.00
879  56.00

[66 rows x 1 columns]


sibsp의 IQR: 1.0
sibsp의 이상치 개수: 46
     sibsp
7        3
16       4
24       3
27       3
50       4
59       5
63       3
68       4
71       5
85       3
88       3
119      4
159      8
164      4
171      4
176      3
180      8
182      4
201      8
229      3
233      4
261      4
266      4
278      4
324      8
341      3
374      3
386      5
409      3
480      5
485      3
541      4
542      4
634      3
642      3
683      5
686      4
726      3
787      4
792      8
813      4
819      3
824      4
846      8
850      4
863      8


parch의 IQR: 0.0
parch의 이상치 개수: 213
     parch
7        1
8        2
10       1
13       5
16       1
..     ...
871      1
879      1
880      1
885      5
888      2

[213 rows x 1 columns]


fare의 IQR: 23.0896
fare의 이상치 개수: 116
         fare
1     71.2833
2