In [74]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/home/lms/project/titanic_survive/titanic/train.csv')

In [75]:
df.head() ## 상위 5개의 데이터

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [76]:
df ## 모든 데이터 // NAN = Not a Number(결측지)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


NAN 부분을 특정 숫자로 바꿔줄 것 - 데이터 전처리

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# 데이터 전처리
1. 5개의 Feature가 object 형식으로 되어있는데, 이는 이후 형번환을 한다
2. Age와 Cabin, Embarked, Feature의 데이터가 일부 누락되었기 때문에 이를 처리한다.

### 데이터 클리닝
- sex(성별) Feature의 male, female을 0, 1로 변경한다
- Embarked(탑승지) Feature의 C(Cherbourg), Q(Queenstown), S(Southampton)를 0, 1, 2로 변경한다.

In [78]:
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,2.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,2.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,2.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,2.0


- Name Feature에서 Mr. Mrs. Ms. 등으로 기혼여부와 성별을 알아낼 수 있다.
- 이제 이름 데이터는 필요 없기에 열 제거한다.

In [79]:
def extract_title(name):
    if "Mr." in name:
        return 0  # 남성
    elif "Mrs." in name:
        return 1  # 기혼 여성
    elif "Miss." in name:
        return 2  # 미혼 여성
    else:
        return -1  # 기타 (예외 처리용)

# Title 열 생성
df['Title'] = df['Name'].apply(extract_title)
df.drop('Name', axis=1, inplace=True)

df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,0,22.0,1,0,A/5 21171,7.2500,,2.0,0
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C85,0.0,1
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.9250,,2.0,2
3,4,1,1,1,35.0,1,0,113803,53.1000,C123,2.0,1
4,5,0,3,0,35.0,0,0,373450,8.0500,,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.0,0,0,211536,13.0000,,2.0,-1
887,888,1,1,1,19.0,0,0,112053,30.0000,B42,2.0,2
888,889,0,3,1,,1,2,W./C. 6607,23.4500,,2.0,2
889,890,1,1,0,26.0,0,0,111369,30.0000,C148,0.0,0


1. Age Feature의 결측 데이터를 Name Feature에서 알아낸 기혼 여부와 성별을 바탕으로 대체한다.

In [80]:
df["Age"].fillna(df.groupby("Title")["Age"].transform("median"), inplace=True)
# Title Feature의 값이 같은 사람들의 중앙값으로 NaN데이터를 각 이름에 해당하는 사람들의 중앙값으로 대체한다.

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df.groupby("Title")["Age"].transform("median"), inplace=True)


In [81]:
df.info() # age데이터에 891개의 데이터 생성을 완료하였다.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    float64
 11  Title        891 non-null    int64  
dtypes: float64(3), int64(7), object(2)
memory usage: 83.7+ KB


In [82]:
# 나이에 따른 분류 함수 정의
def classify_age(age):
    if age <= 16:
        return 0
    elif 16 < age <= 26:
        return 1
    elif 26 < age <= 36:
        return 2
    elif 36 < age <= 62:
        return 3
    else:
        return 4

# Age 열을 기반으로 Age 열 값 자체를 분류 값으로 대체
df['Age'] = df['Age'].apply(classify_age)
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,0,1,1,0,A/5 21171,7.2500,,2.0,0
1,2,1,1,1,3,1,0,PC 17599,71.2833,C85,0.0,1
2,3,1,3,1,1,0,0,STON/O2. 3101282,7.9250,,2.0,2
3,4,1,1,1,2,1,0,113803,53.1000,C123,2.0,1
4,5,0,3,0,2,0,0,373450,8.0500,,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,2,0,0,211536,13.0000,,2.0,-1
887,888,1,1,1,1,0,0,112053,30.0000,B42,2.0,2
888,889,0,3,1,1,1,2,W./C. 6607,23.4500,,2.0,2
889,890,1,1,0,1,0,0,111369,30.0000,C148,0.0,0
