In [188]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df_titanic = pd.read_csv('/apps/study_machinelearning/datasets/titanic_train.csv')

df_titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [189]:
# 각 컬럼의 결측치 개수 확인
missing_counts = df_titanic.isnull().sum()
print(f"각 컬럼의 결측치 개수:\n{missing_counts}")


각 컬럼의 결측치 개수:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [190]:
# 'Cabin' 컬럼 제거
df_titanic = df_titanic.drop('Cabin', axis=1)
df_titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S


In [191]:
df = pd.DataFrame(df_titanic)
# 수치형과 범주형 컬럼 분류
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

In [192]:
# Embarked 컬럼의 결측치를 최빈값으로 채우기
df_titanic['Embarked'].fillna(df_titanic['Embarked'].mode()[0], inplace=True)

In [193]:
from sklearn.linear_model import LinearRegression
import numpy as np

In [194]:
print(age_missing_features.head())
print(f"Age 결측치가 있는 행 개수: {age_missing_features.shape[0]}")

    Pclass  SibSp  Parch     Fare
5        3      0      0   8.4583
17       2      0      0  13.0000
19       3      0      0   7.2250
26       3      0      0   7.2250
28       3      0      0   7.8792
Age 결측치가 있는 행 개수: 177


In [195]:
# Age 컬럼 결측치 머신러닝으로 채우기
age_features = df_titanic[['Pclass', 'SibSp', 'Parch', 'Fare']]
age_target = df_titanic['Age'].dropna()
age_features_no_nan = age_features.loc[age_target.index]

model_age = LinearRegression()
model_age.fit(age_features_no_nan, age_target)



In [196]:
age_missing_features = age_features[df_titanic['Age'].isnull()]
predicted_ages = model_age.predict(age_missing_features)


In [197]:
df_titanic.loc[df_titanic['Age'].isnull(), 'Age'] = predicted_ages

In [198]:
# 예측된 Age 값 확인
print(predicted_ages[:10])

[27.5252062  34.21163708 27.54970144 27.54970144 27.53670802 27.53637832
 34.50862465 27.53927413 27.54961802 27.53637832]


In [199]:
# 결측치가 채워진 후 확인
print(f"Age 결측치 채운 후: {df_titanic['Age'].isnull().sum()}")

Age 결측치 채운 후: 0


In [200]:
# 범주형 데이터 인코딩

encoded_data = ['PassengerId','Name','Sex', 'Ticket','Embarked']
encoder = OneHotEncoder(sparse=False, drop='first')  # 다중공선성 방지
encoded_data = encoder.fit_transform(df_titanic[categorical_columns])



In [201]:
# 인코딩된 컬럼 병합
encoded_columns = encoder.get_feature_names_out(categorical_columns)
df_encoded = pd.DataFrame(encoded_data, columns=encoded_columns, index=df_titanic.index)
df_titanic = pd.concat([df_titanic, df_encoded], axis=1)


In [202]:
# 인코딩 후 원래 범주형 컬럼 제거
df_titanic.drop(categorical_columns, axis=1, inplace=True)

In [203]:
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel",...,Ticket_W./C. 14258,Ticket_W./C. 14263,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Embarked_Q,Embarked_S
0,1,0,3,22.00000,1,0,7.2500,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,1,38.00000,1,0,71.2833,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,26.00000,0,0,7.9250,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,1,35.00000,1,0,53.1000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,0,3,35.00000,0,0,8.0500,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.00000,0,0,13.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
887,888,1,1,19.00000,0,0,30.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
888,889,0,3,21.00954,1,2,23.4500,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
889,890,1,1,26.00000,0,0,30.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
# 데이터셋 분할 (특성 및 타겟)
X = df_titanic.drop('Survived', axis=1)
y = df_titanic['Survived']

# 학습/검증 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [205]:
features = ['Pclass', 'Age', 'SibSp', 'Fare', 'Sex', 'Embarked', 'Name']
target = 'Survived'

In [206]:
from sklearn.ensemble import RandomForestRegressor

# 모델 학습
model_age = RandomForestRegressor()
model_age.fit(X_age_not_missing, y_age_not_missing)

In [207]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 모델 학습
model = RandomForestClassifier()
model.fit(X_train, y_train)

# 모델 평가
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8212


In [208]:
# 결과 확인
print(df_titanic.info())
print(df_titanic.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 1580 entries, PassengerId to Embarked_S
dtypes: float64(1575), int64(5)
memory usage: 10.7 MB
None
   PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  \
0            1         0       3  22.0      1      0   7.2500   
1            2         1       1  38.0      1      0  71.2833   
2            3         1       3  26.0      0      0   7.9250   
3            4         1       1  35.0      1      0  53.1000   
4            5         0       3  35.0      0      0   8.0500   

   Name_Abbott, Mr. Rossmore Edward  Name_Abbott, Mrs. Stanton (Rosa Hunt)  \
0                               0.0                                    0.0   
1                               0.0                                    0.0   
2                               0.0                                    0.0   
3                               0.0                                    0.0   
4                               0.0    

In [209]:
print(encoded_data[:5])
print(encoder.get_feature_names_out())

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
['Name_Abbott, Mr. Rossmore Edward'
 'Name_Abbott, Mrs. Stanton (Rosa Hunt)' 'Name_Abelson, Mr. Samuel' ...
 'Ticket_WE/P 5735' 'Embarked_Q' 'Embarked_S']


In [210]:
print(df_titanic.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 1580 entries, PassengerId to Embarked_S
dtypes: float64(1575), int64(5)
memory usage: 10.7 MB
None


In [211]:
dataset_path = '/apps/study_machinelearning/datasets/titanic_train.csv'
data = pd.read_csv(dataset_path)

In [212]:
# 사용할 특성들 선택 (4개의 특성만 사용)
features = ['Pclass', 'Age', 'SibSp', 'Fare']
target = 'Survived'


In [213]:
# 훈련 데이터 준비
X = data[features]
y = data[target]

In [214]:
## train/ test 나누기

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 특성(Features)과 타겟(Target) 지정
X = df_titanic.drop(columns=['Survived'])
y = df_titanic['Survived']

# 훈련 세트와 테스트 세트 분리 (80% 훈련, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 훈련 세트 크기 및 테스트 세트 크기 출력
print(f"훈련 세트 크기: {X_train.shape[0]}")
print(f"테스트 세트 크기: {X_test.shape[0]}")


훈련 세트 크기: 712
테스트 세트 크기: 179


In [216]:
# 스케일링
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [217]:
# 로지스틱 회귀 모델 정의 및 훈련
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# 테스트 세트에서 예측
y_pred = model.predict(X_test)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [218]:
# 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print(f"모델 정확도: {accuracy:.4f}")


모델 정확도: 0.8156


In [219]:
# 예측된 결과와 실제 결과 비교
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison.head())

     Actual  Predicted
709       1          0
439       0          0
840       0          0
720       1          1
39        1          1


In [220]:
import pickle

# 모델 저장
with open('titanic_model.pkl', 'wb') as f:
    pickle.dump(model, f)