In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [80]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [81]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [82]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [83]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [84]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [85]:
# One-hot エンコーディング
pd.get_dummies(train, columns=["Embarked"]).head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,False,False,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,False,False,True


In [86]:
# 性別をバイナリ変数に
train["Sex"].map({"male": 0, "female": 1})[:10]

0    0
1    1
2    1
3    1
4    0
5    0
6    0
7    0
8    1
9    1
Name: Sex, dtype: int64

In [87]:
train["Sex"] = train["Sex"].apply(lambda x: 0 if x == "male" else 1)
test["Sex"] = test["Sex"].apply(lambda x: 0 if x == "male" else 1)


In [88]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [89]:
# 欠損値補完
mean_train_Age = train["Age"].mean()
train["Age"].fillna(mean_train_Age, inplace=True)
mean_test_Age = test["Age"].mean()
test["Age"].fillna(mean_test_Age, inplace=True)

mean_train_fare = train["Fare"].mean()
train["Fare"].fillna(mean_train_fare, inplace=True)
mean_test_fare = test["Fare"].mean()
test["Fare"].fillna(mean_test_fare, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Age"].fillna(mean_train_Age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["Age"].fillna(mean_test_Age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [90]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [103]:
# 年齢の正規化
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
train["Age_standard"] = sc.fit_transform(train[["Age"]])
test["Age_standard"] = sc.fit_transform(test[["Age"]])

In [111]:
# Fareの正規化
train["Fare_standard"] = sc.fit_transform(train[["Fare"]])
test["Fare_standard"] = sc.fit_transform(test[["Fare"]])


In [112]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_standard,Fare_standard
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,-0.592481,-0.502445
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,0.638789,0.786845
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,-0.284663,-0.488854
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,0.407926,0.42073
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0.407926,-0.486337


### モデル作成・学習

In [113]:
# 予測に使用する列名
cols = ["Pclass", "Sex", "Age_standard", "Fare_standard"]

# 学習用
X_train = train[cols]
y_train = train["Survived"]

# 予測用
X_test = test[cols]

X_test.head()

Unnamed: 0,Pclass,Sex,Age_standard,Fare_standard
0,3,0,0.334993,-0.498407
1,3,1,1.32553,-0.513274
2,2,0,2.514175,-0.465088
3,3,0,-0.25933,-0.483466
4,3,1,-0.655545,-0.418471


In [114]:
from sklearn.linear_model import LogisticRegression

# モデル作成（ロジスティック回帰）
lg_model = LogisticRegression(solver="liblinear", random_state=42)

# 学習
lg_model.fit(X_train, y_train)
# 予測
y_test = lg_model.predict(X_test)

pd.Series(y_test).value_counts()

0    263
1    155
Name: count, dtype: int64

In [116]:
from sklearn.ensemble import RandomForestClassifier

# モデル作成（ランダムフォレスト）
rf_model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)

# 学習
rf_model.fit(X_train, y_train)
# 予測
y_test = rf_model.predict(X_test)

pd.Series(y_test).value_counts()

0    275
1    143
Name: count, dtype: int64

# 検証

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_validate

folds = KFold(n_splits=3, shuffle=True, random_state=1)

for fold_, (trn_, val_) in enumerate(folds.split(X_train, y_train)):
    trn_x, trn_y = X_train.iloc[trn_], y_train.iloc[trn_]
    val_x, val_y = X_train.iloc[val_], y_train.iloc[val_]

# 学習データで学習
rf_model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
rf_model.fit(trn_x, trn_y)

y_pred = rf_model.predict(val_x)

# 検証データで検証
# scores = cross_validate(rf_model, trn_x, trn_y, cv=folds, scoring=["accuracy"])
# accuracy = scores[""]

In [117]:
"""
import pandas as pd
"""
from sklearn.model_selection import cross_val_score, train_test_split

"""
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
"""
from sklearn.metrics import accuracy_score  # noqa: E402

"""
# データの読み込み
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# 必要な前処理（ここでは例としてシンプルなものを示します）
def preprocess_data(data):
    data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)
    return data

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

# データの標準化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
"""

# 交差検証で正答率を検証
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring="accuracy")
lg_scores = cross_val_score(lg_model, X_train, y_train, cv=5, scoring="accuracy")

print(f"Random Forest Accuracy: {rf_scores.mean():.4f} ± {rf_scores.std():.4f}")
print(f"Logistic Regression Accuracy: {lg_scores.mean():.4f} ± {lg_scores.std():.4f}")

"""
# モデルの学習と予測（必要に応じて）
rf_model.fit(X_scaled, y)
log_reg_model.fit(X_scaled, y)

# テストデータに対する予測（例）
X_test = test_data
X_test_scaled = scaler.transform(X_test)

rf_predictions = rf_model.predict(X_test_scaled)
log_reg_predictions = log_reg_model.predict(X_test_scaled)
"""


Random Forest Accuracy: 0.8059 ± 0.0304
Logistic Regression Accuracy: 0.7856 ± 0.0056


'\n# モデルの学習と予測（必要に応じて）\nrf_model.fit(X_scaled, y)\nlog_reg_model.fit(X_scaled, y)\n\n# テストデータに対する予測（例）\nX_test = test_data\nX_test_scaled = scaler.transform(X_test)\n\nrf_predictions = rf_model.predict(X_test_scaled)\nlog_reg_predictions = log_reg_model.predict(X_test_scaled)\n'

In [110]:
# モデルの学習と予測（必要に応じて）
rf_model.fit(X_train, y_train)
lg_model.fit(X_train, y_train)

# テストデータに対する予測（例）
rf_predictions = rf_model.predict(X_test)

PassengerId = test["PassengerId"]

submission = pd.DataFrame(
    {"PassengerId": PassengerId, "Survived": rf_predictions.astype(np.int32)}
)
submission.to_csv("my_submission.csv", index=False)
