## データセットをダウンロード

In [None]:
import os
import getpass

if not os.path.exists('titanic/train.csv'):
    kaggle_username = getpass.getpass(prompt='Kaggle Username: ')
    kaggle_key = getpass.getpass(prompt='Kaggle API Key: ')

    os.environ['KAGGLE_USERNAME'] = kaggle_username
    os.environ['KAGGLE_KEY'] = kaggle_key

    !pip install kaggle
    
    !kaggle competitions download -c titanic
    !unzip titanic.zip -d titanic
else:
    print("Files already exist, skipping download.")

import pandas as pd

# Read the two CSV files into DataFrames, training file first.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic/train.csv', 'titanic/test.csv')
)

## データの前処理

In [None]:
# Replace missing values in float64/int64 columns with that column's mean.
# Assigning the result (instead of inplace=True) keeps the operation explicit.
train_data = train_data.fillna(
    train_data.select_dtypes(include=['float64', 'int64']).mean()
)

# Encode the categorical Sex column as integers.
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})

# Impute missing ports with the most frequent one BEFORE encoding.
# Otherwise the missing entries stay NaN after the map and the final
# fillna(0) turns them into 0, which is the code for 'C'.
embarked_modes = train_data['Embarked'].mode()
if not embarked_modes.empty:  # mode() is empty when every value is missing
    train_data['Embarked'] = train_data['Embarked'].fillna(embarked_modes.iloc[0])
train_data['Embarked'] = train_data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Safety net: anything still missing (e.g. values not covered by the maps
# above, or non-numeric columns) becomes 0.
train_data = train_data.fillna(0)

## 特徴量とラベルの分割

In [None]:
# Survived is the prediction target.
y = train_data['Survived']

# Everything except the target and the Name/Ticket/Cabin columns is used
# as a feature.
# NOTE(review): PassengerId stays in X; presumably it is only a row
# identifier - confirm whether it should really be a model input.
X = train_data.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])

## 訓練データとテストデータの分割

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## モデルの訓練

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; random_state is fixed so repeated runs
# build the same forest.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Fit on the training portion of the split.
model.fit(X_train, y_train)

## モデルの評価

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the held-out rows and report the fraction classified correctly.
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred)}')

## モデルのアップロード

In [None]:
!pip install boto3

import joblib
import boto3

# Placeholders: replace with the real model base name and bucket before use.
model_file_name = 'REPLACE_WITH_MODEL_BASE_NAME.joblib'
s3_bucket = 'REPLACE_WITH_S3_BUCKET_NAME'
model_s3_key = f'models/{model_file_name}'

try:
    # Serialize the trained model to a local file, then copy it to S3.
    joblib.dump(model, model_file_name)

    s3 = boto3.client('s3')
    s3.upload_file(model_file_name, s3_bucket, model_s3_key)

    print(f'Model uploaded to s3://{s3_bucket}/{model_s3_key}')
finally:
    # Remove the local artifact even when the dump or upload raised, so a
    # failed run does not leave the file behind. The in-memory model is
    # unaffected and can be dumped again on retry.
    if os.path.exists(model_file_name):
        os.remove(model_file_name)
