In [1]:
import pandas as pd
import random
import numpy as np

In [19]:
import boto3
import pickle
import io
from urllib.parse import urlparse

def load_pickle_from_s3(s3_uri):
    # S3 URIを分解
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    key = parsed.path.lstrip("/")

    # S3からバイナリデータを取得
    s3 = boto3.client("s3")
    response = s3.get_object(Bucket=bucket, Key=key)
    binary_data = response['Body'].read()

    # pickleとして読み込む
    return pickle.load(io.BytesIO(binary_data))

# 使用例
X_train_pkl = "s3://datascience-keigo/kaggle-SpaceshipTitanic/processed/CV_1/X_train.pkl"
Y_train_pkl = "s3://datascience-keigo/kaggle-SpaceshipTitanic/processed/CV_1/Y_train.pkl"
X_valid_pkl = "s3://datascience-keigo/kaggle-SpaceshipTitanic/processed/CV_1/X_valid.pkl"
Y_valid_pkl = "s3://datascience-keigo/kaggle-SpaceshipTitanic/processed/CV_1/Y_valid.pkl"
X_test_pkl = "s3://datascience-keigo/kaggle-SpaceshipTitanic/processed/CV_1/test_X.pkl"
train_X = load_pickle_from_s3(X_train_pkl)
train_Y = load_pickle_from_s3(Y_train_pkl)
valid_X = load_pickle_from_s3(X_valid_pkl)
valid_Y = load_pickle_from_s3(Y_valid_pkl)
test_X = load_pickle_from_s3(X_test_pkl)
print("✅ 読み込み完了:", type(X_train))



✅ 読み込み完了: <class 'pandas.core.frame.DataFrame'>


In [21]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# parameta settings
lgbm_params = {
    "Objective": "binary",
    "random_seed": 1234
}
callbacks = [lgb.early_stopping(stopping_rounds=20, verbose=True)]
# model格納用リスト
models = []

for i in range(3):
    X_train = train_X[i]
    X_valid = valid_X[i]
    Y_train = train_Y[i]
    Y_valid = valid_Y[i]

    X_train['CabinNum'] = pd.to_numeric(X_train['CabinNum'], errors='coerce')
    X_valid['CabinNum'] = pd.to_numeric(X_valid['CabinNum'], errors='coerce')

    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_eval = lgb.Dataset(X_valid, Y_valid, reference=lgb_train)

    model_lgb = lgb.train(lgbm_params,
                          lgb_train,
                          valid_sets=lgb_eval,
                          num_boost_round=100,
                          callbacks=callbacks
                         )
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    print(accuracy_score(Y_valid, np.round(y_pred)))
    models.append(model_lgb)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 557
[LightGBM] [Info] Number of data points in the train set: 5212, number of used features: 22
[LightGBM] [Info] Start training from score 0.526477
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[45]	valid_0's l2: 0.178087
0.7199846566935174
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 557
[LightGBM] [Info] Number of data points in the train set: 5213, number of used features: 22
[LightGBM] [Info] Start training from score 0.484174
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[26]	valid_0's l2

In [23]:
preds = []
test_X['CabinNum'] = pd.to_numeric(X_train['CabinNum'], errors='coerce')
test_X['CabinNum'] = pd.to_numeric(X_valid['CabinNum'], errors='coerce')
for model in models:
    pred = model.predict(test_X)
    preds.append(pred)

In [24]:
preds_array = np.array(preds)
preds_mean = np.mean(preds_array, axis=0)

In [25]:
preds_int = (preds_mean > 0.5).astype(int)

In [26]:
# 提出データサンプル取得
submission_df = pd.read_csv("s3://datascience-keigo/kaggle-SpaceshipTitanic/submissions/sample_submission.csv")
submission_df

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False
