Amazon SageMaker Studioを使用し、Kernel Python 3 (Data Science) にて動作確認しています。

In [None]:
# kaggle API install
!pip install kaggle

In [None]:
## Store kaggle.json at ~/.kaggle/kaggle.json
# The commands below are intentionally left commented out; uncomment and run
# them once. They assume kaggle.json is in the current working directory.
# chmod 600 restricts the credential file to the owner.
#!mkdir ~/.kaggle
#!mv kaggle.json ~/.kaggle/.
#!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# List competitions with the kaggle command-line tool
!kaggle competitions list

In [None]:
# Check the same thing from Python code
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
# NOTE(review): presumably reads the credentials from ~/.kaggle/kaggle.json — confirm
api.authenticate()
# NOTE(review): presumably prints the same listing as the CLI command — confirm
api.competitions_list_cli()

In [None]:
# Inspect the files of the House Prices competition dataset
!kaggle competitions files -c "house-prices-advanced-regression-techniques"

In [None]:
# Download the House Prices competition dataset
!kaggle competitions download -c "house-prices-advanced-regression-techniques"

In [None]:
# Extract the downloaded competition archive into the current directory.
# The standard-library zipfile module replaces `apt-get install unzip` + `unzip`:
# it needs no package installation or root privileges, and it overwrites
# existing files without asking, so re-running this cell never blocks on an
# interactive prompt.
import zipfile

with zipfile.ZipFile('house-prices-advanced-regression-techniques.zip') as archive:
    archive.extractall()

In [None]:
# Inspect the data
# Loads train.csv from the working directory
# (assumes it was extracted from the competition archive — TODO confirm).
import pandas as pd
train_data = pd.read_csv('train.csv')
# Last expression of the cell: rendered as the cell output
train_data

In [None]:
# Load test.csv from the working directory and display it
test_data = pd.read_csv('test.csv')
test_data

In [None]:
# Load sample_submission.csv from the working directory and display it
ss_data = pd.read_csv('sample_submission.csv')
ss_data

In [None]:
# Setup SageMaker
import sagemaker
import boto3
from sagemaker import get_execution_role

# Region configured for the default boto3 session
region = boto3.Session().region_name

# SageMaker session, its default S3 bucket, and the key prefix used for
# every object this notebook writes to that bucket
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'sagemaker/autopilot-house-prices'

# Execution role passed to the SageMaker jobs/models created below
role = get_execution_role()

# Low-level boto3 SageMaker client used for the AutoML, model and transform API calls
sm = boto3.Session().client(service_name='sagemaker',region_name=region)

In [None]:
# Drop the Id column.
# drop() returns new frames, so train_data / test_data keep their Id column
# (test_data's Id is needed again when building the submission file).
train_data_noindex = train_data.drop(columns=['Id'])
test_data_noindex = test_data.drop(columns=['Id'])

In [None]:
# Write both frames to local CSV files and upload them to S3

# Training set: keeps the header row
train_file = 'train_data.csv'
train_data_noindex.to_csv(train_file, header=True, index=False)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=f'{prefix}/train')
print(f'Train data uploaded to: {train_data_s3_path}')

# Test set: written without a header row because it is used as batch-inference input
test_file = 'test_data.csv'
test_data_noindex.to_csv(test_file, header=False, index=False)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=f'{prefix}/test')
print(f'Test data uploaded to: {test_data_s3_path}')

In [None]:
# SageMaker Autopilot job configuration

# Input: every object under the training prefix in the bucket
input_data_config = [
    {
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': f's3://{bucket}/{prefix}/train',
            },
        },
        # Name of the column to predict: 'SalePrice' for this competition
        'TargetAttributeName': 'SalePrice',
    },
]

# Output: S3 location given to the job as its output path
output_data_config = {'S3OutputPath': f's3://{bucket}/{prefix}/output'}

In [None]:
# Launch the SageMaker Autopilot job
# `sleep` is imported here as well; it is used by the status-polling cells further down.
from time import gmtime, strftime, sleep
# UTC timestamp (YYYYmmddHHMMSS) used to make job/model names unique per run
timestamp_suffix = strftime('%Y%m%d%H%M%S', gmtime())

auto_ml_job_name = 'automl-houseprice-' + timestamp_suffix    # AutoMLJobName must be at most 32 characters
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig={'CompletionCriteria': {'MaxCandidates': 100}},   # finish after at most 100 candidates
                      RoleArn=role)

In [None]:
# Poll the Autopilot job status until it reaches a terminal state.
# Each iteration describes the job once and prints "status - secondary status".
# The terminal check happens before sleeping, so the cell returns as soon as
# the job has finished instead of waiting one extra polling interval.
print ('JobStatus - Secondary Status')
print('------------------------------')

while True:
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    print (job_run_status + " - " + describe_response['AutoMLJobSecondaryStatus'])

    if job_run_status in ('Failed', 'Completed', 'Stopped'):
        break
    sleep(30)   # polling interval in seconds

# Surface the reason when the job failed, so the cause is visible in the notebook
if job_run_status == 'Failed':
    print ('FailureReason: ' + str(describe_response.get('FailureReason')))

In [None]:
# Result: fetch the best candidate of the finished Autopilot job and report
# its name and final objective metric
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

print(best_candidate)
print('\n')
print(f"CandidateName: {best_candidate_name}")
print(f"FinalAutoMLJobObjectiveMetricName: {best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName']}")
print(f"FinalAutoMLJobObjectiveMetricValue: {best_candidate['FinalAutoMLJobObjectiveMetric']['Value']}")

In [None]:
# Try batch inference with the best model candidate
# The model name reuses the timestamp suffix generated when the Autopilot job was launched.
model_name = 'automl-houseprice-model-' + timestamp_suffix

# Register a SageMaker model built from the best candidate's inference containers
model = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

print('Model ARN corresponding to the best candidate is : {}'.format(model['ModelArn']))

In [None]:
# Configure and start a batch transform job that runs the model over the uploaded test CSV
transform_job_name = 'automl-houseprice-transform-' + timestamp_suffix

# Input: the uploaded test CSV, uncompressed, split into records line by line
transform_input = {
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': test_data_s3_path
        }
    },
    'ContentType': 'text/csv',
    'CompressionType': 'None',
    'SplitType': 'Line'
}

# Output: results are written under <prefix>/inference-results in the same bucket
transform_output = {
    'S3OutputPath': 's3://{}/{}/inference-results'.format(bucket,prefix),
}

# Compute resources for the job: a single ml.m5.4xlarge instance
transform_resources = {
    'InstanceType': 'ml.m5.4xlarge',
    'InstanceCount': 1
}

sm.create_transform_job(TransformJobName = transform_job_name,
                        ModelName = model_name,
                        TransformInput = transform_input,
                        TransformOutput = transform_output,
                        TransformResources = transform_resources
)

In [None]:
# Wait until the batch transform job reaches a terminal state.
# Each iteration describes the job once and prints its status. The terminal
# check happens before sleeping, so the cell returns as soon as the job has
# finished instead of waiting one extra polling interval.
print ('JobStatus')
print('----------')

while True:
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print (job_run_status)

    if job_run_status in ('Failed', 'Completed', 'Stopped'):
        break
    sleep(30)   # polling interval in seconds

# Surface the reason when the job failed, so the cause is visible in the notebook
if job_run_status == 'Failed':
    print ('FailureReason: ' + str(describe_response.get('FailureReason')))

In [None]:
# Show the inference results:
# download the transform output object from the default bucket and load it
# as a single-column frame named 'SalePrice'
s3_output_key = f'{prefix}/inference-results/test_data.csv.out'
local_inference_results_path = 'inference_results.csv'

s3 = boto3.resource('s3')
inference_results_bucket = s3.Bucket(session.default_bucket())
inference_results_bucket.download_file(s3_output_key, local_inference_results_path)

pd.set_option('display.max_rows', 10)         # Keep the output on one page
data = pd.read_csv(local_inference_results_path, sep=',', names=['SalePrice'])
data

In [None]:
# Build the submission data: attach the Id column of the test set to the predictions.
# The merge below is an inner join on the positional index, so a prediction
# count that differs from the number of test rows would silently drop rows.
# Check the counts first so an incomplete submission is never written.
assert len(data) == len(test_data), (
    'prediction count {} does not match test row count {}'.format(len(data), len(test_data))
)

submit_data = pd.merge(test_data[['Id']], data, left_index=True, right_index=True)  # join the Id column by row position
submit_data.to_csv('submit.csv', index=False, header=True)
submit_data

In [None]:
# Submit submit.csv to the Kaggle competition with the message "autopilot test"
!kaggle competitions submit -f submit.csv -m "autopilot test" "house-prices-advanced-regression-techniques"