# 概要
* livedoorニュースの記事に対して Comprehend の topic model を使ってみる

In [None]:
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
!tar -zxvf ldcc-20140209.tar.gz
!rm text/CHANGES.txt
!rm text/README.txt
!rm text/topic-news/LICENSE.tx
!rm text/dokujo-tsushin/LICENSE.txt

In [None]:
import boto3,time,sagemaker
import pandas as pd

In [None]:
# Amazon Comprehend API client; region and credentials come from the default boto3 configuration.
client = boto3.client(service_name='comprehend')

In [None]:
# Settings
# Default SageMaker bucket for the current session.
BUCKET = sagemaker.Session().default_bucket()
# S3 prefixes the Comprehend job reads documents from and writes results to.
INPUT_S3_LOCATION = f's3://{BUCKET}/comprehend/topic_model/input'
OUTPUT_S3_LOCATION = f's3://{BUCKET}/comprehend/topic_model/output'
JOBNAME = 'MySecondJob'
# NOTE(review): account-specific role ARN; presumably it must grant Comprehend access
# to BUCKET. Replace it with a role from your own account before running.
ROLE = 'arn:aws:iam::155580384669:role/comprehend_role'

In [None]:
%%time
sagemaker.s3.S3Uploader.upload('./text/',INPUT_S3_LOCATION)

In [None]:
# Submit the topic-detection job and keep its id for the status lookup below.
response = client.start_topics_detection_job(
    JobName=JOBNAME,
    DataAccessRoleArn=ROLE,
    NumberOfTopics=10,
    InputDataConfig=dict(
        S3Uri=INPUT_S3_LOCATION,
        # Each file is treated as one document; ONE_DOC_PER_LINE
        # (one document per line) is the other available format.
        InputFormat='ONE_DOC_PER_FILE',
    ),
    OutputDataConfig=dict(S3Uri=OUTPUT_S3_LOCATION),
)
JOBID = response['JobId']

In [None]:
# The topic-detection job runs asynchronously, so block here until it reaches a
# terminal state. Otherwise a top-to-bottom run reaches the download cell before
# the result archive exists.
POLL_INTERVAL_SEC = 60
TERMINAL_STATUSES = ('COMPLETED', 'FAILED', 'STOPPED')

while True:
    job_properties = client.describe_topics_detection_job(JobId=JOBID)['TopicsDetectionJobProperties']
    job_status = job_properties['JobStatus']
    if job_status in TERMINAL_STATUSES:
        break
    time.sleep(POLL_INTERVAL_SEC)

# Fail loudly instead of letting the download cell fail on a missing archive.
if job_status != 'COMPLETED':
    raise RuntimeError(
        'Topics detection job {} ended with status {}: {}'.format(
            JOBID, job_status, job_properties.get('Message', '(no message)')
        )
    )

# Output location reported by the service for this job.
OUTPUT_S3_FILE_LOCATION = job_properties['OutputDataConfig']['S3Uri']
print(OUTPUT_S3_FILE_LOCATION)

In [None]:
# Download the job output from S3 into the current working directory.
sagemaker.s3.S3Downloader.download(
    s3_uri=OUTPUT_S3_FILE_LOCATION,
    local_path='./',
)

In [None]:
!tar zxvf output.tar.gz

In [None]:
# Preview the first rows of doc-topics.csv.
pd.read_csv('./doc-topics.csv').head(n=5)

In [None]:
# Preview the first rows of topic-terms.csv.
pd.read_csv('./topic-terms.csv').head(n=5)