# Leveraging Transcribe and Comprehend

In [120]:
#!pip install twitterscraper

## Let's look at the audio files

In [4]:
!aws s3 cp s3://dbsbucketpedro/19-198-0002.mp3  .

Completed 166.5 KiB/166.5 KiB (2.1 MiB/s) with 1 file(s) remainingdownload: s3://dbsbucketpedro/19-198-0002.mp3 to ./19-198-0002.mp3


In [5]:
ls

[0m[01;36m19-198-0002.mp3[0m                   headline-classifier-mxnet.ipynb
[01;36m19-227-0000.flac[0m                  news.txt
blazingtext_word2vec_text8.ipynb  [01;31moutput.tar.gz[0m
car_model_detection.ipynb         [01;34mtensorflow_bring_your_own_2019-03-12[0m/
cuisine-classifier.ipynb          test2.json
[01;34mcustom-mxnet[0m/                     [01;34mtf-src[0m/
DeepAR-Electricity.ipynb          Topic+Classifier.ipynb
headline-classifier.ipynb         TwitterNER.ipynb
headline-classifier-local.ipynb   xgboost_customer_churn.ipynb


In [8]:
import IPython
IPython.display.Audio("19-198-0002.mp3")

## Configure transcribe service and create a transcription job

In [11]:
import boto3
import time
transcribe = boto3.client('transcribe')

job_name = "test3"
job_uri = "s3://dbsbucketpedro/19-198-0002.mp3"
output_bucket ="dbsbucketpedro"

transcribe.start_transcription_job(
    TranscriptionJobName=job_name,
    Media={'MediaFileUri': job_uri},
    MediaFormat='mp3',
    LanguageCode='en-US',
    OutputBucketName=output_bucket,
    Settings={
        'ShowSpeakerLabels': False,
        'ChannelIdentification': False
    }
)


while True:
    status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
        break
    print("Not ready yet...")
    time.sleep(5)
print(status)

Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
{'TranscriptionJob': {'TranscriptionJobName': 'test3', 'TranscriptionJobStatus': 'COMPLETED', 'LanguageCode': 'en-US', 'MediaSampleRateHertz': 44100, 'MediaFormat': 'mp3', 'Media': {'MediaFileUri': 's3://dbsbucketpedro/19-198-0002.mp3'}, 'Transcript': {'TranscriptFileUri': 'https://s3.ap-southeast-1.amazonaws.com/dbsbucketpedro/test3.json'}, 'CreationTime': datetime.datetime(2019, 3, 20, 14, 53, 38, 219000, tzinfo=tzlocal()), 'CompletionTime': datetime.datetime(2019, 3, 20, 14, 54, 39, 730000, tzinfo=tzlocal()), 'Settings': {'ChannelIdentification': False}}, 'ResponseMetadata': {'RequestId': '191a960b-4b20-11e9-8dda-5f01cd37d57f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1', 'date': 'Wed, 20 Mar 2019 14:54:44 GMT', 'x-amzn-reques

In [12]:
response = transcribe.get_transcription_job(
    TranscriptionJobName='test3'
)

In [13]:
response

{'TranscriptionJob': {'TranscriptionJobName': 'test3',
  'TranscriptionJobStatus': 'COMPLETED',
  'LanguageCode': 'en-US',
  'MediaSampleRateHertz': 44100,
  'MediaFormat': 'mp3',
  'Media': {'MediaFileUri': 's3://dbsbucketpedro/19-198-0002.mp3'},
  'Transcript': {'TranscriptFileUri': 'https://s3.ap-southeast-1.amazonaws.com/dbsbucketpedro/test3.json'},
  'CreationTime': datetime.datetime(2019, 3, 20, 14, 53, 38, 219000, tzinfo=tzlocal()),
  'CompletionTime': datetime.datetime(2019, 3, 20, 14, 54, 39, 730000, tzinfo=tzlocal()),
  'Settings': {'ChannelIdentification': False}},
 'ResponseMetadata': {'RequestId': '1e3bc13a-4b20-11e9-8a5c-5f889331164c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 20 Mar 2019 14:54:53 GMT',
   'x-amzn-requestid': '1e3bc13a-4b20-11e9-8a5c-5f889331164c',
   'content-length': '436',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

## Let's look at the results of the transcription

In [14]:
!aws s3 cp 's3://dbsbucketpedro/test3.json' .

Completed 4.0 KiB/4.0 KiB (103.0 KiB/s) with 1 file(s) remainingdownload: s3://dbsbucketpedro/test3.json to ./test3.json        


In [20]:
import IPython
IPython.display.Audio("19-198-0002.mp3")

In [19]:
import json
from pprint import pprint

with open('test3.json') as f:
    data = json.load(f)

print(data['results']['transcripts'][0]['transcript'])

Neither the author nor the public have any other concern that, as some observation is necessary upon those parts of the work, which thirteen years have made comparatively obsolete.


## Let's analyse the transcription with Comprehend 

In [21]:
import boto3
import json

comprehend = boto3.client(service_name='comprehend', region_name='us-west-2')
#text = "DBS has greatly improved from last year's results"
text=data['results']['transcripts'][0]['transcript']
print('Calling Sentiment Analysis')
print(json.dumps(comprehend.detect_sentiment(Text=text, LanguageCode='en'), sort_keys=True, indent=4))
print('End of Sentiment Analysis \n')

Calling Sentiment Analysis
{
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "162",
            "content-type": "application/x-amz-json-1.1",
            "date": "Wed, 20 Mar 2019 14:58:14 GMT",
            "x-amzn-requestid": "961bed65-4b20-11e9-a83a-2d0f83498938"
        },
        "HTTPStatusCode": 200,
        "RequestId": "961bed65-4b20-11e9-a83a-2d0f83498938",
        "RetryAttempts": 0
    },
    "Sentiment": "NEGATIVE",
    "SentimentScore": {
        "Mixed": 0.09399497509002686,
        "Negative": 0.8744187951087952,
        "Neutral": 0.02755030058324337,
        "Positive": 0.00403594970703125
    }
}
End of Sentiment Analysis 



## Simple example of Comprehend entity extraction

In [1]:
import boto3
import json

comprehend = boto3.client(service_name='comprehend')
text = "As of January, DBS has outperformed all other major banks in Singapore including HSBC"

print('Calling DetectEntities')
print(json.dumps(comprehend.detect_entities(Text=text, LanguageCode='en'), sort_keys=True, indent=4))
print('End of DetectEntities\n')

Calling DetectEntities
{
    "Entities": [
        {
            "BeginOffset": 0,
            "EndOffset": 7,
            "Score": 0.9994915723800659,
            "Text": "Netflix",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 30,
            "EndOffset": 33,
            "Score": 0.5787808895111084,
            "Text": "DBS",
            "Type": "ORGANIZATION"
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "208",
            "content-type": "application/x-amz-json-1.1",
            "date": "Wed, 20 Mar 2019 14:06:43 GMT",
            "x-amzn-requestid": "64b1570f-4b19-11e9-97c1-eb4070bd8c3c"
        },
        "HTTPStatusCode": 200,
        "RequestId": "64b1570f-4b19-11e9-97c1-eb4070bd8c3c",
        "RetryAttempts": 0
    }
}
End of DetectEntities



## Query twitter messages to run analysis (similar to call center feedbak analysis)

In [23]:
!pip install twitterscraper

Collecting twitterscraper
  Downloading https://files.pythonhosted.org/packages/38/7d/0bf84247b78d7d223914cbf410e1160203a65d39086aaf8c6cad521cec74/twitterscraper-0.9.3.tar.gz
Collecting coala-utils~=0.5.0 (from twitterscraper)
  Downloading https://files.pythonhosted.org/packages/54/00/74ec750cfc4e830f9d1cfdd4d559f3d2d4ba1b834b78d5266446db3fd1d6/coala_utils-0.5.1-py3-none-any.whl
Collecting bs4 (from twitterscraper)
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: twitterscraper, bs4
  Running setup.py bdist_wheel for twitterscraper ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/45/50/9b/70128bca07e2bf8b5ed3f504002e9e74a6eaa5e756341b6931
  Running setup.py bdist_wheel for bs4 ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built 

In [24]:
from twitterscraper import query_tweets


csv = open('output.txt', "w") 
#"w" indicates that you're writing strings to the file

columnTitleRow = "id, tweet\n"
csv.write(columnTitleRow)
i=0
for tweet in query_tweets("Amazon OR DBS", 10):
    uid = str(i)
    text = tweet.text
    row = uid+','+'"'+tweet.text.replace(',','').replace('\n',' ')+'"' + "\n"
    i=i+1
    csv.write(row)

INFO: queries: ['Amazon OR DBS since:2006-03-21 until:2006-11-13', 'Amazon OR DBS since:2006-11-13 until:2007-07-08', 'Amazon OR DBS since:2007-07-08 until:2008-03-02', 'Amazon OR DBS since:2008-03-02 until:2008-10-25', 'Amazon OR DBS since:2008-10-25 until:2009-06-19', 'Amazon OR DBS since:2009-06-19 until:2010-02-12', 'Amazon OR DBS since:2010-02-12 until:2010-10-07', 'Amazon OR DBS since:2010-10-07 until:2011-06-01', 'Amazon OR DBS since:2011-06-01 until:2012-01-25', 'Amazon OR DBS since:2012-01-25 until:2012-09-18', 'Amazon OR DBS since:2012-09-18 until:2013-05-13', 'Amazon OR DBS since:2013-05-13 until:2014-01-06', 'Amazon OR DBS since:2014-01-06 until:2014-08-31', 'Amazon OR DBS since:2014-08-31 until:2015-04-25', 'Amazon OR DBS since:2015-04-25 until:2015-12-19', 'Amazon OR DBS since:2015-12-19 until:2016-08-12', 'Amazon OR DBS since:2016-08-12 until:2017-04-06', 'Amazon OR DBS since:2017-04-06 until:2017-11-30', 'Amazon OR DBS since:2017-11-30 until:2018-07-25', 'Amazon OR DBS 

## Output messages to txt

In [25]:
import pandas as pd
df=pd.read_csv('output.txt')

In [26]:
df.head()

Unnamed: 0,id,tweet
0,0,Amazon Prime you had me at the knock on my door.
1,1,Amazon just recommended Paradigms of Artificia...
2,2,been online window shopping at amazon all day....
3,3,Opening an envelope from Amazon: Maggie Mason'...
4,4,Trying to figure out what to buy from Amazon. ...


In [27]:
df.iloc[:,1].to_csv('tweets.txt')

## Run batch Comprehend api calls

In [28]:
%%time
import sagemaker
from sagemaker import get_execution_role

role=get_execution_role()
print(role)
sess=sagemaker.Session()

arn:aws:iam::349934754982:role/service-role/AmazonSageMaker-ExecutionRole-20180901T102635
CPU times: user 445 ms, sys: 43.8 ms, total: 488 ms
Wall time: 4.01 s


In [29]:
output_bucket='dbsbucketpedro'
s3 = boto3.resource('s3')
txt = open('tweets.txt', 'rb')
s3.Bucket(output_bucket).put_object(Key='twitterdata/input/tweets.txt', Body=txt)

s3.Object(bucket_name='dbsbucketpedro', key='twitterdata/input/tweets.txt')

In [36]:
import boto3
import json
comprehend = boto3.client(service_name='comprehend')

In [37]:

response = comprehend.start_entities_detection_job(
    InputDataConfig={
        'S3Uri': 's3://dbsbucketpedro/twitterdata/input/tweets.txt',
        'InputFormat':'ONE_DOC_PER_LINE'
    },
    OutputDataConfig={
        'S3Uri': 's3://dbsbucketpedro/twitterdata/output/results.txt'
    },
    DataAccessRoleArn=role,
    JobName='tweet-analysis2',
    LanguageCode='en'
)

In [39]:
!aws s3 cp s3://dbsbucketpedro/twitterdata/output/results.txt/349934754982-NER-990f64a5e84f2b7546abeae5970bd30a/output/output.tar.gz .


Completed 32.9 KiB/32.9 KiB (390.0 KiB/s) with 1 file(s) remainingdownload: s3://dbsbucketpedro/twitterdata/output/results.txt/349934754982-NER-990f64a5e84f2b7546abeae5970bd30a/output/output.tar.gz to ./output.tar.gz


In [40]:
!tar -xvf output.tar.gz

output


In [41]:
!mv output output.json

In [42]:
import json
from pprint import pprint

responses=[]
with open('output.json') as f:
    for i in f:
        responses.append(json.loads(i))


In [43]:
len(df)

346

In [44]:
responses[2]['Entities'][1]['Text']

'amazon'

In [45]:
len(responses)

346

In [46]:
responses[1]['Entities'][1]['Text']

'Amazon'

In [47]:
responses[39]

{'Entities': [{'BeginOffset': 0,
   'EndOffset': 2,
   'Score': 0.9226970672607422,
   'Text': '39',
   'Type': 'QUANTITY'},
  {'BeginOffset': 60,
   'EndOffset': 63,
   'Score': 0.4131833016872406,
   'Text': 'one',
   'Type': 'QUANTITY'}],
 'File': 'tweets.txt',
 'Line': 39}

In [48]:
df['Org']='Na'
df['Score']='Na'
for i in range(len(df)):
    try:
        df['Org'][i]=responses[i]['Entities'][1]['Text']
        df['Score'][i]=responses[i]['Entities'][1]['Score']
    except Exception:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [49]:
df.head(30)

Unnamed: 0,id,tweet,Org,Score
0,0,Amazon Prime you had me at the knock on my door.,Amazon,0.996473
1,1,Amazon just recommended Paradigms of Artificia...,Amazon,0.998203
2,2,been online window shopping at amazon all day....,amazon,0.995901
3,3,Opening an envelope from Amazon: Maggie Mason'...,Amazon,0.993774
4,4,Trying to figure out what to buy from Amazon. ...,Amazon,0.998071
5,5,"trying to order the new"" R.E.M. CD from amazon...",R.E.M. CD,0.641165
6,6,just got free Amazon Prime 2-day shipping for ...,Amazon,0.993983
7,7,telene's icon is quite the amazon. goodnight i...,amazon,0.957353
8,8,amazon strippers at 3 o'clock.,amazon,0.98877
9,9,Lost all my album art during the last iTunes i...,iTunes,0.912235
