In [1]:
from boto.s3.connection import S3Connection
from boto.s3.key import Key

# Open the S3 connection used by the cells below. No credentials are passed
# here -- presumably boto picks them up from its config file or the
# environment; verify before running on a new machine.
conn = S3Connection()

In [2]:
# Ask S3 to create the bucket that will host the website and keep a handle.
# Warning: this call fails with a CertificateError when the bucket name
# contains a period, as 'dsci.web' does: the host 'dsci.web.s3.amazonaws.com'
# does not match the certificate name '*.s3.amazonaws.com' (see the error
# output below). The next cell shows the workaround.

website_bucket = conn.create_bucket('dsci.web')

CertificateError: hostname 'dsci.web.s3.amazonaws.com' doesn't match either of '*.s3.amazonaws.com', 's3.amazonaws.com'

In [3]:
# The call above failed because the bucket name contains a period, so the
# bucket's hostname no longer matches the S3 wildcard certificate.
# Workaround: make the unverified SSL context the default for HTTPS.
#
# WARNING: this swaps the default HTTPS context factory for the whole kernel,
# not only for this bucket, so every later HTTPS connection that relies on
# the default context is affected as well.
# NOTE(review): boto's OrdinaryCallingFormat (path-style URLs) looks like a
# way to avoid the hostname mismatch without touching ssl -- confirm and
# consider switching.

import ssl
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python has no ssl._create_unverified_context; nothing to patch.
    pass

In [4]:
# Now let's try the same create_bucket call again with the workaround in place.

website_bucket = conn.create_bucket('dsci.web')

In [5]:
# Bucket policy that lets anyone ("Principal": "*") perform s3:GetObject on
# every object in the bucket; %s is filled in with the bucket name.
public_read_policy_template = '''{
  "Version":"2012-10-17",
  "Statement": [{
    "Sid": "Allow Public Access to All Objects",
    "Effect": "Allow",
    "Principal": "*",
    "Action": "s3:GetObject",
    "Resource": "arn:aws:s3:::%s/*"
  }
 ]
}'''

public_read_policy = public_read_policy_template % website_bucket.name
website_bucket.set_policy(public_read_policy)

True

In [6]:
# Landing page for the site: a minimal HTML document, written one line per
# literal so the exact bytes (including the trailing newline) are explicit.
index_html = (
    '<!DOCTYPE html>\n'
    '<html>\n'
    '  <body>\n'
    '    <p>Hello, World!</p>\n'
    '  </body>\n'
    '</html>\n'
)

In [7]:
# Upload the landing page as the object 'index.html'.
index_key = website_bucket.new_key('index.html')
# Set the content type before the upload call (presumably sent as the
# object's Content-Type so browsers treat it as HTML -- confirm).
index_key.content_type = 'text/html'
# policy='public-read' is presumably boto's canned ACL for the object. The
# value shown below (75) equals len(index_html), i.e. likely bytes written.
index_key.set_contents_from_string(index_html, policy='public-read')

75

In [8]:
# Error page for the site: same minimal HTML skeleton as the landing page,
# written one line per literal so the exact bytes are explicit.
error_html = (
    '<!DOCTYPE html>\n'
    '<html>\n'
    '  <body>\n'
    '    <p>This is an error page.</p>\n'
    '  </body>\n'
    '</html>\n'
)

In [9]:
# Upload the error page as the object 'error.html'.
error_key = website_bucket.new_key('error.html')
# Set the content type before the upload call (presumably sent as the
# object's Content-Type so browsers treat it as HTML -- confirm).
error_key.content_type = 'text/html'
# policy='public-read' is presumably boto's canned ACL for the object. The
# value shown below (84) equals len(error_html), i.e. likely bytes written.
error_key.set_contents_from_string(error_html, policy='public-read')

84

In [10]:
# Configure the bucket as a website, naming the two objects uploaded above --
# presumably as the index document and the error document, in that order;
# confirm against the boto documentation.
website_bucket.configure_website('index.html', 'error.html')

True

---

In [11]:
from boto.s3.connection import S3Connection
from boto.s3.key import Key

# Same imports and connection setup as the first cell; this rebinds `conn`
# to a new S3Connection.
conn = S3Connection()

In [12]:
!curl -o shakespeare-sonnets.txt http://www.gutenberg.org/cache/epub/1041/pg1041.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  119k  100  119k    0     0   203k      0 --:--:-- --:--:-- --:--:--  217k


In [13]:
# Look up the existing website bucket (get_bucket, not create_bucket).
website_bucket = conn.get_bucket('dsci.web')

In [14]:
# Upload the downloaded file to the bucket under the same name.
# Unlike the HTML uploads, no content type or policy argument is given here.
k = website_bucket.new_key('shakespeare-sonnets.txt')
k.set_contents_from_filename('shakespeare-sonnets.txt')

122777

In [15]:
# Read the uploaded object back from S3 and preview its first ten lines.
sonnets = website_bucket.get_key('shakespeare-sonnets.txt')
text = sonnets.get_contents_as_string()

# A single print of the newline-joined slice emits the same output as
# printing each of the ten lines in turn.
print('\n'.join(text.split('\n')[:10]))

﻿The Project Gutenberg EBook of Shakespeare's Sonnets, by William Shakespeare

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org


Title: Shakespeare's Sonnets



In [16]:
# Peek at the first ten lower-cased, whitespace-separated tokens. In the
# output below the first token still carries the UTF-8 byte-order mark
# ('\xef\xbb\xbf') and punctuation stays attached to words ('sonnets,').
text.lower().split()[:10]

['\xef\xbb\xbfthe',
 'project',
 'gutenberg',
 'ebook',
 'of',
 "shakespeare's",
 'sonnets,',
 'by',
 'william',
 'shakespeare']

In [17]:
from collections import Counter

In [18]:
# Count lower-cased, whitespace-separated tokens.
# The downloaded text starts with a UTF-8 byte-order mark; left in place it
# turns the first word into '\xef\xbb\xbfthe', which is tallied separately
# from 'the'. Strip the BOM from the copy that is counted (`text` itself is
# left unchanged).
import codecs

if text.startswith(codecs.BOM_UTF8):
    countable_text = text[len(codecs.BOM_UTF8):]
else:
    countable_text = text
wc = Counter(countable_text.lower().split())

In [19]:
import pandas as pd

In [20]:
# Tabulate the 20 most frequent tokens, one (token, count) pair per row.
top_tokens = wc.most_common(20)
wc_frame = pd.DataFrame(top_tokens)

In [21]:
# Number the rows 1..N so the index reads as a rank.
# Assigning an absolute range (instead of index + 1) keeps the cell
# idempotent: re-running it no longer shifts the ranks by one each time.
wc_frame.index = range(1, len(wc_frame) + 1)

In [22]:
# Publish the frequency table as a publicly readable plain-text object.
output_file = website_bucket.new_key('shakespeare-word-freq.txt')
# 'text' alone is not a valid MIME type (a type/subtype pair is required);
# 'text/plain' is the registered type for plain text.
output_file.content_type = 'text/plain'
output_file.set_contents_from_string(wc_frame.to_string(), policy='public-read')

293

[dsci.web/shakespeare-word-freq.txt](http://dsci.web.s3-website-us-east-1.amazonaws.com/shakespeare-word-freq.txt)