# Cloud Storage

## Amazon S3 Data Warehousing

In [101]:
from __future__ import print_function

In [1]:
import os
import sys
import operator
import functools
import itertools
import boto
import warnings
import json
import pandas as pd
import graphlab as gl
from textblob import TextBlob
from os.path import join as jp

try:
    from configparser import ConfigParser
except ImportError:
    from ConfigParser import ConfigParser
    
# Point GraphLab Canvas at the 'ipynb' target so that .show() calls later in
# the notebook emit their output into the notebook itself
# (NOTE(review): presumably instead of opening a separate browser tab -- confirm).
gl.canvas.set_target('ipynb')

## S3 Remote Data Warehousing

In [4]:
import matplotlib.pyplot as plt

In [5]:
def aws_config(cfg):
    """
    Pick the credentials profile (config section) to sign in with.

    Prefers a section named after the current OS user and otherwise falls
    back to the first section found in ``cfg``.

    Parameters
    ----------
    cfg : ConfigParser
        Parser already loaded with a credentials file. Only its
        ``sections()`` method is used.

    Returns
    -------
    str
        Name of the selected section.

    Raises
    ------
    IndexError
        If ``cfg`` contains no sections at all.
    """
    # Function-scope import keeps this cell self-contained.
    import getpass

    try:
        # os.getlogin() needs a controlling terminal and raises OSError inside
        # notebook kernels and daemons; getpass.getuser() consults the
        # environment (LOGNAME, USER, ...) first.
        user = getpass.getuser()
    except (KeyError, ImportError, OSError):
        # No user name available: fall through to the first profile.
        user = None

    sections = cfg.sections()
    if user in sections:
        return user
    if not sections:
        raise IndexError('no profiles found in the AWS credentials configuration')
    return sections[0]

def s3_signin(**auth):
    """
    Open an S3 connection from explicit keys or the local credentials file.

    If both ``aws_access_key_id`` and ``aws_secret_access_key`` are passed as
    non-empty keyword arguments they are used directly. Otherwise both keys
    are read from ``~/.aws/credentials``, using the profile selected by
    ``aws_config``.

    Parameters
    ----------
    **auth
        Optional ``aws_access_key_id`` / ``aws_secret_access_key`` values.
        Any other keyword is ignored.

    Returns
    -------
    The connection object returned by ``boto.connect_s3``.

    Raises
    ------
    ValueError
        If no complete key pair can be found.
    """
    token_ids = ('aws_access_key_id', 'aws_secret_access_key')

    # dict.get() replaces dict.has_key(), which does not exist on Python 3.
    if all(auth.get(key) for key in token_ids):
        # Explicit, complete key pair: forward BOTH keys as given.
        user_id = {key: auth[key] for key in token_ids}
    else:
        cfg = ConfigParser()
        # expanduser() still resolves a home directory when $HOME is unset;
        # read() silently skips a missing file, leaving cfg empty.
        cfg.read(jp(os.path.expanduser('~'), '.aws', 'credentials'))

        user_id = dict.fromkeys(token_ids)
        # Only pick a profile when the file actually provided one, so that a
        # missing/empty file ends in the ValueError below.
        if cfg.sections():
            account = aws_config(cfg)
            for key in token_ids:
                if cfg.has_option(account, key):
                    user_id[key] = cfg.get(account, key)

    if not all(user_id.values()):
        raise ValueError('No valid authorization found')

    return boto.connect_s3(**user_id)

## Key and Configuration Management

In [6]:
s3 = s3_signin()

# Hand the same key pair to GraphLab. The aws_* properties are read here
# rather than the gs_* names, which are boto's Google Storage spellings of
# the same values and are confusing next to an S3 connection.
gl.aws.set_credentials(s3.aws_access_key_id, s3.aws_secret_access_key)

## Streaming Remote IO

In [7]:
def remote_json_loader(filename):
    """
    Read a line-delimited JSON file into an SFrame.

    Each line of ``filename`` is read as one row of a single headerless
    column (``X1``); that column is then unpacked into separate columns
    with no name prefix.
    """
    raw_rows = gl.SFrame.read_csv(filename, header=False, delimiter='\n')
    unpacked = raw_rows.unpack('X1', column_name_prefix='')
    return unpacked

def gen_data_url(s3, bucket, dataset):
    """
    Yield an ``s3://<bucket>/<key>`` URL for each entry listed under ``dataset``.

    Parameters
    ----------
    s3
        Connection object exposing ``get_bucket(name)``.
    bucket : str
        Name of the bucket to look up.
    dataset : str
        Value passed to the bucket's ``list()`` call
        (NOTE(review): presumably a key prefix -- confirm against boto).

    Yields
    ------
    str
        One URL per listed entry, in listing order.
    """
    s3_dir = s3.get_bucket(bucket)
    # Stream straight from the listing instead of materialising every URL
    # before the first yield, so callers that stop early do not pay for the
    # whole bucket listing.
    for entry in s3_dir.list(dataset):
        yield '/'.join(['s3:/', s3_dir.name, entry.name])

def flatten(sf):
    """
    Return a copy of ``sf`` with its dict- and list-typed columns unpacked.

    Each such column is unpacked once, with an empty column-name prefix;
    columns of any other type are left as they are. ``sf`` itself is not
    reassigned here -- the work is done on a copy.
    """
    nested_types = [dict, list]
    column_types = zip(sf.column_names(), gl.SFrame.dtype(sf))
    nested_columns = {
        name: kind for name, kind in dict(column_types).items()
        if kind in nested_types
    }

    flat = gl.SFrame.copy(sf)
    for column in nested_columns:
        flat = flat.unpack(column, column_name_prefix='')
    return flat

# This Is Going to Take a While

In [43]:
def unpack_json(fp, index=None):
    """
    Load a line-delimited JSON file into a pandas DataFrame.

    Parameters
    ----------
    fp : str
        Path handed to the built-in ``open()``; one JSON document per line.
    index : optional
        Forwarded to ``pandas.DataFrame.from_records`` as ``index``.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line of the file.
    """
    # The context manager closes the handle even if a line fails to parse.
    with open(fp) as handle:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # json.loads() would otherwise reject.
        records = [json.loads(line) for line in handle if line.strip()]
    return pd.DataFrame.from_records(records, index=index)

In [47]:
# `pack_json` is not defined in the visible notebook, so the original calls
# raise NameError on a fresh kernel. `unpack_json` is not a drop-in
# replacement because it goes through the built-in open(), which cannot read
# s3:// URLs, so the notebook's remote loader is used instead.
# NOTE(review): the user file sits directly under yelp/ while the other two
# are under yelp/Data/yelp/ -- confirm the bucket layout.
user     = remote_json_loader('s3://ds3-machine-learning/yelp/yelp_academic_dataset_user.json')
review   = remote_json_loader('s3://ds3-machine-learning/yelp/Data/yelp/yelp_academic_dataset_review.json')
business = remote_json_loader('s3://ds3-machine-learning/yelp/Data/yelp/yelp_academic_dataset_business.json')

# An Hour and 8 GB of Memory Later

In [57]:
# Wrap each loaded table in a GraphLab SFrame, in the same order as before:
# business, user, review.
sf_business, sf_user, sf_review = (
    gl.SFrame(table) for table in (business, user, review)
)

# Join Values using Business ID

In [59]:
# Attach business attributes to every review, then give the clashing /
# ambiguous columns explicit business_* names.
# NOTE(review): the '.1' suffixes are presumably what the join assigns to
# business columns whose names already exist on the review side -- confirm.
review_business_table = (
    sf_review
    .join(sf_business, on='business_id', how='inner')
    .rename({
        'stars.1':      'business_avg_stars',
        'type.1':       'business_type',
        'review_count': 'business_review_count',
    })
)

In [60]:
# Attach user attributes to the review/business table, then give the
# user-side columns explicit user_* names.
# NOTE(review): as above, the '.1' names are presumably join-generated for
# columns that exist on both sides -- confirm.
user_business_review_table = (
    review_business_table
    .join(sf_user, on='user_id', how='inner')
    .rename({
        'name.1':        'user_name',
        'type.1':        'user_type',
        'average_stars': 'user_avg_stars',
        'review_count':  'user_review_count',
    })
)

# Split Testing and Training Set

In [61]:
# Seeded random split: fraction 0.8 goes to the first (training) frame.
train_set, test_set = user_business_review_table.random_split(fraction=0.8, seed=1)

In [63]:
# Visual check of the 'city' column, which is used as a model feature below.
train_set['city'].show()

<IPython.core.display.Javascript object>

# Train Model

In [64]:
# First model: predict the review's star rating from the user/business
# averages, their review counts, and the business city.
model = gl.linear_regression.create(
    train_set,
    target='stars',
    features=[
        'user_avg_stars',
        'business_avg_stars',
        'user_review_count',
        'business_review_count',
        'city',
    ],
)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 1192538
PROGRESS: Number of features          : 5
PROGRESS: Number of unpacked features : 5
PROGRESS: Number of coefficients    : 375
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 1.106857     | 4.881071           | 4.07

In [65]:
# Score the model just trained on the held-out split.
model.evaluate(test_set)

{'max_error': 4.8425586122850355, 'rmse': 1.0301833730270589}

In [66]:
# Print the trained model's summary report.
model.summary()

Class                         : LinearRegression

Schema
------
Number of coefficients        : 375
Number of examples            : 1192538
Number of feature columns     : 5
Number of unpacked features   : 5

Hyperparameters
---------------
L1 penalty                    : 0.0
L2 penalty                    : 0.01

Training Summary
----------------
Solver                        : auto
Solver iterations             : 1
Solver status                 : SUCCESS: Optimal solution found.
Training time (sec)           : 1.5014

Settings
--------
Residual sum of squares       : 1258545.0322
Training RMSE                 : 1.0273

Highest Positive Coefficients
-----------------------------
city[Stowe Township]          : 1.4463
city[Delmont]                 : 0.9832
city[Bruchsal]                : 0.8559
city[Victoria Park]           : 0.845
user_avg_stars                : 0.811

Lowest Negative Coefficients
----------------------------
(intercept)                   : -2.1048
city[Balerno]       

# Categorical Features at Scale

In [67]:
# Second model: add the raw user and business identifiers as features and
# cap the solver at 10 iterations.
model = gl.linear_regression.create(
    train_set,
    target='stars',
    features=[
        'user_id',
        'business_id',
        'user_avg_stars',
        'business_avg_stars',
    ],
    max_iterations=10,
)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 1192485
PROGRESS: Number of features          : 4
PROGRESS: Number of unpacked features : 4
PROGRESS: Number of coefficients    : 377891
PROGRESS: Starting L-BFGS
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+-----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 6        | 0.000000

# Dictionary and List Features

In [68]:
# Peek at the first three values of the 'votes' column.
train_set['votes'].head(3)

dtype: dict
Rows: 3
[{'funny': 0, 'useful': 2, 'cool': 1}, {'funny': 0, 'useful': 2, 'cool': 0}, {'funny': 0, 'useful': 1, 'cool': 1}]

In [69]:
def tags_to_dict(tags):
    """
    Turn an iterable of tags into a ``{tag: 1}`` indicator dictionary.

    Parameters
    ----------
    tags : iterable or None
        Tag values; duplicates collapse into a single key.

    Returns
    -------
    dict
        Every distinct tag mapped to ``1``; empty for ``None`` or no tags.
    """
    # A row without any tag list becomes "no tags" instead of a TypeError.
    if tags is None:
        return {}
    return dict.fromkeys(tags, 1)

# Using Review Category Tags

In [70]:
# Build a {category: 1} dictionary for every training row and store it as a
# new column, then preview the first five values.
train_category_flags = train_set.apply(lambda review: tags_to_dict(review['categories']))
train_set['categories_dict'] = train_category_flags
train_set['categories_dict'].head(5)

dtype: dict
Rows: 5
[{'Health & Medical': 1, 'Doctors': 1}, {'Health & Medical': 1, 'Doctors': 1}, {'Health & Medical': 1, 'Doctors': 1}, {'Health & Medical': 1, 'Doctors': 1}, {'Health & Medical': 1, 'Doctors': 1}]

In [71]:
# Third model: add the dictionary-valued columns ('categories_dict', 'votes')
# to the feature list.
model = gl.linear_regression.create(
    train_set,
    target='stars',
    features=[
        'user_id',
        'business_id',
        'categories_dict',
        'user_avg_stars',
        'votes',
        'business_avg_stars',
    ],
)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 1192412
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 788
PROGRESS: Number of coefficients    : 378640
PROGRESS: Starting L-BFGS
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+-----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 6        | 0.0000

# Text Data: Using Raw Review Data


In [72]:
# Look at one raw review text from the training split.
train_set['text'].head(1)

dtype: str
Rows: 1
["dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."]

In [114]:
# Lazily wrap the held-out review texts in TextBlob objects; the islice below
# means only the first ten are ever constructed.
gen_blobs = (TextBlob(review_text) for review_text in test_set['text'])
sample    = itertools.islice(gen_blobs, 10)

for blob in sample:
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity

    print("Calculated Polarity and Subjectivity")
    print("====================================")
    print(polarity)
    print(subjectivity, end='\n\n')
    print(blob)
    print("----------\n")

Calculated Polarity and Subjectivity
0.13125
0.440773809524

I'm writing this review to give you a heads up before you see this Doctor. The office staff and administration are very unprofessional. I left a message with multiple people regarding my bill, and no one ever called me back. I had to hound them to get an answer about my bill. 

Second, and most important, make sure your insurance is going to cover Dr. Goldberg's visits and blood work. He recommended to me that I get a physical, and he knew I was a student because I told him. I got the physical done. Later, I found out my health insurance doesn't pay for preventative visits. I received an $800.00 bill for the blood work. I can't pay for my bill because I'm a student and don't have any cash flow at this current time. I can't believe the Doctor wouldn't give me a heads up to make sure my insurance would cover work that wasn't necessary and was strictly preventative. The office can't do anything to help me cover the bill. In addi

In [115]:
# Per-review word counts. The column already carries its final name even
# though, at this point, it holds counts for every word in the text.
train_word_counts = gl.text_analytics.count_words(train_set['text'])
train_set['negative_review_tags'] = train_word_counts

In [74]:
# Hand-picked vocabulary of negative words.
bad_review_words = (
    'hate', 'terrible', 'awful', 'spit',
    'disgusting', 'filthy', 'tasteless', 'rude',
    'dirty', 'slow', 'poor', 'late',
    'angry', 'flies', 'disappointed', 'disappointing',
    'wait', 'waiting', 'dreadful', 'appalling',
    'horrific', 'horrifying', 'horrible', 'horrendous',
    'atrocious', 'abominable', 'deplorable', 'abhorrent',
    'frightful', 'shocking', 'hideous', 'ghastly',
    'grim', 'dire', 'unspeakable', 'gruesome',
)

# exclude=False keeps only the listed keys in each row's word-count dict.
train_negative_counts = train_set['negative_review_tags'].dict_trim_by_keys(
    bad_review_words, exclude=False
)
train_set['negative_review_tags'] = train_negative_counts

In [76]:
# Fourth model: same features as before plus the trimmed negative-word counts.
model = gl.linear_regression.create(
    train_set,
    target='stars',
    features=[
        'user_id',
        'business_id',
        'categories_dict',
        'negative_review_tags',
        'user_avg_stars',
        'votes',
        'business_avg_stars',
    ],
)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 1192662
PROGRESS: Number of features          : 7
PROGRESS: Number of unpacked features : 822
PROGRESS: Number of coefficients    : 378570
PROGRESS: Starting L-BFGS
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+-----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 6        | 0.0000

In [78]:
# Build the same {category: 1} column on the held-out split so it has the
# feature columns the model was trained with, then preview five values.
test_category_flags = test_set.apply(lambda review: tags_to_dict(review['categories']))
test_set['categories_dict'] = test_category_flags
test_set['categories_dict'].head(5)

dtype: dict
Rows: 5
[{'Health & Medical': 1, 'Doctors': 1}, {'Health & Medical': 1, 'Doctors': 1}, {'Mini Golf': 1, 'Golf': 1, 'Active Life': 1}, {'Mini Golf': 1, 'Golf': 1, 'Active Life': 1}, {'Bars': 1, 'Restaurants': 1, 'Nightlife': 1, 'Lounges': 1, 'American (New)': 1}]

In [77]:
# Word counts for the held-out reviews, trimmed to the negative vocabulary
# (exclude=False keeps only the listed keys).
test_word_counts = gl.text_analytics.count_words(test_set['text'])
test_set['negative_review_tags'] = test_word_counts.dict_trim_by_keys(
    bad_review_words, exclude=False
)

# Score the most recently trained model on the held-out split.
model.evaluate(test_set)

{'max_error': 8.657070551187037, 'rmse': 1.1805045828921503}