In [2]:
# %load pyspark_init.py
"""
Load packages and create context objects...
"""
import os
import platform
import sys
if not 'sc' in vars():
    sys.path.append('/usr/hdp/2.4.2.0-258/spark/python')
    os.environ["SPARK_HOME"] = '/usr/hdp/2.4.2.0-258/spark'
    os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.11:1.2.0 pyspark-shell'
    import py4j
    import pyspark
    from pyspark.context import SparkContext, SparkConf
    from pyspark.sql import SQLContext, HiveContext
    from pyspark.storagelevel import StorageLevel
    sc = SparkContext()
    import atexit
    atexit.register(lambda: sc.stop())
    print("""Welcome to
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version %s
          /_/
    """ % sc.version)
else:
    print("""Already running
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version %s
          /_/
    """ % sc.version)

if not 'sqlCtx' in vars():
    sqlCtx = SQLContext(sc)
print 'Spark Context available as `sc`'
print 'Spark SQL Context (%s) available as `sqlCtx`'%str(type(sqlCtx))
print "Monitor this application at http://arc.insight.gsu.edu:8088/proxy/"+sc.applicationId



Already running
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version 1.6.1
          /_/
    
Spark Context available as `sc`
Spark SQL Context (<class 'pyspark.sql.context.HiveContext'>) available as `sqlCtx`
Monitor this application at http://arc.insight.gsu.edu:8088/proxy/application_1484597252711_0272


In [25]:
review_rdd = sc.textFile('/Users/Peter/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json').sample(False, 0.01, 42)

In [5]:
review_rdd.first()

u'{"votes": {"funny": 0, "useful": 1, "cool": 0}, "user_id": "t6OkrXgpcAZfJa2om9QO4A", "review_id": "QSu0l7koHMlTIhWbiiKMxg", "stars": 3, "date": "2015-07-02", "text": "Friendly local bar with great service and good food,  won\'t be disappointed if you like bar food at a reasonable price.  Good place to go for \\"Friday fish sandwich \\"", "type": "review", "business_id": "KayYbHCt-RkbGcPdGOThNg"}'

In [27]:
rtrain_rdd, rtest_rdd = review_rdd.randomSplit([0.8, 0.2])

In [7]:
rtrain_rdd.count()

21422

In [None]:
text =  "Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road."

In [8]:
def text2words(text):
    import re
    def clean_text(text):
        return re.sub(r'[.;:,!\'"]', ' ', unicode(text).lower())
    return filter(lambda x: x!='', clean_text(text).split(' '))

In [None]:
text2words(text)

In [9]:
def json_review(s):
    import json
    r = json.loads(s.strip())
    return (r['stars'], r['text'])

In [10]:
rtrain_rdd.map(json_review).take(10)

[(3,
  u'Friendly local bar with great service and good food,  won\'t be disappointed if you like bar food at a reasonable price.  Good place to go for "Friday fish sandwich "'),
 (5,
  u'This is our favorite breakfast spots.  Fast friendly service. You get more than what you pay for. Awesome!'),
 (1,
  u"By far the worst most unprofessional dentist I have ever experienced her staff is too busy watching soap operas on all the TVs in each room instead of cleaning and she gave me a filling which was too deep which could have been understandable as her son had his pictures delivered from Hawaii and she had to stop 3 times to pay for them and show her whole staff all of them must be nice to have a 14 year old go to Hawaii on her moms dime and have her patience wait to see all of this happen I had to have oral surgery and have the tooth removed because she put the filling in too deep an abscessed tooth was the result I wouldn't recommend her to an ex girlfriend or someone I hate because of 

In [28]:
##word_train_rdd = rtrain_rdd.flatMap(lambda r: [(r[0], w) for w in text2words(r[1])])
word_train_rdd = rtrain_rdd.map(json_review).flatMap(lambda r: [(r[0], w) for w in text2words(r[1])])

In [12]:
word_train_rdd.take(10) ## .groupByKey().take(10)

[(3, u'friendly'),
 (3, u'local'),
 (3, u'bar'),
 (3, u'with'),
 (3, u'great'),
 (3, u'service'),
 (3, u'and'),
 (3, u'good'),
 (3, u'food'),
 (3, u'won')]

In [None]:
import numpy as np

In [13]:
def stars_one_hot(r):
    import numpy as np
    s = np.zeros(5)
    s[r[0]-1] = 1
    return (r[1], s)

In [15]:
words_train_oh_rdd = word_train_rdd.map(stars_one_hot).take(10)

In [19]:
def sum_one_hot_stars(vs):
    import numpy as np
    n = 0
    sum_s = np.zeros(5)
    for v in vs:
        n += 1
        sum_s += v
    return (sum_s, n)

In [29]:
word_count = word_train_rdd\
    .map(stars_one_hot).groupByKey()\
    .map(lambda (k,vs): (k, sum_one_hot_stars(vs)))

In [30]:
word_count.take(4)

[(u'mardi', (array([ 0.,  0.,  0.,  1.,  1.]), 2)),
 (u')it', (array([ 1.,  0.,  0.,  0.,  0.]), 1)),
 (u'outlining', (array([ 0.,  0.,  0.,  1.,  0.]), 1)),
 (u'kid/family', (array([ 0.,  0.,  0.,  0.,  1.]), 1))]

Compute number of documents per star rating

In [31]:
rtrain_rdd.map(json_review).map(lambda t: (t[0], 1.0)).reduceByKey(lambda a,b: a+b).take(10)

[(1, 2722.0), (2, 1818.0), (3, 2502.0), (4, 5243.0), (5, 9147.0)]

In [35]:
def calc_perc_freq(t):
    import numpy as np
    freq = t[1][0]
    tot = t[1][1]
    freq/float(tot)
    return (t[0], freq/float(tot))

In [34]:
word_count.sortBy(lambda r: r[1][1], ascending=False).take(10)

[(u'the', (array([ 17847.,  13450.,  16996.,  30849.,  38942.]), 118084)),
 (u'and', (array([ 11872.,   7709.,   9748.,  20508.,  32636.]), 82473)),
 (u'i', (array([ 12618.,   7913.,   9304.,  16307.,  24407.]), 70549)),
 (u'a', (array([  8606.,   6251.,   8940.,  17368.,  20886.]), 62051)),
 (u'to', (array([ 11862.,   6562.,   7717.,  12924.,  19455.]), 58520)),
 (u'was', (array([  6971.,   5409.,   6668.,  10388.,  12798.]), 42234)),
 (u'it', (array([  5347.,   4102.,   5544.,   9365.,  11333.]), 35691)),
 (u'of', (array([  5004.,   3788.,   5081.,   9761.,  11955.]), 35589)),
 (u'is', (array([  3145.,   2575.,   3901.,   8142.,  11930.]), 29693)),
 (u'for', (array([ 4453.,  2937.,  3874.,  7323.,  9461.]), 28048))]

In [36]:
word_count.sortBy(lambda r: r[1][1], ascending=False).map(calc_perc_freq).take(10)

[(u'the',
  array([ 0.15113817,  0.11390197,  0.14393144,  0.26124623,  0.32978219])),
 (u'and',
  array([ 0.14395014,  0.09347302,  0.11819626,  0.2486632 ,  0.39571739])),
 (u'i',
  array([ 0.17885441,  0.11216318,  0.13187997,  0.23114431,  0.34595813])),
 (u'a',
  array([ 0.13869237,  0.10073971,  0.14407504,  0.27989879,  0.33659409])),
 (u'to',
  array([ 0.20269993,  0.1121326 ,  0.13186945,  0.22084757,  0.33245044])),
 (u'was',
  array([ 0.16505659,  0.12807217,  0.15788227,  0.24596297,  0.303026  ])),
 (u'it',
  array([ 0.14981368,  0.11493093,  0.15533328,  0.26239108,  0.31753103])),
 (u'of',
  array([ 0.14060524,  0.10643738,  0.14276883,  0.27427014,  0.3359184 ])),
 (u'is',
  array([ 0.10591722,  0.08672078,  0.13137777,  0.27420604,  0.4017782 ])),
 (u'for',
  array([ 0.15876355,  0.10471335,  0.13812037,  0.26108813,  0.3373146 ]))]

In [46]:
def entropy(t):
    import numpy as np
    p = t[1]
    return (t[0], t[1], -np.sum(np.log(p)*p))

In [47]:
word_count.sortBy(lambda r: r[1][1], ascending=False).map(calc_perc_freq).map(entropy).take(10)

[(u'the',
  array([ 0.15113817,  0.11390197,  0.14393144,  0.26124623,  0.32978219]),
  1.5285303036793383),
 (u'and',
  array([ 0.14395014,  0.09347302,  0.11819626,  0.2486632 ,  0.39571739]),
  1.4658583524387434),
 (u'i',
  array([ 0.17885441,  0.11216318,  0.13187997,  0.23114431,  0.34595813]),
  1.526175649047415),
 (u'a',
  array([ 0.13869237,  0.10073971,  0.14407504,  0.27989879,  0.33659409]),
  1.5072521715397478),
 (u'to',
  array([ 0.20269993,  0.1121326 ,  0.13186945,  0.22084757,  0.33245044]),
  1.5356872811173026),
 (u'was',
  array([ 0.16505659,  0.12807217,  0.15788227,  0.24596297,  0.303026  ]),
  1.558763921237214),
 (u'it',
  array([ 0.14981368,  0.11493093,  0.15533328,  0.26239108,  0.31753103]),
  1.5376271691888137),
 (u'of',
  array([ 0.14060524,  0.10643738,  0.14276883,  0.27427014,  0.3359184 ]),
  1.5134400071539653),
 (u'is',
  array([ 0.10591722,  0.08672078,  0.13137777,  0.27420604,  0.4017782 ]),
  1.4376387152249903),
 (u'for',
  array([ 0.1587635

In [48]:
word_freq = word_count.map(calc_perc_freq).map(entropy)

In [53]:
word_freq.filter(lambda x: ~np.isnan(x[2])).sortBy(lambda x: x[2], ascending=True).take(10)

[(u'thorough',
  array([ 0.02020202,  0.01010101,  0.04040404,  0.08080808,  0.84848485]),
  0.59758838304492856),
 (u'talented',
  array([ 0.03389831,  0.03389831,  0.03389831,  0.06779661,  0.83050847]),
  0.68087210263012732),
 (u'gem',
  array([ 0.01449275,  0.00483092,  0.01449275,  0.23188406,  0.73429952]),
  0.71417211971840555),
 (u'caring',
  array([ 0.04054054,  0.04054054,  0.01351351,  0.10810811,  0.7972973 ]),
  0.73917434650302383),
 (u'andrew',
  array([ 0.03846154,  0.03846154,  0.03846154,  0.07692308,  0.80769231]),
  0.74574017058641384),
 (u'chris', array([ 0.01,  0.02,  0.05,  0.15,  0.77]), 0.75989764176251928),
 (u'highly',
  array([ 0.0385439 ,  0.01498929,  0.02141328,  0.16381156,  0.76124197]),
  0.77477704911400291),
 (u'carlos',
  array([ 0.04166667,  0.04166667,  0.08333333,  0.04166667,  0.79166667]),
  0.78927737346118487),
 (u'cousins',
  array([ 0.05263158,  0.05263158,  0.05263158,  0.05263158,  0.78947368]),
  0.80650460987522199),
 (u'mattress',
 

In [54]:
word_freq.filter(lambda x: ~np.isnan(x[2])).sortBy(lambda x: x[2], ascending=False).take(10)

[(u'substitutions', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'thy', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'\nother', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'cushions', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'method', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'stacking', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'(does', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'bitches', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'modest', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005),
 (u'strapped', array([ 0.2,  0.2,  0.2,  0.2,  0.2]), 1.6094379124341005)]

In [38]:
import numpy as np
x = np.array([ 0.10591722,  0.08672078,  0.13137777,  0.27420604,  0.4017782 ])

In [43]:
-np.sum(np.log(x)*x)

1.4376387251647471

In [41]:
np.array([1,0])*np.array([3,7])

array([3, 0])

In [None]:
t = np.zeros(5)
t[2] = 1

In [None]:
s+t

In [None]:
np.sum([s, t])

In [None]:
unicode.lower