In [1]:
# %load pyspark_init.py
"""
Load packages and create context objects...
"""
import os
import platform
import sys
if not 'sc' in vars():
    sys.path.append('/usr/hdp/2.4.2.0-258/spark/python')
    os.environ["SPARK_HOME"] = '/usr/hdp/2.4.2.0-258/spark'
    os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.11:1.2.0 pyspark-shell'
    import py4j
    import pyspark
    from pyspark.context import SparkContext, SparkConf
    from pyspark.sql import SQLContext, HiveContext
    from pyspark.storagelevel import StorageLevel
    sc = SparkContext()
    import atexit
    atexit.register(lambda: sc.stop())
    print("""Welcome to
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version %s
          /_/
    """ % sc.version)
else:
    print("""Already running
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version %s
          /_/
    """ % sc.version)



Welcome to
          ____              __
         / __/__  ___ _____/ /__
        _\ \/ _ \/ _ `/ __/  '_/
       /__ / .__/\_,_/_/ /_/\_\   version 1.6.1
          /_/
    


In [2]:
review_rdd = sc.textFile('/user/pmolnar/yelp/data/review').sample(False, 0.01, 17)

In [3]:
review_rdd.first()

u'{"votes": {"funny": 0, "useful": 1, "cool": 0}, "user_id": "t6OkrXgpcAZfJa2om9QO4A", "review_id": "QSu0l7koHMlTIhWbiiKMxg", "stars": 3, "date": "2015-07-02", "text": "Friendly local bar with great service and good food,  won\'t be disappointed if you like bar food at a reasonable price.  Good place to go for \\"Friday fish sandwich \\"", "type": "review", "business_id": "KayYbHCt-RkbGcPdGOThNg"}'

In [4]:
rtrain_rdd, rtest_rdd = review_rdd.randomSplit([0.8, 0.2])

In [5]:
rtrain_rdd.count()

19918

In [1]:
text =  "Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road."

In [2]:
def text2words(text):
    import re
    def clean_text(text):
        return re.sub(r'[.;:,\'"]', ' ', unicode(text).lower())
    return filter(lambda x: x!='', clean_text(text).split(' '))

In [3]:
text2words(text)

[u'mr',
 u'hoagie',
 u'is',
 u'an',
 u'institution',
 u'walking',
 u'in',
 u'it',
 u'does',
 u'seem',
 u'like',
 u'a',
 u'throwback',
 u'to',
 u'30',
 u'years',
 u'ago',
 u'old',
 u'fashioned',
 u'menu',
 u'board',
 u'booths',
 u'out',
 u'of',
 u'the',
 u'70s',
 u'and',
 u'a',
 u'large',
 u'selection',
 u'of',
 u'food',
 u'their',
 u'speciality',
 u'is',
 u'the',
 u'italian',
 u'hoagie',
 u'and',
 u'it',
 u'is',
 u'voted',
 u'the',
 u'best',
 u'in',
 u'the',
 u'area',
 u'year',
 u'after',
 u'year',
 u'i',
 u'usually',
 u'order',
 u'the',
 u'burger',
 u'while',
 u'the',
 u'patties',
 u'are',
 u'obviously',
 u'cooked',
 u'from',
 u'frozen',
 u'all',
 u'of',
 u'the',
 u'other',
 u'ingredients',
 u'are',
 u'very',
 u'fresh',
 u'overall',
 u'its',
 u'a',
 u'good',
 u'alternative',
 u'to',
 u'subway',
 u'which',
 u'is',
 u'down',
 u'the',
 u'road']

In [7]:
def json_review(s):
    import json
    r = json.loads(s.strip())
    return (r['stars'], r['text'])

In [35]:
rtrain_rdd.map(json_review).take(10)

[(4,
  u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'),
 (5,
  u"Excellent food. Superb customer service. I miss the mario machines they used to have, but it's still a great place steeped in tradition."),
 (5,
  u'Yes this place is a little out dated and not opened on the weekend. But other than that the staff is always pleasant and fast to make your order. Which is always spot on fresh veggies on their hoggies and other food. They also have daily specials and ice cream which is really good. I had a banana split they piled the toppings on. They win pennysaver awards ever years i see 

In [8]:
##word_train_rdd = rtrain_rdd.flatMap(lambda r: [(r[0], w) for w in text2words(r[1])])
word_train_rdd = rtrain_rdd.map(json_review).flatMap(lambda r: [(r[0], w) for w in text2words(r[1])])

In [9]:
word_train_rdd.take(10) ## .groupByKey().take(10)

[(3, u'friendly'),
 (3, u'local'),
 (3, u'bar'),
 (3, u'with'),
 (3, u'great'),
 (3, u'service'),
 (3, u'and'),
 (3, u'good'),
 (3, u'food'),
 (3, u'won')]

In [14]:
import numpy as np

In [19]:
def stars_one_hot(r):
    import numpy as np
    s = np.zeros(5)
    s[r[0]-1] = 1
    return (r[1], s)

In [22]:
words_train_oh_rdd = word_train_rdd.map(stars_one_hot).take(10)

In [53]:
def sum_one_hot_stars(vs):
    n = 0
    sum_s = np.zeros(5)
    for v in vs:
        n += 1
        sum_s += v
    return (sum_s, n)

In [56]:
word_count = word_train_rdd\
    .map(stars_one_hot).groupByKey()\
    .map(lambda (k,vs): (k, sum_one_hot_stars(vs)))

In [57]:
word_count.take(4)

[(u'gag', (array([ 5.,  1.,  0.,  2.,  0.]), 8)),
 (u'deal!!', (array([ 0.,  0.,  0.,  1.,  0.]), 1)),
 (u'francesca', (array([ 0.,  0.,  0.,  0.,  1.]), 1)),
 (u'fingernails', (array([ 3.,  1.,  0.,  1.,  2.]), 7))]

In [52]:
sum_s = np.zeros(5)
for p in res[1]:
    sum_s += p
print sum_s

[ 5.  1.  0.  2.  0.]


In [70]:
rtrain_rdd.map(json_review).map(lambda t: (t[0], 1.0)).reduceByKey(lambda a,b: a+b).take(10)

[(1, 2580.0), (2, 1674.0), (3, 2350.0), (4, 4890.0), (5, 8424.0)]

In [48]:
word_count.take(4)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 20.0 failed 1 times, most recent failure: Lost task 0.0 in stage 20.0 (TID 130, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/hdp/2.4.2.0-258/spark/python/pyspark/rdd.py", line 1293, in takeUpToNumLeft
    yield next(iterator)
  File "<ipython-input-47-ff12827d2e6d>", line 1, in <lambda>
NameError: global name 'a' is not defined

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:313)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:277)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1855)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1881)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:393)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/hdp/2.4.2.0-258/spark/python/pyspark/rdd.py", line 1293, in takeUpToNumLeft
    yield next(iterator)
  File "<ipython-input-47-ff12827d2e6d>", line 1, in <lambda>
NameError: global name 'a' is not defined

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:313)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:277)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [33]:
t = np.zeros(5)
t[2] = 1

In [34]:
s+t

array([ 0.,  0.,  1.,  1.,  0.])

In [39]:
np.sum([s, t])

2.0

In [None]:
unicode.lower