In [4]:
import pyspark
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import hour, when, col, date_format, to_timestamp, create_map, lit
from pyspark.sql.functions import *
from pyspark.sql.types import StructType
from itertools import chain
from faker import Faker
import os


In [5]:
import findspark
findspark.find()

ValueError: Couldn't find Spark, make sure SPARK_HOME env is set or Spark is in an expected location (e.g. from homebrew installation).

In [6]:
memory = '4g'
pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [7]:
sc = pyspark.SparkContext(appName="temp")
sqlContext = SQLContext(sc)

sp = SparkSession.builder.config("spark.driver.memory", "15g").appName('ReviewsSpark').getOrCreate()

#### Function to Load Data

In [8]:
def load_business_data():
    path = "archive/yelp_academic_dataset_business.json"
    df = sqlContext.read.json(path, multiLine=False)
    return df

def load_review_data():
    path = "archive/yelp_filtnosamp_gt5.csv"
    df = sp.read.option("header",True).csv(path)
    return df

In [9]:
df = load_business_data()

df2 = load_review_data()

#### Show Data and Schema

In [10]:
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|          city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|        921 Pearl St|{null, null, 'bee...|6iYb2HFDywm3zjuRg...|Gastropubs, Food,...|       Boulder|{11:0-23:0, 11:0-...|      1|   40.0175444|   -105.2833481| Oskar Blues Taproom|      80302|          86|  4.0|   CO|
| 7000 NE Airport Way|{null, null, u'be...|tCbdrRPZA0oiIYSmH...|Salad, Soup, Sand...|      Portland|{5:0-18:0, 5:0-18...|      1

In [11]:
df.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [None]:
df2.show()

In [14]:
df2.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- userID: string (nullable = true)
 |-- itemID: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- name: string (nullable = true)



# Business Data Manipulation

#### (TODO) Flattens the Structs so that all the Nested Columns are Normal Columns

In [None]:
##To be done
def flatten_df(df, prefix=""):
    return df

In [None]:
# df = df.select(flatten_df(df))
# df.show()

#### Clean up the Data and Keep Required Rows

In [None]:
def clean_data(df):
    '''
    input: df a dataframe
    output: df a dataframe with the all the original columns
    '''
    
    # START YOUR CODE HERE ---------
    df = df.select(["name", "latitude", "longitude", "city", "review_count", "stars", "attributes", "categories"])
    

    # END YOUR CODE HERE -----------
    return df



In [None]:
df = clean_data(df)
df.show()

#### Filter Entries with Empty Ratings Field

Since with this, we see that there are no restaraunts with null ratings, we know that we do not need to filter out any rows from this dataframe.

In [None]:
df.filter(df.stars.isNull()).show()

#### Filter the Data so Only Data from Atlanta is contained (temporarily)

In [None]:
df = df.filter(df.city == "Atlanta")
df.show()

# Reviews Data Manipulation

In [12]:
fakeGen = Faker() 
users = df2.select("userID").distinct().coalesce(1)
fakeNameMap = {}
for user in users.rdd.toLocalIterator():
    fakeNameMap[user["userID"]] = fakeGen.name()
    
mapping_expr = create_map([lit(x) for x in chain(*fakeNameMap.items())])

df2 = df2.withColumn("name", mapping_expr.getItem(col("userID")))


In [13]:
df2.toPandas().to_csv('yelp_filtnosamp_gt5_withname.csv')

In [None]:
df2.show()

In [16]:
business_df = df2.select("itemID").distinct().coalesce(1)
business_df.show()

+--------------------+
|              itemID|
+--------------------+
|66o2Fw42ZhGSuX1VF...|
|oLJWjd5VAkMbLU2e3...|
|_jw0beMekkOuCEGRx...|
|4QKuGnvjiPta_kk7J...|
|fSsdhoCC3FsXrSAAQ...|
|OuBUzqGj02xunvlIs...|
|bxy3khT-2R66tcdKj...|
|QAX9PI0-cAJN6x7rr...|
|6a8EOxICJtgzHViVm...|
|3ZVgig7uux9jVtEZn...|
|usrqG3sAANrQPvaHl...|
|Nqy2tJV3AGqW9Uil-...|
|cyvpFpmpN0YgDykuO...|
|Agq4zoNLSIpT1_ZJb...|
|czsrWGmQRDwP0tBid...|
|FbZLY5XASP9phBySt...|
|4SRTmovGJLmUgsfL8...|
|3gvHGMSHo4D8eXXSJ...|
|DZXp8m38R0s9U3Saj...|
|W4h9Tckj5WFJk1ve8...|
+--------------------+
only showing top 20 rows



In [None]:
inner_join = business_df.join(df, business_df.itemID == df.business_id)

In [20]:
inner_join.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|              itemID|             address|          attributes|         business_id|          categories|        city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|-36nnCT71XE0InJXK...|   2615 NE 112th Ave|{null, null, null...|-36nnCT71XE0InJXK...|Shopping, Drugsto...|   Vancouver|{6:0-0:0, 6:0-0:0...|      1|   45.6412317|   -122.5571137|             Safeway|      98684|          28|  2.0|   WA|
|-QOl03c2B22yi_On0...|    1860 Winderly Ln|{null, nu

AttributeError: 'DataFrame' object has no attribute 'print_schema'

In [21]:
inner_join.printSchema()

root
 |-- itemID: string (nullable = true)
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable

In [24]:
select_a = ["business_id", "postal_code", "longitude", "latitude", "name", "state", "stars", "address", "city"]
business_df = inner_join.select(select_a)
business_df.show()

+--------------------+-----------+---------------+-------------+--------------------+-----+-----+--------------------+------------+
|         business_id|postal_code|      longitude|     latitude|                name|state|stars|             address|        city|
+--------------------+-----------+---------------+-------------+--------------------+-----+-----+--------------------+------------+
|-36nnCT71XE0InJXK...|      98684|   -122.5571137|   45.6412317|             Safeway|   WA|  2.0|   2615 NE 112th Ave|   Vancouver|
|-QOl03c2B22yi_On0...|      43147|     -82.790437|    39.932087|Cracker Barrel Ol...|   OH|  3.0|    1860 Winderly Ln|Pickerington|
|-VVUUPK0ytYjpJ_S7...|    V5T 1J6|-123.1037478894|49.2665400102| Peaceful Restaurant|   BC|  3.0|     43 E 5th Avenue|   Vancouver|
|-ZzsPlaAgwO3yt29u...|      78708|    -97.7051165|   30.3911673|      ATX Architects|   TX|  5.0|                    |      Austin|
|-gdR559hH89jagbHz...|      02130|    -71.1065318|   42.3104844|    Evergree

In [25]:
business_df.toPandas().to_csv('business_list.csv')