In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import when

spark = SparkSession.builder.appName('Yelp Businesses EDA').getOrCreate()
sc = spark.sparkContext 

In [11]:
# read data and see schema 
df = spark.read.json('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/yelp_academic_dataset_business.json') 
parquet_file_bus='gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/business.snappy.parquet'
parquet_file_review='gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/review.snappy.parquet'
parquet_file_user='gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/user.snappy.parquet'
df1=spark.read.option('multiline','true').option("quote", "\"").option('escape','\"').option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true').parquet(parquet_file)
df_review=spark.read.option('multiline','true').option("quote", "\"").option('escape','\"').option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true').parquet(parquet_file_review)
df_user=spark.read.option('multiline','true').option("quote", "\"").option('escape','\"').option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true').parquet(parquet_file_user)

df.printSchema() 

                                                                                

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [12]:
df_user.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- yelping_since: string (nullable = true)



In [4]:
df1.show(3, truncate = False) 

[Stage 2:>                                                          (0 + 1) / 1]

+-------------------+----------------------+----------------------------------------------------------------------------+------------+-------+----------+-----------+------------------+-----------+------------+-----+-----+----------------+-----------+-------+--------+----+-----------+----------+-----------+----------------------+--------------------------+------------------------------------------------------------------------------------+-----------------+------+---------+-------+-------------------+-----------+---------+--------------+-----------+-----------+-----------------+---------+-----+-----+----------+-----------+--------------+-----------------+-------------------------+-------------------+------------------------+----------------------+-----------------------+-----------------------+------------------+-------+--------------------+-------+--------+--------+--------+--------+--------+--------+---------+
|address            |business_id           |categories                     

                                                                                

In [5]:
from pyspark.sql import functions as F 

# determine number of partitions 
def displayPartitions(df1): 
    num = df1.rdd.getNumPartitions() 
    df1.withColumn('partitionId', F.spark_partition_id()).groupby('partitionId').count().orderBy(F.asc('count')).show(num)  
    
print('Default parallelism:', sc.defaultParallelism)
print('Number of partitions:', df1.rdd.getNumPartitions(), '\n') 

df1 = df1.repartition((sc.defaultParallelism * 2)) 
displayPartitions(df1)

Default parallelism: 4
Number of partitions: 1 





+-----------+-----+
|partitionId|count|
+-----------+-----+
|          3| 4373|
|          5| 4373|
|          4| 4373|
|          1| 4373|
|          2| 4373|
|          6| 4374|
|          0| 4374|
|          7| 4374|
+-----------+-----+



                                                                                

In [6]:
from pyspark.sql import functions as F

# Unwrap nested struct columns
df = df.withColumn('AcceptsInsurance', F.col('attributes.AcceptsInsurance'))
df = df.withColumn('AgesAllowed', F.col('attributes.AgesAllowed'))
df = df.withColumn('Alcohol', F.col('attributes.Alcohol'))
df = df.withColumn('Ambience', F.col('attributes.Ambience'))
df = df.withColumn('BYOB', F.col('attributes.BYOB'))
df = df.withColumn('BYOBCorkage', F.col('attributes.BYOBCorkage'))
df = df.withColumn('BestNights', F.col('attributes.BestNights'))
df = df.withColumn('BikeParking', F.col('attributes.BikeParking'))
df = df.withColumn('BusinessAcceptsBitcoin', F.col('attributes.BusinessAcceptsBitcoin'))
df = df.withColumn('BusinessAcceptsCreditCards', F.col('attributes.BusinessAcceptsCreditCards'))
df = df.withColumn('BusinessParking', F.col('attributes.BusinessParking'))
df = df.withColumn('ByAppointmentOnly', F.col('attributes.ByAppointmentOnly'))
df = df.withColumn('Caters', F.col('attributes.Caters'))
df = df.withColumn('CoatCheck', F.col('attributes.CoatCheck'))
df = df.withColumn('Corkage', F.col('attributes.Corkage'))
df = df.withColumn('DietaryRestrictions', F.col('attributes.DietaryRestrictions'))
df = df.withColumn('DogsAllowed', F.col('attributes.DogsAllowed'))
df = df.withColumn('DriveThru', F.col('attributes.DriveThru'))
df = df.withColumn('GoodForDancing', F.col('attributes.GoodForDancing'))
df = df.withColumn('GoodForKids', F.col('attributes.GoodForKids'))
df = df.withColumn('GoodForMeal', F.col('attributes.GoodForMeal'))
df = df.withColumn('HairSpecializesIn', F.col('attributes.HairSpecializesIn'))
df = df.withColumn('HappyHour', F.col('attributes.HappyHour'))
df = df.withColumn('HasTV', F.col('attributes.HasTV'))
df = df.withColumn('Music', F.col('attributes.Music'))
df = df.withColumn('NoiseLevel', F.col('attributes.NoiseLevel'))
df = df.withColumn('Open24Hours', F.col('attributes.Open24Hours'))
df = df.withColumn('OutdoorSeating', F.col('attributes.OutdoorSeating'))
df = df.withColumn('RestaurantsAttire', F.col('attributes.RestaurantsAttire'))
df = df.withColumn('RestaurantsCounterService', F.col('attributes.RestaurantsCounterService'))
df = df.withColumn('RestaurantsDelivery', F.col('attributes.RestaurantsDelivery'))
df = df.withColumn('RestaurantsGoodForGroups', F.col('attributes.RestaurantsGoodForGroups'))
df = df.withColumn('RestaurantsPriceRange2', F.col('attributes.RestaurantsPriceRange2'))
df = df.withColumn('RestaurantsReservations', F.col('attributes.RestaurantsReservations'))
df = df.withColumn('RestaurantsTableService', F.col('attributes.RestaurantsTableService'))
df = df.withColumn('RestaurantsTakeOut', F.col('attributes.RestaurantsTakeOut'))
df = df.withColumn('Smoking', F.col('attributes.Smoking'))
df = df.withColumn('WheelchairAccessible', F.col('attributes.WheelchairAccessible'))
df = df.withColumn('WiFi', F.col('attributes.WiFi'))

df =df.drop('attributes', 'hours') 


In [7]:
df.printSchema()

root
 |-- address: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- city: string (nullable = true)
 |-- is_open: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- AcceptsInsurance: string (nullable = true)
 |-- AgesAllowed: string (nullable = true)
 |-- Alcohol: string (nullable = true)
 |-- Ambience: string (nullable = true)
 |-- BYOB: string (nullable = true)
 |-- BYOBCorkage: string (nullable = true)
 |-- BestNights: string (nullable = true)
 |-- BikeParking: string (nullable = true)
 |-- BusinessAcceptsBitcoin: string (nullable = true)
 |-- BusinessAcceptsCreditCards: string (nullable = true)
 |-- BusinessParking: string (nullable = true)
 |-- ByAppointmentOnly: strin

In [8]:
# # unwrap nested struct columns 

# df1 = df1.withColumn('AcceptsInsurance', F.col('attributes.AcceptsInsurance')) 
# df1 = df1.withColumn('AgesAllowed', F.col('attributes.AgesAllowed')) 
# df1 = df1.withColumn('Alcohol', F.col('attributes.Alcohol')) 
# df1 = df1.withColumn('Ambience', F.col('attributes.Ambience')) 
# df1 = df1.withColumn('BYOB', F.col('attributes.BYOB')) 
# df1 = df1.withColumn('BYOBCorkage', F.col('attributes.BYOBCorkage')) 
# df1 = df1.withColumn('BestNights', F.col('attributes.BestNights')) 
# df1 = df1.withColumn('BikeParking', F.col('attributes.BikeParking')) 
# df1 = df1.withColumn('BusinessAcceptsBitcoin', F.col('attributes.BusinessAcceptsBitcoin')) 
# df1 = df1.withColumn('BusinessAcceptsCreditCards', F.col('attributes.BusinessAcceptsCreditCards')) 
# df1 = df1.withColumn('BusinessParking', F.col('attributes.BusinessParking')) 
# df1 = df1.withColumn('ByAppointmentOnly', F.col('attributes.ByAppointmentOnly')) 
# df1 = df1.withColumn('Caters', F.col('attributes.Caters')) 
# df1 = df1.withColumn('CoatCheck', F.col('attributes.CoatCheck')) 
# df1 = df1.withColumn('Corkage', F.col('attributes.Corkage')) 
# df1 = df1.withColumn('DietaryRestrictions', F.col('attributes.DietaryRestrictions')) 
# df1 = df1.withColumn('DogsAllowed', F.col('attributes.DogsAllowed')) 
# df1 = df1.withColumn('DriveThru', F.col('attributes.DriveThru')) 
# df1 = df1.withColumn('Goodf1orDancing', F.col('attributes.Goodf1orDancing')) 
# df1 = df1.withColumn('Goodf1orKids', F.col('attributes.Goodf1orKids')) 
# df1 = df1.withColumn('Goodf1orMeal', F.col('attributes.Goodf1orMeal')) 
# df1 = df1.withColumn('HairSpecializesIn', F.col('attributes.HairSpecializesIn')) 
# df1 = df1.withColumn('HappyHour', F.col('attributes.HappyHour')) 
# df1 = df1.withColumn('HasTV', F.col('attributes.HasTV')) 
# df1 = df1.withColumn('Music', F.col('attributes.Music')) 
# df1 = df1.withColumn('NoiseLevel', F.col('attributes.NoiseLevel')) 
# df1 = df1.withColumn('Open24Hours', F.col('attributes.Open24Hours')) 
# df1 = df1.withColumn('OutdoorSeating', F.col('attributes.OutdoorSeating')) 
# df1 = df1.withColumn('RestaurantsAttire', F.col('attributes.RestaurantsAttire')) 
# df1 = df1.withColumn('RestaurantsCounterService', F.col('attributes.RestaurantsCounterService')) 
# df1 = df1.withColumn('RestaurantsDelivery', F.col('attributes.RestaurantsDelivery')) 
# df1 = df1.withColumn('RestaurantsGoodf1orGroups', F.col('attributes.RestaurantsGoodf1orGroups')) 
# df1 = df1.withColumn('RestaurantsPriceRange2', F.col('attributes.RestaurantsPriceRange2')) 
# df1 = df1.withColumn('RestaurantsReservations', F.col('attributes.RestaurantsReservations')) 
# df1 = df1.withColumn('RestaurantsTableService', F.col('attributes.RestaurantsTableService')) 
# df1 = df1.withColumn('RestaurantsTakeOut', F.col('attributes.RestaurantsTakeOut')) 
# df1 = df1.withColumn('Smoking', F.col('attributes.Smoking')) 
# df1 = df1.withColumn('WheelchairAccessible', F.col('attributes.WheelchairAccessible')) 
# df1 = df1.withColumn('WiFi', F.col('attributes.WiFi')) 

# df1 = df1.withColumn('Friday', F.col('hours.Friday')) 
# df1 = df1.withColumn('Monday', F.col('hours.Monday')) 
# df1 = df1.withColumn('Saturday', F.col('hours.Saturday')) 
# df1 = df1.withColumn('Sunday', F.col('hours.Sunday')) 
# df1 = df1.withColumn('Thursday', F.col('hours.Thursday')) 
# df1 = df1.withColumn('Tuesday', F.col('hours.Tuesday')) 
# df1 = df1.withColumn('Wednesday', F.col('hours.Wednesday')) 

# df1 = df1.drop('attributes', 'hours') 

In [11]:
from pyspark.sql import functions as F

# Unwrap nested struct columns
df1 = df1.withColumn('AcceptsInsurance', F.col('attributes.AcceptsInsurance'))
df1 = df1.withColumn('AgesAllowed', F.col('attributes.AgesAllowed'))
df1 = df1.withColumn('Alcohol', F.col('attributes.Alcohol'))
df1 = df1.withColumn('Ambience', F.col('attributes.Ambience'))
df1 = df1.withColumn('BYOB', F.col('attributes.BYOB'))
df1 = df1.withColumn('BYOBCorkage', F.col('attributes.BYOBCorkage'))
df1 = df1.withColumn('BestNights', F.col('attributes.BestNights'))
df1 = df1.withColumn('BikeParking', F.col('attributes.BikeParking'))
df1 = df1.withColumn('BusinessAcceptsBitcoin', F.col('attributes.BusinessAcceptsBitcoin'))
df1 = df1.withColumn('BusinessAcceptsCreditCards', F.col('attributes.BusinessAcceptsCreditCards'))
df1 = df1.withColumn('BusinessParking', F.col('attributes.BusinessParking'))
df1 = df1.withColumn('ByAppointmentOnly', F.col('attributes.ByAppointmentOnly'))
df1 = df1.withColumn('Caters', F.col('attributes.Caters'))
df1 = df1.withColumn('CoatCheck', F.col('attributes.CoatCheck'))
df1 = df1.withColumn('Corkage', F.col('attributes.Corkage'))
df1 = df1.withColumn('DietaryRestrictions', F.col('attributes.DietaryRestrictions'))
df1 = df1.withColumn('DogsAllowed', F.col('attributes.DogsAllowed'))
df1 = df1.withColumn('DriveThru', F.col('attributes.DriveThru'))
df1 = df1.withColumn('GoodForDancing', F.col('attributes.GoodForDancing'))
df1 = df1.withColumn('GoodForKids', F.col('attributes.GoodForKids'))
df1 = df1.withColumn('GoodForMeal', F.col('attributes.GoodForMeal'))
df1 = df1.withColumn('HairSpecializesIn', F.col('attributes.HairSpecializesIn'))
df1 = df1.withColumn('HappyHour', F.col('attributes.HappyHour'))
df1 = df1.withColumn('HasTV', F.col('attributes.HasTV'))
df1 = df1.withColumn('Music', F.col('attributes.Music'))
df1 = df1.withColumn('NoiseLevel', F.col('attributes.NoiseLevel'))
df1 = df1.withColumn('Open24Hours', F.col('attributes.Open24Hours'))
df1 = df1.withColumn('OutdoorSeating', F.col('attributes.OutdoorSeating'))
df1 = df1.withColumn('RestaurantsAttire', F.col('attributes.RestaurantsAttire'))
df1 = df1.withColumn('RestaurantsCounterService', F.col('attributes.RestaurantsCounterService'))
df1 = df1.withColumn('RestaurantsDelivery', F.col('attributes.RestaurantsDelivery'))
df1 = df1.withColumn('RestaurantsGoodForGroups', F.col('attributes.RestaurantsGoodForGroups'))
df1 = df1.withColumn('RestaurantsPriceRange2', F.col('attributes.RestaurantsPriceRange2'))
df1 = df1.withColumn('RestaurantsReservations', F.col('attributes.RestaurantsReservations'))
df1 = df1.withColumn('RestaurantsTableService', F.col('attributes.RestaurantsTableService'))
df1 = df1.drop('attributes', 'hours') 


AnalysisException: cannot resolve '`attributes.AcceptsInsurance`' given input columns: [AcceptsInsurance, AgesAllowed, Alcohol, Ambience, BYOB, BYOBCorkage, BestNights, BikeParking, BusinessAcceptsBitcoin, BusinessAcceptsCreditCards, BusinessParking, ByAppointmentOnly, Caters, CoatCheck, Corkage, DietaryRestrictions, DogsAllowed, DriveThru, Friday, GoodForDancing, GoodForKids, GoodForMeal, HairSpecializesIn, HappyHour, HasTV, Monday, Music, NoiseLevel, Open24Hours, OutdoorSeating, RestaurantsAttire, RestaurantsCounterService, RestaurantsDelivery, RestaurantsGoodForGroups, RestaurantsPriceRange2, RestaurantsReservations, RestaurantsTableService, RestaurantsTakeOut, Saturday, Smoking, Sunday, Thursday, Tuesday, Wednesday, WheelchairAccessible, WiFi, address, business_id, categories, city, is_open, latitude, longitude, name, postal_code, review_count, stars, state];
'Project [address#186, business_id#187, categories#188, city#189, is_open#190L, latitude#191, longitude#192, name#193, postal_code#194, review_count#195L, stars#196, state#197, 'attributes.AcceptsInsurance AS AcceptsInsurance#2124, AgesAllowed#199, Alcohol#200, Ambience#201, BYOB#202, BYOBCorkage#203, BestNights#204, BikeParking#205, BusinessAcceptsBitcoin#206, BusinessAcceptsCreditCards#207, BusinessParking#208, ByAppointmentOnly#209, ... 34 more fields]
+- Repartition 8, true
   +- Relation[address#186,business_id#187,categories#188,city#189,is_open#190L,latitude#191,longitude#192,name#193,postal_code#194,review_count#195L,stars#196,state#197,AcceptsInsurance#198,AgesAllowed#199,Alcohol#200,Ambience#201,BYOB#202,BYOBCorkage#203,BestNights#204,BikeParking#205,BusinessAcceptsBitcoin#206,BusinessAcceptsCreditCards#207,BusinessParking#208,ByAppointmentOnly#209,... 34 more fields] parquet


In [None]:
df1.printSchema()

In [None]:
from pyspark.ml.feature import StringIndexer

# Select the categorical columns for string indexing
categorical_columns = ["address", "business_id", "categories", "city", "name", "postal_code", "state",
                       "AcceptsInsurance", "AgesAllowed", "Alcohol", "Ambience", "BYOB", "BYOBCorkage",
                       "BestNights", "BikeParking", "BusinessAcceptsBitcoin", "BusinessAcceptsCreditCards",
                       "BusinessParking", "ByAppointmentOnly", "Caters", "CoatCheck", "Corkage",
                       "DietaryRestrictions", "DogsAllowed", "DriveThru", "GoodForDancing", "GoodForKids",
                       "GoodForMeal", "HairSpecializesIn", "HappyHour", "HasTV", "Music", "NoiseLevel",
                       "Open24Hours", "OutdoorSeating", "RestaurantsAttire", "RestaurantsCounterService",
                       "RestaurantsDelivery", "RestaurantsGoodForGroups", "RestaurantsPriceRange2",
                       "RestaurantsReservations", "RestaurantsTableService"]

# Create a StringIndexer for each categorical column
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in categorical_columns]

# Fit the StringIndexers to the data
indexer_models = [indexer.fit(df) for indexer in indexers]

# Apply string indexing to the data
df_indexed = df
for indexer_model in indexer_models:
    df_indexed = indexer_model.transform(df_indexed)

# Select the indexed columns for correlation analysis
indexed_columns = [col+"_index" for col in categorical_columns]

# Calculate the correlation between indexed columns and the "stars" column
correlation_matrix = df_indexed.stat.corr("stars", *indexed_columns)


In [None]:
correlation_matrix = {}

# Calculate the correlation between each indexed column and the "stars" column
for col in indexed_columns:
    correlation = df_indexed.stat.corr("stars", col)
    correlation_matrix[col] = correlation


In [33]:
print(indexed_columns)

['address_index', 'business_id_index', 'categories_index', 'city_index', 'name_index', 'postal_code_index', 'state_index', 'AcceptsInsurance_index', 'AgesAllowed_index', 'Alcohol_index', 'Ambience_index', 'BYOB_index', 'BYOBCorkage_index', 'BestNights_index', 'BikeParking_index', 'BusinessAcceptsBitcoin_index', 'BusinessAcceptsCreditCards_index', 'BusinessParking_index', 'ByAppointmentOnly_index', 'Caters_index', 'CoatCheck_index', 'Corkage_index', 'DietaryRestrictions_index', 'DogsAllowed_index', 'DriveThru_index', 'GoodForDancing_index', 'GoodForKids_index', 'GoodForMeal_index', 'HairSpecializesIn_index', 'HappyHour_index', 'HasTV_index', 'Music_index', 'NoiseLevel_index', 'Open24Hours_index', 'OutdoorSeating_index', 'RestaurantsAttire_index', 'RestaurantsCounterService_index', 'RestaurantsDelivery_index', 'RestaurantsGoodForGroups_index', 'RestaurantsPriceRange2_index', 'RestaurantsReservations_index', 'RestaurantsTableService_index']


In [11]:
df1.show(3)

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+-----------+-------+------------+--------------+--------------------+-----------+------------+-----+-----+----------------+-----------+----------+--------------------+----+-----------+--------------------+-----------+----------------------+--------------------------+--------------------+-----------------+------+---------+-------+-------------------+-----------+---------+--------------+-----------+--------------------+-----------------+---------+-----+--------------------+----------+-----------+--------------+-----------------+-------------------------+-------------------+------------------------+----------------------+-----------------------+-----------------------+
|             address|         business_id|          categories|       city|is_open|    latitude|     longitude|                name|postal_code|review_count|stars|state|AcceptsInsurance|AgesAllowed|   Alcohol|            Ambience|BYOB|BYOBCorkage|          Bes

                                                                                

In [26]:
# Select only the relevant columns
business_df1 = df1.select("business_id", "categories", "city", "stars", "review_count", "RestaurantsPriceRange2","Ambience").where(df1['is_open']==1)

# Remove duplicates and missing values
business_df1 = business_df1.dropDuplicates(["business_id"]).na.drop()

# Remove outliers
business_df1 = business_df1.filter("review_count > 5 and review_count < 1000 and stars > 1 and stars < 5 and RestaurantsPriceRange2 is not null")

In [27]:
business_df1.show(9)

[Stage 179:>                                                        (0 + 1) / 1]

+--------------------+--------------------+-------------+-----+------------+----------------------+--------------------+
|         business_id|          categories|         city|stars|review_count|RestaurantsPriceRange2|            Ambience|
+--------------------+--------------------+-------------+-----+------------+----------------------+--------------------+
|-0iIxySkp97WNlwK6...|Caterers, Sandwic...|         Reno|  3.5|         219|                     1|{'touristy': Fals...|
|-ZzCVD9Ge7KCy4ffh...|Restaurants, Food...|     Glenside|  4.0|          29|                     1|{'touristy': Fals...|
|-ajaASaDA_77I6pK3...|Mexican, Restaura...| Mount Laurel|  2.0|         101|                     1|{'romantic': Fals...|
|0wZJkj-OnZ7Pmubls...|Italian, French, ...|  New Orleans|  4.0|         506|                     1|{'touristy': Fals...|
|10KnzbTaz-Yq8wADX...|Nightlife, Arts &...|    Nashville|  4.0|          76|                     2|{'touristy': Fals...|
|1B59ZyvK_n4E1egxq...|American (

                                                                                

In [14]:
business_df1.select('categories').show()

+--------------------+
|          categories|
+--------------------+
|Caterers, Sandwic...|
|Restaurants, Food...|
|Mexican, Restaura...|
|Italian, French, ...|
|Nightlife, Arts &...|
|American (Traditi...|
|Restaurants, Amer...|
|Pizza, Nightlife,...|
|Southern, Restaur...|
|Caribbean, Restau...|
|Restaurants, Stea...|
|Restaurants, Seafood|
|Restaurants, Chin...|
|Event Planning & ...|
|Fast Food, Sandwi...|
|Restaurants, Amer...|
|Nightlife, Event ...|
|Cajun/Creole, Res...|
|Pizza, Restaurant...|
|Restaurants, Fast...|
+--------------------+
only showing top 20 rows



In [15]:
# from pyspark.sql.functions import udf
# from pyspark.sql.types import IntegerType, FloatType
# from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# # Create a feature engineering pipeline
# category_count = udf(lambda x: len(x.split(",")), IntegerType())
# category_avg_length = udf(lambda x: sum([len(c.strip()) for c in x.split(",")]) / len(x.split(",")), FloatType())

# indexer = StringIndexer(inputCol="RestaurantsPriceRange2", outputCol="PriceRangeIndex")
# encoder = OneHotEncoder(inputCol="PriceRangeIndex", outputCol="PriceRangeVec")

# assembler = VectorAssembler(inputCols=["category_count", "category_avg_length", "PriceRangeVec", "review_count"], outputCol="features")

# # Fit the feature engineering pipeline to the data
# business_df1 = business_df1.withColumn("category_count", category_count("categories"))
# business_df1 = business_df1.withColumn("category_avg_length", category_avg_length("categories"))

# business_df1 = indexer.fit(business_df1).transform(business_df1)
# business_df1 = encoder.fit(business_df1).transform(business_df1)
# business_df1 = assembler.transform(business_df1)


In [16]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Create StringIndexer for the RestaurantsPriceRange2 column
price_range_indexer = StringIndexer(inputCol="RestaurantsPriceRange2", outputCol="PriceRangeIndex")

# Fit the indexer to the data
price_range_indexer_model = price_range_indexer.fit(business_df1)
business_df1 = price_range_indexer_model.transform(business_df1)

# Apply OneHotEncoder to the PriceRangeIndex column
price_range_encoder = OneHotEncoder(inputCols=["PriceRangeIndex"], outputCols=["PriceRangeVec"])
price_range_encoder_model = price_range_encoder.fit(business_df1)
business_df1 = price_range_encoder_model.transform(business_df1)

# Create StringIndexer for the categories column
category_indexer = StringIndexer(inputCol="categories", outputCol="category_index")

# Fit the indexer to the data
category_indexer_model = category_indexer.fit(business_df1)
business_df1 = category_indexer_model.transform(business_df1)

# Apply OneHotEncoder to the category_index column
category_encoder = OneHotEncoder(inputCols=["category_index"], outputCols=["category_vec"])
category_encoder_model = category_encoder.fit(business_df1)
business_df1 = category_encoder_model.transform(business_df1)

# Define the feature columns for VectorAssembler
feature_columns = ["PriceRangeVec", "review_count","category_vec"]

# Create VectorAssembler to combine features into a single vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Transform the data using the assembler
business_df1 = assembler.transform(business_df1)


                                                                                

In [17]:
business_df1.show(3)

23/05/17 15:32:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1703.2 KiB


+--------------------+--------------------+------------+-----+------------+----------------------+---------------+-------------+--------------+--------------------+--------------------+
|         business_id|          categories|        city|stars|review_count|RestaurantsPriceRange2|PriceRangeIndex|PriceRangeVec|category_index|        category_vec|            features|
+--------------------+--------------------+------------+-----+------------+----------------------+---------------+-------------+--------------+--------------------+--------------------+
|-0iIxySkp97WNlwK6...|Caterers, Sandwic...|        Reno|  3.5|         219|                     1|            1.0|(4,[1],[1.0])|        5254.0|(17595,[5254],[1.0])|(17600,[1,4,5259]...|
|-ZzCVD9Ge7KCy4ffh...|Restaurants, Food...|    Glenside|  4.0|          29|                     1|            1.0|(4,[1],[1.0])|       13496.0|(17595,[13496],[1...|(17600,[1,4,13501...|
|-ajaASaDA_77I6pK3...|Mexican, Restaura...|Mount Laurel|  2.0|        

                                                                                

In [9]:
# from pyspark.sql.functions import udf1
# from pyspark.sql.types import IntegerType, FloatType
# from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# # Create a feature engineering pipeline
# category_count = udf1(lambda x: len(x.split(",")), IntegerType())
# category_avg_length = udf1(lambda x: sum([len(c.strip()) for c in x.split(",")])/len(x.split(",")), FloatType())
# city_popularity = udf1(lambda x: city_popularity_dict[x], FloatType())

# indexer = StringIndexer(inputCol="RestaurantsPriceRange2", outputCol="PriceRangeIndex")
# encoder = OneHotEncoder(inputCol="PriceRangeIndex", outputCol="PriceRangeVec")

# assembler = VectorAssembler(inputCols=["category_count", "category_avg_length", "city_popularity", "PriceRangeVec", "review_count"], outputCol="features")

# # Fit the feature engineering pipeline to the data
# business_df1 = business_df1.withColumn("category_count", category_count("categories"))
# business_df1 = business_df1.withColumn("category_avg_length", category_avg_length("categories"))
# business_df1 = business_df1.withColumn("city_popularity", city_popularity("city"))

# business_df1 = indexer.fit(business_df1).transform(business_df1)
# business_df1 = encoder.fit(business_df1).transform(business_df1)
# business_df1 = assembler.transform(business_df1)


                                                                                

In [16]:
from pyspark.ml.regression import LinearRegression

# Split the data into training and testing sets
(training_data, testing_data) = business_df1.randomSplit([0.7, 0.3])

# Train the machine learning model
lr = LinearRegression(featuresCol="features", labelCol="stars", maxIter=100, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(training_data)

# Evaluate the machine learning model
predictions = lr_model.transform(testing_data)
mse = predictions.selectExpr("avg(pow(stars - prediction, 2))").first()[0]
r2 = lr_model.summary.r2


                                                                                

In [17]:
predictions.show(20)



+--------------------+--------------------+----------------+-----+------------+----------------------+--------------+-------------------+---------------+-------------+--------------------+------------------+
|         business_id|          categories|            city|stars|review_count|RestaurantsPriceRange2|category_count|category_avg_length|PriceRangeIndex|PriceRangeVec|            features|        prediction|
+--------------------+--------------------+----------------+-----+------------+----------------------+--------------+-------------------+---------------+-------------+--------------------+------------------+
|-ZzCVD9Ge7KCy4ffh...|Restaurants, Food...|        Glenside|  4.0|          29|                     1|             4|              11.75|            1.0|(4,[1],[1.0])|[4.0,11.75,0.0,1....|3.4828921945637696|
|0Y8xQvpbmO02SsFZt...|Restaurants, Fast...|       Bridgeton|  2.0|          19|                     1|             3|                9.0|            1.0|(4,[1],[1.0])|[

                                                                                

In [18]:
# from pyspark.ml.regression import DecisionTreeRegressor

# # Split the data into training and testing sets
# (training_data, testing_data) = business_df1.randomSplit([0.7, 0.3])

# # Create a DecisionTreeRegressor
# dt = DecisionTreeRegressor(featuresCol="features", labelCol="stars")

# # Train the decision tree model
# dt_model = dt.fit(training_data)

# # Make predictions on the testing data
# predictions = dt_model.transform(testing_data)

# # Evaluate the model
# mse = predictions.selectExpr("avg(pow(stars - prediction, 2))").first()[0]


                                                                                

In [18]:
from pyspark.ml.regression import DecisionTreeRegressor

# Split the data into training, validation, and testing sets
(training_data, validation_data, testing_data) = business_df1.randomSplit([0.7, 0.15, 0.15])

# Create a DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol="features", labelCol="stars")

# Train the decision tree model
dt_model = dt.fit(training_data)

# Make predictions on the validation data
validation_predictions = dt_model.transform(validation_data)

# Evaluate the model on the validation data
validation_mse = validation_predictions.selectExpr("avg(pow(stars - prediction, 2))").first()[0]

# Make predictions on the testing data
testing_predictions = dt_model.transform(testing_data)

# Evaluate the model on the testing data
testing_mse = testing_predictions.selectExpr("avg(pow(stars - prediction, 2))").first()[0]

# Print the evaluation metrics
print("Validation MSE:", validation_mse)
print("Testing MSE:", testing_mse)


23/05/17 15:33:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/05/17 15:33:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/05/17 15:33:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/05/17 15:33:37 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.1 MiB
23/05/17 15:33:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.1 MiB
23/05/17 15:34:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.1 MiB
23/05/17 15:34:36 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.1 MiB
23/05/17 15:35:06 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 5.1 MiB
23/05/17 15:35:55 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

Validation MSE: 0.5357591627414421
Testing MSE: 0.5398328655169513


                                                                                

In [19]:
testing_predictions.show()

23/05/17 15:36:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB


+--------------------+--------------------+------------+-----+------------+----------------------+---------------+-------------+--------------+--------------------+--------------------+------------------+
|         business_id|          categories|        city|stars|review_count|RestaurantsPriceRange2|PriceRangeIndex|PriceRangeVec|category_index|        category_vec|            features|        prediction|
+--------------------+--------------------+------------+-----+------------+----------------------+---------------+-------------+--------------+--------------------+--------------------+------------------+
|2fJ-WxJlUN6azp3bz...|Restaurants, Chin...|Philadelphia|  4.0|         485|                     1|            1.0|(4,[1],[1.0])|         511.0| (17595,[511],[1.0])|(17600,[1,4,516],...| 4.016081871345029|
|2y_CdkxEOJEJGyJAp...|Restaurants, Amer...|Woolwich Twp|  3.5|         104|                     2|            0.0|(4,[0],[1.0])|          25.0|  (17595,[25],[1.0])|(17600,[0,4,30],

In [20]:
validation_predictions.show(20)

23/05/17 15:36:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB


+--------------------+--------------------+-------------+-----+------------+----------------------+---------------+-------------+--------------+--------------------+--------------------+------------------+
|         business_id|          categories|         city|stars|review_count|RestaurantsPriceRange2|PriceRangeIndex|PriceRangeVec|category_index|        category_vec|            features|        prediction|
+--------------------+--------------------+-------------+-----+------------+----------------------+---------------+-------------+--------------+--------------------+--------------------+------------------+
|6WmOJ8ARLjDUvqtHu...|  Restaurants, Pizza|     Metairie|  4.0|         120|                     2|            0.0|(4,[0],[1.0])|           0.0|   (17595,[0],[1.0])|(17600,[0,4,5],[1...|3.7110181997048697|
|8IOOFeWwW9p8auITe...|Restaurants, Cafe...| Indianapolis|  4.0|          20|                     2|            0.0|(4,[0],[1.0])|         734.0| (17595,[734],[1.0])|(17600,[0,4

23/05/17 15:36:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB


In [32]:
predictions.show(20)

NameError: name 'predictions' is not defined

In [21]:
threshold = 3.5  # Example threshold value

predictions = testing_predictions.withColumn("predicted_label", when(testing_predictions["prediction"] >= threshold, 1).otherwise(0))
total_count = testing_predictions.count()
correct_count = predictions.filter((predictions["stars"] >= threshold) & (predictions["predicted_label"] == 1) |
                                   (predictions["stars"] < threshold) & (predictions["predicted_label"] == 0)).count()

accuracy = correct_count / total_count
tp = predictions.filter((predictions["stars"] >= threshold) & (predictions["predicted_label"] == 1)).count()
tn = predictions.filter((predictions["stars"] < threshold) & (predictions["predicted_label"] == 0)).count()
fp = predictions.filter((predictions["stars"] < threshold) & (predictions["predicted_label"] == 1)).count()
fn = predictions.filter((predictions["stars"] >= threshold) & (predictions["predicted_label"] == 0)).count()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# # Define the threshold for positive class
# threshold = 3.5

# # Convert the regression predictions to binary labels
# predictions = predictions.withColumn("predicted_label", when(predictions["prediction"] >= threshold, 1).otherwise(0))

# # Create a MulticlassClassificationEvaluator
# evaluator = MulticlassClassificationEvaluator(labelCol="stars", predictionCol="predicted_label", metricName="accuracy")

# # Calculate accuracy
# accuracy = evaluator.evaluate(predictions)

# # Calculate F1 score
# f1_score = evaluator.setMetricName("f1").evaluate(predictions)


23/05/17 15:36:18 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/05/17 15:36:20 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/05/17 15:36:22 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/05/17 15:36:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/05/17 15:36:26 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/05/17 15:36:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.6 MiB
                                                                                

In [22]:
f1_score

0.7007824064284205

In [23]:
recall

0.621530382595649

In [24]:
accuracy

0.6555501460564752

In [25]:
precision

0.8031992244304411