## 1. Import data

In [124]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("ReadFromGCS")
    .getOrCreate()
)

# Load the raw per-state review dumps (JSON) from GCS, one DataFrame per state.
review_paths = [
    "gs://msca-bdp-student-gcs/Group_5_final_project/review-California.json",
    "gs://msca-bdp-student-gcs/Group_5_final_project/review-Illinois.json",
    "gs://msca-bdp-student-gcs/Group_5_final_project/review-New_York.json",
]
California_review_df, Illinois_review_df, New_York_review_df = (
    spark.read.json(path) for path in review_paths
)

                                                                                

In [125]:
# Load the raw per-state business metadata dumps (JSON) from GCS, one DataFrame per state.
meta_paths = [
    "gs://msca-bdp-student-gcs/Group_5_final_project/meta-California.json",
    "gs://msca-bdp-student-gcs/Group_5_final_project/meta-Illinois.json",
    "gs://msca-bdp-student-gcs/Group_5_final_project/meta-New_York.json",
]
Meta_California, Meta_Illinois, Meta_New_York = (
    spark.read.json(path) for path in meta_paths
)

                                                                                

In [126]:
California_review_df.printSchema()
California_review_df.show(5)

root
 |-- gmap_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pics: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- rating: long (nullable = true)
 |-- resp: struct (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- time: long (nullable = true)
 |-- text: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)

+--------------------+-----------------+----+------+----+--------------------+-------------+--------------------+
|             gmap_id|             name|pics|rating|resp|                text|         time|             user_id|
+--------------------+-----------------+----+------+----+--------------------+-------------+--------------------+
|0x80c2c98c0e3c16f...|    Michael Rizal|null|     5|null|Great company. Am...|1599164133778|11316555113047622...|
|0x80c2c98c0e3c16

In [127]:
# Stack the three state-level review frames; unionByName matches columns by name.
combined_review_df = (
    California_review_df
    .unionByName(Illinois_review_df)
    .unionByName(New_York_review_df)
)

# Keep every column except `pics` (the nested array of picture URLs).
review_column_to_exclude = ["pics"]
review_column_to_keep = [
    name
    for name in combined_review_df.columns
    if name not in review_column_to_exclude
]

review_df = combined_review_df.select(review_column_to_keep)
review_df.show(5)

+--------------------+-----------------+------+----+--------------------+-------------+--------------------+
|             gmap_id|             name|rating|resp|                text|         time|             user_id|
+--------------------+-----------------+------+----+--------------------+-------------+--------------------+
|0x80c2c98c0e3c16f...|    Michael Rizal|     5|null|Great company. Am...|1599164133778|11316555113047622...|
|0x80c2c98c0e3c16f...|Faranak Rafizadeh|     5|null| Nice people
helpful|1618261672851|10122637137063761...|
|0x80c2c98c0e3c16f...|     Javier Perez|     5|null|                null|1524515066787|11116770366698148...|
|0x80c2c98c0e3c16f...|          Luis P.|     5|null|                null|1499739139293|11623088205734402...|
|0x80c2c98c0e3c16f...|   His Mama Cakez|     3|null|                null|1621410940638|10418165983558523...|
+--------------------+-----------------+------+----+--------------------+-------------+--------------------+
only showing top 5 

In [128]:
review_df.printSchema()

root
 |-- gmap_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- resp: struct (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- time: long (nullable = true)
 |-- text: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



In [129]:
Meta_California.printSchema()
Meta_California.show(5)

root
 |-- MISC: struct (nullable = true)
 |    |-- Accessibility: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Activities: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Amenities: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Atmosphere: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Crowd: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Dining options: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- From the business: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Getting here: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Health & safety: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Health and safety: array (nullable = true)


In [130]:
Meta_California.select("MISC").show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|MISC                                                                                                                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|null                                                                                                                                                                                     |
|{[Wheelchair accessible entrance], null, [Good for kids], [Casual], null, null, null, null, null, null, null, null, [Comfort food], null, null, null, null, [Takeout, Dine-in, Delivery]}|
|{null, null, null, null, null, null, null, null, null, null

## 2. Data transformation

#### Separate MISC

In [131]:
from pyspark.sql.functions import col

def flatten_misc(df):
    """Promote every field of the ``MISC`` struct to a top-level ``MISC_<field>`` column.

    Field names are sanitised for use as column names (spaces become ``_``,
    ``&`` becomes ``and``). Distinct fields that sanitise to the same name
    (for example ``Health & safety`` and ``Health and safety``) are merged
    with ``coalesce`` -- the first non-null value wins, in schema order --
    instead of the later field silently overwriting the earlier one.
    NOTE(review): merging assumes colliding fields share the same data type.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame that may contain a struct column named ``MISC``.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` without ``MISC`` and with the ``MISC_*`` columns appended.
        When ``df`` has no ``MISC`` column it is returned unchanged, so
        re-running the cell on an already-flattened frame is a no-op.
    """
    # Function-scope import so this cell only needs `col` from the notebook imports.
    from pyspark.sql.functions import coalesce

    if "MISC" not in df.columns:
        return df

    # Group original field names by their sanitised name, preserving schema order.
    fields_by_safe_name = {}
    for mc in df.select("MISC.*").columns:
        safe_mc = mc.replace(" ", "_").replace("&", "and")
        fields_by_safe_name.setdefault(safe_mc, []).append(mc)

    # One output column per sanitised name; backticks protect names with spaces or '&'.
    flattened_cols = [
        coalesce(*[col(f"`MISC`.`{mc}`") for mc in source_fields]).alias(f"MISC_{safe_mc}")
        for safe_mc, source_fields in fields_by_safe_name.items()
    ]

    # A single select avoids the growing query plan of repeated withColumn calls.
    return df.select("*", *flattened_cols).drop("MISC")

# Flatten the MISC struct on each state frame before combining them.
Meta_California, Meta_Illinois, Meta_New_York = (
    flatten_misc(state_meta)
    for state_meta in (Meta_California, Meta_Illinois, Meta_New_York)
)

# Union by column name; allowMissingColumns=True null-fills any column
# that is absent from one of the frames.
combined_meta_df = Meta_California
for state_meta in (Meta_Illinois, Meta_New_York):
    combined_meta_df = combined_meta_df.unionByName(state_meta, allowMissingColumns=True)

In [132]:
# Keep every metadata column except `url`.
meta_column_to_exclude = ["url"]
meta_column_to_keep = [
    name  # named `name` rather than `col` so it cannot be mistaken for pyspark's col()
    for name in combined_meta_df.columns
    if name not in meta_column_to_exclude
]

meta_df = combined_meta_df.select(meta_column_to_keep)
meta_df.show(5)

+--------------------+----------+-------------------+-----------+--------------------+--------------------+----------+------------+--------------------+--------------+-----+--------------------+--------------------+--------------------+---------------+---------------+---------------+----------+-------------------+----------------------+-----------------+----------------------+---------------+--------------------+--------------+--------------------+-------------+----------------+--------------+--------------------+
|             address|avg_rating|           category|description|             gmap_id|               hours|  latitude|   longitude|                name|num_of_reviews|price|    relative_results|               state|  MISC_Accessibility|MISC_Activities| MISC_Amenities|MISC_Atmosphere|MISC_Crowd|MISC_Dining_options|MISC_From_the_business|MISC_Getting_here|MISC_Health_and_safety|MISC_Highlights|MISC_Lodging_options|MISC_Offerings|       MISC_Payments|MISC_Planning|MISC_Popular_f

#### Rename the column to avoid duplicate column names.

In [133]:
review_df = review_df.withColumnRenamed("name", "cust_name")

In [134]:
meta_df = meta_df.withColumnRenamed("name", "store_name")

#### DATA QUALITY CHECKS

In [135]:
print("Review DF count:", review_df.count())
review_df.printSchema()



Review DF count: 127086576
root
 |-- gmap_id: string (nullable = true)
 |-- cust_name: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- resp: struct (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- time: long (nullable = true)
 |-- text: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



                                                                                

In [136]:
print("Meta DF count:", meta_df.count())
meta_df.printSchema()



Meta DF count: 967355
root
 |-- address: string (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- gmap_id: string (nullable = true)
 |-- hours: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- store_name: string (nullable = true)
 |-- num_of_reviews: long (nullable = true)
 |-- price: string (nullable = true)
 |-- relative_results: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- MISC_Accessibility: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- MISC_Activities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- MISC_Amenities: array (nullable = true)
 |    |-- eleme

                                                                                

#### Store count by category

In [137]:
from pyspark.sql.functions import col

meta_df.groupBy("category") \
       .count() \
       .orderBy(col("count").desc()) \
       .show(20, truncate=False)



+------------------------------------------------------------------------------+-----+
|category                                                                      |count|
+------------------------------------------------------------------------------+-----+
|[Restaurant]                                                                  |14475|
|[Gas station]                                                                 |9725 |
|[Auto repair shop]                                                            |8870 |
|[Nail salon]                                                                  |8844 |
|[Park]                                                                        |8322 |
|[Mexican restaurant]                                                          |7991 |
|[Barber shop]                                                                 |7976 |
|[Park, Tourist attraction]                                                    |7575 |
|[Beauty salon]                            

                                                                                

#### Select categories that contain "restaurant"

In [138]:
from pyspark.sql.functions import expr

# Keep stores that have at least one category label containing "restaurant"
# (compared in lower case, so the match is case-insensitive).
has_restaurant_category = expr("exists(category, x -> lower(x) LIKE '%restaurant%')")
meta_df = meta_df.filter(has_restaurant_category)

In [139]:
# Raise Spark display-related limits, then preview a few raw addresses.
# NOTE(review): `spark.sql.debug.maxToStringFields` caps how many fields are
# rendered in debug/plan strings, not the length of displayed values -- confirm
# this is the intended setting.
# NOTE(review): `spark.sql.showStringLength` does not appear to be a documented
# Spark SQL option -- verify it has any effect. `show(truncate=False)` below
# already prints full values on its own.
spark.conf.set("spark.sql.debug.maxToStringFields", 10000)
spark.conf.set("spark.sql.showStringLength", 1000000)

meta_df.select("address").show(3, truncate=False)

+-------------------------------------------------------+
|address                                                |
+-------------------------------------------------------+
|San Soo Dang, 761 S Vermont Ave, Los Angeles, CA 90005 |
|Vons Chicken, 12740 La Mirada Blvd, La Mirada, CA 90638|
|TACOS LA CABANA, 2015 22nd Ave, Oakland, CA 94606      |
+-------------------------------------------------------+
only showing top 3 rows



In [140]:
print("Meta DF count:", meta_df.count())

Meta DF count: 171589


                                                                                

#### Separate address

In [141]:
# Read the ZIP-code lookup CSV from GCS.
# inferSchema=False keeps every column as a string, which preserves the
# leading zeros of ZIP codes such as "00501".
zip_df = spark.read.csv(
    "gs://msca-bdp-student-gcs/Group_5_final_project/zip_code_database.csv",
    header=True,
    inferSchema=False,
)

# Prefix the lookup columns with `zip_` so they cannot clash with meta_df
# columns (e.g. `state`) after the join. Parentheses replace the backslash
# continuations, one of which previously ran into the following comment line.
zip_df = (
    zip_df
    .withColumnRenamed("zip", "zip_lookup")
    .withColumnRenamed("state", "zip_state")
    .withColumnRenamed("county", "zip_county")
    .withColumnRenamed("primary_city", "zip_city")
)

# Show sample rows
zip_df.show(10, truncate=False)

+----------+----------+---------+-------------------+------------------------+
|zip_lookup|zip_city  |zip_state|zip_county         |irs_estimated_population|
+----------+----------+---------+-------------------+------------------------+
|00501     |Holtsville|NY       |Suffolk County     |562                     |
|00544     |Holtsville|NY       |Suffolk County     |0                       |
|00601     |Adjuntas  |PR       |Adjuntas Municipio |0                       |
|00602     |Aguada    |PR       |Aguada Municipio   |0                       |
|00603     |Aguadilla |PR       |Aguadilla Municipio|0                       |
|00604     |Aguadilla |PR       |null               |0                       |
|00605     |Aguadilla |PR       |null               |0                       |
|00606     |Maricao   |PR       |Maricao Municipio  |0                       |
|00610     |Anasco    |PR       |Anasco Municipio   |0                       |
|00611     |Angeles   |PR       |Utuado Municipio   

In [142]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import re

# UDF to extract the last valid 5-digit zip in CA, NY, IL ranges
def extract_valid_zip(address):
    """Return the right-most 5-digit token of ``address`` that lies in a CA, NY or IL ZIP range.

    Parameters
    ----------
    address : str or None
        Free-text address.

    Returns
    -------
    str or None
        The matching 5-digit token exactly as it appears in ``address``, or
        ``None`` when ``address`` is empty/None or no token qualifies.

    Notes
    -----
    Any standalone 5-digit number inside the accepted ranges qualifies, so a
    5-digit street number can be returned when the address has no in-range
    ZIP after it.
    """
    if not address:
        return None

    accepted_ranges = (
        (90000, 96199),  # CA
        (10000, 14999),  # NY
        (60000, 62999),  # IL
    )

    candidates = re.findall(r'\b\d{5}\b', address)
    # Scan right-to-left so the last qualifying token in the address wins.
    for token in candidates[::-1]:
        value = int(token)
        if any(low <= value <= high for low, high in accepted_ranges):
            return token
    return None

# Wrap the Python extractor as a Spark UDF that returns a string column.
extract_zip_udf = udf(extract_valid_zip, StringType())

# Derive a `zip` column from the free-text `address` column.
address_column = col("address")
meta_zip_df = meta_df.withColumn("zip", extract_zip_udf(address_column))

# Preview the extracted ZIP next to its source address.
meta_zip_df.select("address", "zip").show(10, truncate=False)

+-----------------------------------------------------------------------------------------------+-----+
|address                                                                                        |zip  |
+-----------------------------------------------------------------------------------------------+-----+
|San Soo Dang, 761 S Vermont Ave, Los Angeles, CA 90005                                         |90005|
|Vons Chicken, 12740 La Mirada Blvd, La Mirada, CA 90638                                        |90638|
|TACOS LA CABANA, 2015 22nd Ave, Oakland, CA 94606                                              |94606|
|Mariscos el poblano, 5401-5441 Coliseum Way, Oakland, CA 94601                                 |94601|
|Off The Hoof, 201 E 4th St, Santa Ana, CA 92701                                                |92701|
|La Potranca, 12821 Venice Blvd., Los Angeles, CA 90066                                         |90066|
|Atlantis Burgers, 5853 Imperial Hwy, South Gate, CA 90280      

In [143]:
from pyspark.sql.functions import regexp_extract, col

# Left-join the store metadata to the ZIP lookup table: stores whose ZIP has
# no match are kept, with null zip_* columns.
zip_matches = meta_zip_df["zip"] == zip_df["zip_lookup"]
meta_zip_df = (
    meta_zip_df
    .join(zip_df, on=zip_matches, how="left")
    .drop("zip_lookup")  # lookup-side join key; `zip` already carries the value
)

# Preview the enriched rows.
enriched_preview_columns = [
    "address",
    "zip",
    "zip_city",
    "zip_state",
    "zip_county",
    "irs_estimated_population",
]
meta_zip_df.select(*enriched_preview_columns).show(10)

[Stage 455:>                                                        (0 + 1) / 1]

+--------------------+-----+-------------+---------+------------------+------------------------+
|             address|  zip|     zip_city|zip_state|        zip_county|irs_estimated_population|
+--------------------+-----+-------------+---------+------------------+------------------------+
|San Soo Dang, 761...|90005|  Los Angeles|       CA|Los Angeles County|                   27330|
|Vons Chicken, 127...|90638|    La Mirada|       CA|Los Angeles County|                   41100|
|TACOS LA CABANA, ...|94606|      Oakland|       CA|    Alameda County|                   27530|
|Mariscos el pobla...|94601|      Oakland|       CA|    Alameda County|                   40720|
|Off The Hoof, 201...|92701|    Santa Ana|       CA|     Orange County|                   40280|
|La Potranca, 1282...|90066|  Los Angeles|       CA|Los Angeles County|                   44640|
|Atlantis Burgers,...|90280|   South Gate|       CA|Los Angeles County|                   79010|
|Cowboy Burgers & ...|91706| B

                                                                                

In [144]:
from pyspark.sql.functions import col

# Store counts per state, then per city (both taken from the ZIP lookup),
# largest first; show() prints at most the top 20 of each.
for geo_column in ("zip_state", "zip_city"):
    (
        meta_zip_df
        .groupBy(geo_column)
        .count()
        .orderBy(col("count").desc())
        .show(20, truncate=False)
    )

                                                                                

+---------+-----+
|zip_state|count|
+---------+-----+
|CA       |92493|
|NY       |50047|
|IL       |28771|
|null     |278  |
+---------+-----+





+-------------+-----+
|zip_city     |count|
+-------------+-----+
|New York     |10461|
|Chicago      |8035 |
|Los Angeles  |7101 |
|Brooklyn     |6198 |
|San Francisco|4248 |
|San Diego    |3960 |
|San Jose     |2122 |
|Bronx        |2053 |
|Sacramento   |1887 |
|Buffalo      |1373 |
|Fresno       |1261 |
|Long Beach   |1237 |
|Rochester    |1165 |
|Bakersfield  |1109 |
|Oakland      |1105 |
|Anaheim      |906  |
|Staten Island|874  |
|Flushing     |786  |
|Riverside    |773  |
|Santa Ana    |711  |
+-------------+-----+
only showing top 20 rows



                                                                                

In [145]:
from pyspark.sql.functions import col

# Show addresses where zip_state is null
meta_zip_df.filter(col("zip_state").isNull()) \
           .select("address", "store_name", "zip") \
           .show(20, truncate=False)

+--------------------------------------------------------+--------------------------------------------------------------+-----+
|address                                                 |store_name                                                    |zip  |
+--------------------------------------------------------+--------------------------------------------------------------+-----+
|null                                                    |Bull&Dragon                                                   |null |
|Yaki Tori, 6502 Westminster Blvd., Westminster, CA 92687|Yaki Tori                                                     |92687|
|Kirkwood Sunrise Grill, California                      |Kirkwood Sunrise Grill                                        |null |
|null                                                    |PowR Meals                                                    |null |
|null                                                    |Beans BBQ Catering - Santa Barbara            

#### Filter out rows whose ZIP state is null

In [146]:
from pyspark.sql.functions import col

meta_zip_df = meta_zip_df.filter(col("zip_state").isNotNull())

In [147]:
from pyspark.sql.functions import col

# Verify that no rows with a null zip_state remain (expect an empty result)
meta_zip_df.filter(col("zip_state").isNull()) \
           .select("address", "store_name", "zip") \
           .show(20, truncate=False)

+-------+----------+---+
|address|store_name|zip|
+-------+----------+---+
+-------+----------+---+



#### Keep only reviews of valid restaurants

In [148]:
# left_semi keeps the review rows whose gmap_id appears in the filtered
# restaurant metadata, without adding any metadata columns to the result.
valid_restaurant_ids = meta_zip_df.select("gmap_id").distinct()
review_df = review_df.join(valid_restaurant_ids, on="gmap_id", how="left_semi")

In [149]:
print("Review DF count:", review_df.count())



Review DF count: 52215635


                                                                                

#### Check and remove duplicate rows

In [150]:
meta_df_duplicate_count = meta_zip_df.count() - meta_zip_df.dropDuplicates().count()

print("Meta_df Number of duplicated row: {}".format(meta_df_duplicate_count))



Meta_df Number of duplicated row: 250


                                                                                

In [151]:
review_df_duplicate_count = review_df.count() - review_df.dropDuplicates().count()

print("Review_df Number of duplicated row: {}".format(review_df_duplicate_count))



Review_df Number of duplicated row: 630060


                                                                                

In [152]:
meta_df_clean = meta_zip_df.dropDuplicates()

In [153]:
review_df_clean = review_df.dropDuplicates()

In [154]:
# Verify that no duplicate rows remain in the cleaned metadata
cleaned_meta_df_duplicate_count = meta_df_clean.count() - meta_df_clean.dropDuplicates().count()

print("Cleaned_Meta_df Number of duplicated row: {}".format(cleaned_meta_df_duplicate_count))



Cleaned_Meta_df Number of duplicated row: 0


                                                                                

In [155]:
# Verify that no duplicate rows remain in the cleaned review data:
# the row count before and after dropDuplicates() should be equal.
cleaned_review_df_duplicate_count = review_df_clean.count() - review_df_clean.dropDuplicates().count()

# The label names the review frame (it previously said "Cleaned_Meta_df").
print("Cleaned_Review_df Number of duplicated row: {}".format(cleaned_review_df_duplicate_count))



Cleaned_Meta_df Number of duplicated row: 0


                                                                                

#### Check missing values

In [156]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, ArrayType

def missing_summary_safe(df):
    """Count missing values per column of ``df`` in a single aggregation.

    A value is missing when it is null. For string columns an empty string
    (``""``) is counted as missing too. Every other type (struct, array,
    numeric, ...) is checked for null only.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to inspect.

    Returns
    -------
    pyspark.sql.DataFrame
        One-row frame with one column per column of ``df``, holding that
        column's missing-value count.
    """
    summary_cols = []
    for field in df.schema.fields:
        c = field.name
        is_missing = F.col(c).isNull()
        # typeName() is "string" for StringType. The earlier test
        # `"string" in str(dtype)` never matched, because str(StringType())
        # is "StringType"/"StringType()" (capital S), so empty strings were
        # never counted.
        if field.dataType.typeName() == "string":
            is_missing = is_missing | (F.col(c) == "")
        # when() without otherwise() yields null for non-matching rows and
        # count() skips nulls, so this counts exactly the missing rows.
        summary_cols.append(F.count(F.when(is_missing, c)).alias(c))
    return df.select(summary_cols)


In [157]:
missing_summary_safe(review_df_clean).show()
missing_summary_safe(meta_df_clean).show()

                                                                                

+-------+---------+------+--------+--------+----+-------+
|gmap_id|cust_name|rating|    resp|    text|time|user_id|
+-------+---------+------+--------+--------+----+-------+
|      0|        0| 20232|47425668|23302069|   0|  20232|
+-------+---------+------+--------+--------+----+-------+





+-------+----------+--------+-----------+-------+-----+--------+---------+----------+--------------+-----+----------------+-----+------------------+---------------+--------------+---------------+----------+-------------------+----------------------+-----------------+----------------------+---------------+--------------------+--------------+-------------+-------------+----------------+--------------+--------------------+---+--------+---------+----------+------------------------+
|address|avg_rating|category|description|gmap_id|hours|latitude|longitude|store_name|num_of_reviews|price|relative_results|state|MISC_Accessibility|MISC_Activities|MISC_Amenities|MISC_Atmosphere|MISC_Crowd|MISC_Dining_options|MISC_From_the_business|MISC_Getting_here|MISC_Health_and_safety|MISC_Highlights|MISC_Lodging_options|MISC_Offerings|MISC_Payments|MISC_Planning|MISC_Popular_for|MISC_Recycling|MISC_Service_options|zip|zip_city|zip_state|zip_county|irs_estimated_population|
+-------+----------+--------+-----

                                                                                

In [158]:
from pyspark.sql.functions import col, count

# Number of stores per price tier, most common tier first.
price_counts = meta_df_clean.groupBy("price").agg(count("*").alias("num_businesses"))
price_counts.orderBy("num_businesses", ascending=False).show(truncate=False)

# Descriptive statistics of the stores' average rating.
rating_column = meta_df_clean.select("avg_rating")
rating_column.summary().show()

                                                                                

+-----+--------------+
|price|num_businesses|
+-----+--------------+
|$    |63251         |
|$$   |54145         |
|null |48447         |
|$$$  |3332          |
|₩₩   |666           |
|₩    |609           |
|$$$$ |571           |
|₩₩₩  |34            |
|₩₩₩₩ |6             |
+-----+--------------+





+-------+------------------+
|summary|        avg_rating|
+-------+------------------+
|  count|            171061|
|   mean| 4.198608683452104|
| stddev|0.4222831945263846|
|    min|               1.0|
|    25%|               4.0|
|    50%|               4.3|
|    75%|               4.5|
|    max|               5.0|
+-------+------------------+



                                                                                

## 3. Join review data and meta data

In [159]:
combined_df = review_df_clean.join(meta_df_clean, on="gmap_id", how="inner")

In [160]:
combined_df.show(3)



+--------------------+------------+------+----+--------------------+-------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----------+------------+-------------------+--------------+-----+--------------------+-----+--------------------+---------------+--------------------+---------------+-----------------+--------------------+----------------------+-----------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------+--------------------+-----+----------+---------+------------------+------------------------+
|             gmap_id|   cust_name|rating|resp|                text|         time|             user_id|             address|avg_rating|            category|         description|               hours|  latitude|   longitude|         store_name|num_of_reviews|price|    relative_results|state|  MISC_Access

                                                                                

In [161]:
meta_df_clean.show(3)



+--------------------+----------+--------------------+-----------+--------------------+--------------------+------------------+-------------------+--------------------+--------------+-----+--------------------+-------------------+--------------------+---------------+--------------------+---------------+------------------+-------------------+----------------------+-----------------+----------------------+---------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------+--------------------+-----+----------+---------+-------------+------------------------+
|             address|avg_rating|            category|description|             gmap_id|               hours|          latitude|          longitude|          store_name|num_of_reviews|price|    relative_results|              state|  MISC_Accessibility|MISC_Activities|      MISC_Amenities|MISC_Atmosphere|        MISC_Crowd|MISC_Dining_options|MISC_From_the_business|MISC_Getti

                                                                                

In [162]:
review_df_clean.show(3)



+--------------------+---------------+------+----+--------------------+-------------+--------------------+
|             gmap_id|      cust_name|rating|resp|                text|         time|             user_id|
+--------------------+---------------+------+----+--------------------+-------------+--------------------+
|0x80dd3124f5bec85...|   Chris Lester|     1|null|Sat down and orde...|1491597356577|10984689565900977...|
|0x80c34d63ccfc87a...|Shawna Spellacy|     1|null|Do NOT go here if...|1624520166650|11271361955155364...|
|0x8085bc9abc80160...|    Kitson Wood|     3|null|The chicken was t...|1569035215070|11083113466091150...|
+--------------------+---------------+------+----+--------------------+-------------+--------------------+
only showing top 3 rows



                                                                                

In [163]:
print("meta_df_clean len:", meta_df_clean.count())



meta_df_clean len: 171061


                                                                                

In [164]:
print("review_df_clean len", review_df_clean.count())



review_df_clean len 51585575


                                                                                

In [165]:
print("combined_df len", combined_df.count())



combined_df len 51585575


                                                                                

In [166]:
meta_df_clean.write.mode("overwrite").parquet("gs://msca-bdp-student-gcs/Group_5_final_project/cleaned_restaurant_meta_df/")

                                                                                

In [167]:
review_df_clean.write.mode("overwrite").parquet("gs://msca-bdp-student-gcs/Group_5_final_project/cleaned_restaurant_review_df/")

                                                                                

In [168]:
combined_df.write.mode("overwrite").parquet("gs://msca-bdp-student-gcs/Group_5_final_project/combined_rest_df/") 

                                                                                