# Split Data Train/Test Text/Non-Text

### Setting Up Spark

In [1]:
import os

import pyspark as ps
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType, ArrayType

In [2]:
# Create the SparkSession for this notebook (or reuse one that already exists):
# application name "Split_Data", local master 'local[4]'.
spark = ps.sql.SparkSession.builder.appName("Split_Data").master('local[4]').getOrCreate()

# Keep a handle on the session's SparkContext.
sc = spark.sparkContext

In [3]:
# Show the SparkSession object as the cell output.
spark

### Connecting To Data

In [4]:
# Directory that holds the analytics-ready data set.
# Set the YELP_DATA_DIR environment variable to point the notebook at another
# machine's copy; when it is unset (or empty) the original location is used.
_DEFAULT_DATA_LOCATION = "/home/jovyan/work/Documents/Data_Science_Projects/Yelp_Reviews/data/full_data/analytics_ready/"

# os.path.join(..., "") guarantees a trailing separator, which matters because
# the file name is appended to this value by plain string concatenation.
data_location = os.path.join(os.environ.get("YELP_DATA_DIR") or _DEFAULT_DATA_LOCATION, "")

In [5]:
# Name of the JSON file to load from data_location.
filename = "all_data.json"

In [6]:
# Load the JSON data into a Spark DataFrame. The path is built by string
# concatenation, so data_location must end with a path separator.
all_data = spark.read.json(data_location + filename)

### Explore all_data

In [7]:
# Print the column names and types of the full data set.
all_data.printSchema()

root
 |-- biz_avg_stars: double (nullable = true)
 |-- biz_checkin_count: long (nullable = true)
 |-- biz_latitude: double (nullable = true)
 |-- biz_longitude: double (nullable = true)
 |-- biz_max_checkin_date: string (nullable = true)
 |-- biz_min_checkin_date: string (nullable = true)
 |-- biz_postal_code: string (nullable = true)
 |-- biz_review_count: long (nullable = true)
 |-- biz_state: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- review_text: string (nullable = true)
 |-- target_ufc_bool: string (nullable = true)
 |-- target_ufc_count: long (nullable = true)
 |-- user_avg_stars: double (nullable = true)
 |-- user_compliment_count: long (nullable = true)
 |-- user_elite_count: long (nullable = true)
 |-- user_elite_max: string (nullable = true)
 |-- user_elite_min: string (nullable = true)
 |-- user_fan_count: long (nullable = true)
 |-- user_friend_count: long (nul

In [8]:
# Preview the first 5 rows of the full data set.
all_data.show(5)

+-------------+-----------------+------------+-------------+--------------------+--------------------+---------------+----------------+---------+--------------------+--------------------+------------+--------------------+---------------+----------------+--------------+---------------------+----------------+--------------+--------------+--------------+-----------------+--------------------+-----------------+--------------------+
|biz_avg_stars|biz_checkin_count|biz_latitude|biz_longitude|biz_max_checkin_date|biz_min_checkin_date|biz_postal_code|biz_review_count|biz_state|         business_id|           review_id|review_stars|         review_text|target_ufc_bool|target_ufc_count|user_avg_stars|user_compliment_count|user_elite_count|user_elite_max|user_elite_min|user_fan_count|user_friend_count|             user_id|user_review_count|  user_yelping_since|
+-------------+-----------------+------------+-------------+--------------------+--------------------+---------------+----------------+-

In [9]:
# Total number of rows in the full data set.
all_data.count()

8635403

In [10]:
# Register all_data as a temporary view named "all_data" for use with spark.sql.
# NOTE(review): no visible cell queries this view — confirm it is still needed.
all_data.createOrReplaceTempView("all_data")

### Split Data Into Working and Holdout Sets

In [11]:
# Randomly split the rows into a working set (weight 0.8) and a holdout set
# (weight 0.2). The split is random per row, so the resulting sizes are only
# approximately 80% / 20%; the seed is fixed so the split can be repeated.
working_data, holdout_data = all_data.randomSplit([0.8, 0.2], seed=12345)

In [12]:
# Number of rows in the working split.
working_data.count()

6907890

In [13]:
# Number of rows in the holdout split; together with the working split this
# should add up to the row count of all_data.
holdout_data.count()

1727513

### Split Data Into Text and Non-Text

In [15]:
# Register working_data as a temporary view named "working_data", making it
# queryable through spark.sql.
working_data.createOrReplaceTempView("working_data")

In [16]:
# Text subset of the working split: the review identifier, its star rating,
# the raw review text, and the two target columns.
#
# The columns are selected directly from the working_data DataFrame instead of
# going through a temp view looked up by name. A view keeps pointing at whatever
# DataFrame was registered last, so re-running the split without re-registering
# the view would silently build this subset from a stale split.
text_data = working_data.select(
    "review_id",
    "review_stars",
    "review_text",
    "target_ufc_bool",
    "target_ufc_count",
)

In [17]:
# Check that text_data contains exactly the five selected columns.
text_data.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- review_text: string (nullable = true)
 |-- target_ufc_bool: string (nullable = true)
 |-- target_ufc_count: long (nullable = true)



In [18]:
# No rows are filtered when building text_data, so this should equal the
# working_data row count.
text_data.count()

6907890

In [22]:
# Preview the first 5 rows of the text subset.
text_data.show(5)

+--------------------+------------+--------------------+---------------+----------------+
|           review_id|review_stars|         review_text|target_ufc_bool|target_ufc_count|
+--------------------+------------+--------------------+---------------+----------------+
|HfHK-fFTRfIIyQqPm...|         1.0|For some reason i...|           True|               1|
|KJi3rMjdADYFM04us...|         1.0|I wish there was ...|           True|               1|
|RkSTjgrwEuLsNCaqZ...|         1.0|The doctor was EX...|           True|               1|
|bfzrGhNHUmwIjG-xj...|         1.0|I tried to find a...|           True|               4|
|jJavFkU09T_aAcEuv...|         1.0|I have been at th...|          False|               0|
+--------------------+------------+--------------------+---------------+----------------+
only showing top 5 rows



In [26]:
# Non-text subset of the working split: identifiers, review rating, business
# features, user features, and the two target columns (everything except
# review_text).
#
# The columns are selected directly from the working_data DataFrame instead of
# going through a temp view looked up by name. A view keeps pointing at whatever
# DataFrame was registered last, so re-running the split without re-registering
# the view would silently build this subset from a stale split.
non_text_data = working_data.select(
    # identifiers
    "review_id",
    "user_id",
    "business_id",
    # review-level feature
    "review_stars",
    # business-level features
    "biz_avg_stars",
    "biz_review_count",
    "biz_checkin_count",
    "biz_max_checkin_date",
    "biz_min_checkin_date",
    "biz_latitude",
    "biz_longitude",
    "biz_postal_code",
    "biz_state",
    # user-level features
    "user_avg_stars",
    "user_review_count",
    "user_friend_count",
    "user_fan_count",
    "user_compliment_count",
    "user_elite_count",
    "user_elite_max",
    "user_elite_min",
    "user_yelping_since",
    # targets
    "target_ufc_bool",
    "target_ufc_count",
)

In [27]:
# Check the columns and types of the non-text subset.
non_text_data.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- biz_avg_stars: double (nullable = true)
 |-- biz_review_count: long (nullable = true)
 |-- biz_checkin_count: long (nullable = true)
 |-- biz_max_checkin_date: string (nullable = true)
 |-- biz_min_checkin_date: string (nullable = true)
 |-- biz_latitude: double (nullable = true)
 |-- biz_longitude: double (nullable = true)
 |-- biz_postal_code: string (nullable = true)
 |-- biz_state: string (nullable = true)
 |-- user_avg_stars: double (nullable = true)
 |-- user_review_count: long (nullable = true)
 |-- user_friend_count: long (nullable = true)
 |-- user_fan_count: long (nullable = true)
 |-- user_compliment_count: long (nullable = true)
 |-- user_elite_count: long (nullable = true)
 |-- user_elite_max: string (nullable = true)
 |-- user_elite_min: string (nullable = true)
 |-- user_yelping_since: string (nul

In [28]:
# Preview the first 5 rows of the non-text subset.
non_text_data.show(5)

+--------------------+--------------------+--------------------+------------+-------------+----------------+-----------------+--------------------+--------------------+------------+-------------+---------------+---------+--------------+-----------------+-----------------+--------------+---------------------+----------------+--------------+--------------+--------------------+---------------+----------------+
|           review_id|             user_id|         business_id|review_stars|biz_avg_stars|biz_review_count|biz_checkin_count|biz_max_checkin_date|biz_min_checkin_date|biz_latitude|biz_longitude|biz_postal_code|biz_state|user_avg_stars|user_review_count|user_friend_count|user_fan_count|user_compliment_count|user_elite_count|user_elite_max|user_elite_min|  user_yelping_since|target_ufc_bool|target_ufc_count|
+--------------------+--------------------+--------------------+------------+-------------+----------------+-----------------+--------------------+--------------------+----------

In [29]:
# No rows are filtered when building non_text_data, so this should equal the
# working_data row count.
non_text_data.count()

6907890

### Save Data

In [37]:
# Write the text subset as JSON to 'text_data.json' (relative to the current
# working directory). coalesce(1) collapses the data to a single partition first.
# mode='overwrite' keeps the cell re-runnable: the default save mode raises an
# error when the target path already exists.
text_data.coalesce(1).write.json(path='text_data.json', mode='overwrite')

In [38]:
# Write the non-text subset as CSV to 'non_text_data.csv' (relative to the
# current working directory). coalesce(1) collapses the data to a single
# partition first.
# mode='overwrite' keeps the cell re-runnable: the default save mode raises an
# error when the target path already exists.
# NOTE(review): no header option is passed, so the column names are not stored
# in the CSV — confirm downstream readers supply the column names themselves.
non_text_data.coalesce(1).write.csv(path='non_text_data.csv', mode='overwrite')

In [39]:
# Write the untouched holdout split (all columns) as JSON to 'holdout_data.json'
# (relative to the current working directory). coalesce(1) collapses the data
# to a single partition first.
# mode='overwrite' keeps the cell re-runnable: the default save mode raises an
# error when the target path already exists.
holdout_data.coalesce(1).write.json(path='holdout_data.json', mode='overwrite')