# Init Spark

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = ( 
    SparkSession.builder
            .appName('test').master("yarn")
            .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/22 11:35:16 INFO SparkEnv: Registering MapOutputTracker
24/03/22 11:35:16 INFO SparkEnv: Registering BlockManagerMaster
24/03/22 11:35:16 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/03/22 11:35:17 INFO SparkEnv: Registering OutputCommitCoordinator


.master("local[*]").  Nếu chạy local. Dấu * tượng trưng cho ý spark được phép dùng hết tài nguyên của máy ( CPU/RAM ) để xử lý 

.master("yarn"). Nếu chạy trên cluster YARN


Có thể thêm config vào. Một số config để connect với s3

  - .config("spark.jars", "aws-sdk-java-2.17.81.jar") 

  - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
  
  - .config("spark.hadoop.fs.s3a.access.key", "<your_access_key_id>")
  
  - .config("spark.hadoop.fs.s3a.secret.key", "<your_secret_access_key>")

In [4]:
spark

In [5]:
!pyspark --version and spark-shell --version 

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.3.2
      /_/
                        
Using Scala version 2.12.18, OpenJDK 64-Bit Server VM, 11.0.20.1
Branch dataproc-branch-3.3.2
Compiled by user  on 2024-03-15T00:34:11Z
Revision 8a05f8da2bcd58acfc0b8d97000abb2c4a6b8f59
Url https://bigdataoss-internal.googlesource.com/third_party/apache/spark
Type --help for more information.


# Read File

In [6]:
data_storage = "gs://book_crawl/bookcrawl.parquet"

df = spark.read.parquet(data_storage)

                                                                                

In [7]:
df.printSchema()

root
 |-- url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- price_excl_tax: string (nullable = true)
 |-- price_incl_tax: string (nullable = true)
 |-- tax: string (nullable = true)
 |-- availability: string (nullable = true)
 |-- number_of_reviews: long (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- price: string (nullable = true)



In [9]:
df.show()

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------------+--------------+--------------+-----+--------------------+-----------------+-----------------+------+
|                 url|               title|product_type|price_excl_tax|price_incl_tax|  tax|        availability|number_of_reviews|      star_rating| price|
+--------------------+--------------------+------------+--------------+--------------+-----+--------------------+-----------------+-----------------+------+
|https://books.tos...|Scott Pilgrim's P...|       Books|        £52.29|        £52.29|£0.00|In stock (19 avai...|                0| star-rating Five|£52.29|
|https://books.tos...|Libertarianism fo...|       Books|        £51.33|        £51.33|£0.00|In stock (19 avai...|                0|  star-rating Two|£51.33|
|https://books.tos...|A Light in the Attic|       Books|        £51.77|        £51.77|£0.00|In stock (22 avai...|                0|star-rating Three|£51.77|
|https://books.tos...|                Olio|       Books|  

                                                                                

# Read Spark with Schema

In [8]:
from pyspark.sql import types

In [16]:
schema = types.StructType([
    types.StructField('url', types.StringType(), True),
    types.StructField('title', types.StringType(), True),
    types.StructField('product_type', types.StringType(), True),
    types.StructField('price_excl_tax', types.StringType(), True),
    types.StructField('price_incl_tax', types.StringType(), True),
    types.StructField('tax', types.StringType(), True),
    types.StructField('availability', types.StringType(), True),
    types.StructField('number_of_reviews', types.LongType(), True),
    types.StructField('star_rating', types.StringType(), True),
    types.StructField('price', types.StringType(), True)
])

In [17]:
df = spark.read \
    .schema(schema) \
    .parquet(data_storage)

In [18]:
df.show()

[Stage 5:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------------+--------------+--------------+-----+--------------------+-----------------+-----------------+------+
|                 url|               title|product_type|price_excl_tax|price_incl_tax|  tax|        availability|number_of_reviews|      star_rating| price|
+--------------------+--------------------+------------+--------------+--------------+-----+--------------------+-----------------+-----------------+------+
|https://books.tos...|Scott Pilgrim's P...|       Books|        £52.29|        £52.29|£0.00|In stock (19 avai...|                0| star-rating Five|£52.29|
|https://books.tos...|Libertarianism fo...|       Books|        £51.33|        £51.33|£0.00|In stock (19 avai...|                0|  star-rating Two|£51.33|
|https://books.tos...|A Light in the Attic|       Books|        £51.77|        £51.77|£0.00|In stock (22 avai...|                0|star-rating Three|£51.77|
|https://books.tos...|                Olio|       Books|  

                                                                                

# Làm việc với cột ( columns ) 

In [19]:
df = df.withColumn('star_rating', df['star_rating'].cast(types.FloatType())) # Transform Col 
df = df.withColumnRenamed("url","book_url")

In [20]:
df.select("star_rating","number_of_reviews").describe().show()

[Stage 6:>                                                          (0 + 1) / 1]

+-------+-----------+-----------------+
|summary|star_rating|number_of_reviews|
+-------+-----------+-----------------+
|  count|          0|             1000|
|   mean|       null|              0.0|
| stddev|       null|              0.0|
|    min|       null|                0|
|    max|       null|                0|
+-------+-----------+-----------------+



                                                                                

In [21]:
df.printSchema()

root
 |-- book_url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- price_excl_tax: string (nullable = true)
 |-- price_incl_tax: string (nullable = true)
 |-- tax: string (nullable = true)
 |-- availability: string (nullable = true)
 |-- number_of_reviews: long (nullable = true)
 |-- star_rating: float (nullable = true)
 |-- price: string (nullable = true)



# Pyspark và SQL

### SELECT

In [22]:
df.select("*").show()

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------------+--------------+--------------+-----+--------------------+-----------------+-----------+------+
|            book_url|               title|product_type|price_excl_tax|price_incl_tax|  tax|        availability|number_of_reviews|star_rating| price|
+--------------------+--------------------+------------+--------------+--------------+-----+--------------------+-----------------+-----------+------+
|https://books.tos...|Scott Pilgrim's P...|       Books|        £52.29|        £52.29|£0.00|In stock (19 avai...|                0|       null|£52.29|
|https://books.tos...|Libertarianism fo...|       Books|        £51.33|        £51.33|£0.00|In stock (19 avai...|                0|       null|£51.33|
|https://books.tos...|A Light in the Attic|       Books|        £51.77|        £51.77|£0.00|In stock (22 avai...|                0|       null|£51.77|
|https://books.tos...|                Olio|       Books|        £23.88|        £23.88|£0.00|In

                                                                                

In [23]:
df.select(
    "title",
    "availability",
    "price").show()

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------+
|               title|        availability| price|
+--------------------+--------------------+------+
|Scott Pilgrim's P...|In stock (19 avai...|£52.29|
|Libertarianism fo...|In stock (19 avai...|£51.33|
|A Light in the Attic|In stock (22 avai...|£51.77|
|                Olio|In stock (19 avai...|£23.88|
|It's Only the Him...|In stock (19 avai...|£45.17|
|Our Band Could Be...|In stock (19 avai...|£57.25|
|Mesaerion: The Be...|In stock (19 avai...|£37.59|
|         Set Me Free|In stock (19 avai...|£17.46|
|Shakespeare's Son...|In stock (19 avai...|£20.66|
|Rip it Up and Sta...|In stock (19 avai...|£35.02|
|Starving Hearts (...|In stock (19 avai...|£13.99|
|     The Black Maria|In stock (19 avai...|£52.15|
|Maude (1883-1993)...|In stock (18 avai...|£18.02|
|The Dirty Little ...|In stock (19 avai...|£33.34|
|You can't bury th...|In stock (17 avai...|£33.63|
|     The Requiem Red|In stock (19 avai...|£22.65|
|         Penny Maybe|In stock 

                                                                                

In [24]:
df.select("book_url").distinct().show()

[Stage 11:>                                                         (0 + 1) / 1]

+--------------------+
|            book_url|
+--------------------+
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
|https://books.tos...|
+--------------------+
only showing top 20 rows



                                                                                

In [25]:
from pyspark.sql import functions as F

In [26]:
df.select(
    F.col("book_url"),
    F.col("title"),
).distinct().show()

[Stage 14:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|            book_url|               title|
+--------------------+--------------------+
|https://books.tos...|Mesaerion: The Be...|
|https://books.tos...|Lowriders to the ...|
|https://books.tos...|Seven Brief Lesso...|
|https://books.tos...|This One Moment (...|
|https://books.tos...|Throne of Glass (...|
|https://books.tos...|A Flight of Arrow...|
|https://books.tos...|Deliciously Ella ...|
|https://books.tos...|Logan Kade (Falle...|
|https://books.tos...|The Road to Littl...|
|https://books.tos...|Miracles from Hea...|
|https://books.tos...|Run, Spot, Run: T...|
|https://books.tos...|God Is Not Great:...|
|https://books.tos...|The Light of the ...|
|https://books.tos...|Shadow Rites (Jan...|
|https://books.tos...|Hawkeye, Vol. 1: ...|
|https://books.tos...|The Shadow Hero (...|
|https://books.tos...|Far From True (Pr...|
|https://books.tos...|The Hitchhiker's ...|
|https://books.tos...|Red: The True Sto...|
|https://books.tos...|A Heartbre

                                                                                

### WHERE

In [29]:
df.select(
    F.col("book_url"),
    F.col("title"),
    F.col("product_type")
).filter(F.col("product_type") == "Books").show()

[Stage 17:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------+
|            book_url|               title|product_type|
+--------------------+--------------------+------------+
|https://books.tos...|Scott Pilgrim's P...|       Books|
|https://books.tos...|Libertarianism fo...|       Books|
|https://books.tos...|A Light in the Attic|       Books|
|https://books.tos...|                Olio|       Books|
|https://books.tos...|It's Only the Him...|       Books|
|https://books.tos...|Our Band Could Be...|       Books|
|https://books.tos...|Mesaerion: The Be...|       Books|
|https://books.tos...|         Set Me Free|       Books|
|https://books.tos...|Shakespeare's Son...|       Books|
|https://books.tos...|Rip it Up and Sta...|       Books|
|https://books.tos...|Starving Hearts (...|       Books|
|https://books.tos...|     The Black Maria|       Books|
|https://books.tos...|Maude (1883-1993)...|       Books|
|https://books.tos...|The Dirty Little ...|       Books|
|https://books.tos...|You can't

                                                                                

###  Limit 

In [30]:
( 
    df.select(
    F.col("book_url"),
    F.col("title"),
    F.col("product_type")
).filter(F.col("product_type") == "Books")
        .limit(1)
        .show()
)

[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------+
|            book_url|               title|product_type|
+--------------------+--------------------+------------+
|https://books.tos...|Scott Pilgrim's P...|       Books|
+--------------------+--------------------+------------+



                                                                                

### GROUP BY 

In [32]:
( 
    df.select(
            F.col("book_url"),
            F.col("title"),
            F.col("price")
        )
        .groupBy("price")
        .mean()
        .show()
)

[Stage 19:>                                                         (0 + 1) / 1]

+------+
| price|
+------+
|£32.42|
|£11.64|
|£28.03|
|£40.45|
|£10.79|
|£54.23|
|£15.97|
|£21.96|
|£45.21|
|£44.14|
|£58.14|
|£31.85|
|£13.86|
|£10.27|
|£31.95|
|£42.96|
|£56.06|
|£38.21|
|£18.46|
|£47.13|
+------+
only showing top 20 rows



                                                                                

# User Define Function ( UDF )

In [33]:
def convert_case(string):
    return string.upper()

convert_case_udf = F.udf(convert_case, returnType=types.StringType())

In [34]:
( 
    df
    .withColumn('title_upcase', convert_case_udf(F.col("title")))
    .select('title_upcase','title') 
    .show()
)

[Stage 22:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|        title_upcase|               title|
+--------------------+--------------------+
|SCOTT PILGRIM'S P...|Scott Pilgrim's P...|
|LIBERTARIANISM FO...|Libertarianism fo...|
|A LIGHT IN THE ATTIC|A Light in the Attic|
|                OLIO|                Olio|
|IT'S ONLY THE HIM...|It's Only the Him...|
|OUR BAND COULD BE...|Our Band Could Be...|
|MESAERION: THE BE...|Mesaerion: The Be...|
|         SET ME FREE|         Set Me Free|
|SHAKESPEARE'S SON...|Shakespeare's Son...|
|RIP IT UP AND STA...|Rip it Up and Sta...|
|STARVING HEARTS (...|Starving Hearts (...|
|     THE BLACK MARIA|     The Black Maria|
|MAUDE (1883-1993)...|Maude (1883-1993)...|
|THE DIRTY LITTLE ...|The Dirty Little ...|
|YOU CAN'T BURY TH...|You can't bury th...|
|     THE REQUIEM RED|     The Requiem Red|
|         PENNY MAYBE|         Penny Maybe|
| BEHIND CLOSED DOORS| Behind Closed Doors|
|THE BEAR AND THE ...|The Bear and the ...|
|      SOPHIE'S WORLD|      Soph

                                                                                

# Spark SQL

In [35]:
df.createOrReplaceTempView("df")
# df.registerTempTable("df")

In [36]:
spark.sql(" SELECT * from df ").show()

[Stage 23:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------+--------------+--------------+-----+--------------------+-----------------+-----------+------+
|            book_url|               title|product_type|price_excl_tax|price_incl_tax|  tax|        availability|number_of_reviews|star_rating| price|
+--------------------+--------------------+------------+--------------+--------------+-----+--------------------+-----------------+-----------+------+
|https://books.tos...|Scott Pilgrim's P...|       Books|        £52.29|        £52.29|£0.00|In stock (19 avai...|                0|       null|£52.29|
|https://books.tos...|Libertarianism fo...|       Books|        £51.33|        £51.33|£0.00|In stock (19 avai...|                0|       null|£51.33|
|https://books.tos...|A Light in the Attic|       Books|        £51.77|        £51.77|£0.00|In stock (22 avai...|                0|       null|£51.77|
|https://books.tos...|                Olio|       Books|        £23.88|        £23.88|£0.00|In

                                                                                

In [37]:
query = """

SELECT book_url, title
FROM df 
LIMIT 10

""" 
spark.sql(query).show()

[Stage 24:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|            book_url|               title|
+--------------------+--------------------+
|https://books.tos...|Scott Pilgrim's P...|
|https://books.tos...|Libertarianism fo...|
|https://books.tos...|A Light in the Attic|
|https://books.tos...|                Olio|
|https://books.tos...|It's Only the Him...|
|https://books.tos...|Our Band Could Be...|
|https://books.tos...|Mesaerion: The Be...|
|https://books.tos...|         Set Me Free|
|https://books.tos...|Shakespeare's Son...|
|https://books.tos...|Rip it Up and Sta...|
+--------------------+--------------------+



                                                                                

# Ghi kết quả ra file parquet 

In [39]:
query = """

SELECT book_url, title
FROM df 
LIMIT 10


""" 

spark.sql(query).write.parquet("gs://book_crawl/report-000.parquet",
                               mode = "overwrite")

#Dùng  write.partitionBy(col).parquet để partition

                                                                                

In [41]:
result_df = spark.read.parquet("gs://book_crawl/report-000.parquet")

In [42]:
result_df.show()

[Stage 29:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|            book_url|               title|
+--------------------+--------------------+
|https://books.tos...|Scott Pilgrim's P...|
|https://books.tos...|Libertarianism fo...|
|https://books.tos...|A Light in the Attic|
|https://books.tos...|                Olio|
|https://books.tos...|It's Only the Him...|
|https://books.tos...|Our Band Could Be...|
|https://books.tos...|Mesaerion: The Be...|
|https://books.tos...|         Set Me Free|
|https://books.tos...|Shakespeare's Son...|
|https://books.tos...|Rip it Up and Sta...|
+--------------------+--------------------+



                                                                                

# Ví dụ thực tế.

Đếm các comment tốt cho theo sản phẩm + mỗi ngày.

Tiêu chí tốt: 

    - star_rating >= 4 

Tiêu chí lọc comment 

    - helpful_votes >= 10


In [129]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types

spark = ( 
    SparkSession.builder
            .master("yarn") 
            .appName('Aggreate Good Comment')
            .getOrCreate()
)

schema = types.StructType([
    types.StructField('marketplace', types.StringType(), True),
    types.StructField('customer_id', types.StringType(), True),
    types.StructField('review_id', types.StringType(), True),
    types.StructField('product_id', types.StringType(), True),
    types.StructField('product_parent', types.StringType(), True),
    types.StructField('product_category', types.StringType(), True),
    types.StructField('star_rating', types.IntegerType(), True),
    types.StructField('helpful_votes', types.IntegerType(), True),
    types.StructField('total_votes', types.IntegerType(), True),
    types.StructField('vine', types.LongType(), True),
    types.StructField('verified_purchase', types.LongType(), True),
    types.StructField('review_headline', types.StringType(), True),
    types.StructField('review_body', types.StringType(), True),
    types.StructField('review_date', types.StringType(), True)
])

data_storage = "gs://aws-review-data/read/amazon_us_reviews-train-00000-of-00005.parquet"
data_write = "gs://aws-review-data/write/report-count"

df = spark.read.parquet(data_storage)
df = df.withColumn('review_date', F.to_date(F.col("review_date")))

( 
    df.select(
            F.col("product_id"),
            F.col("review_date"),
            F.col("star_rating"),
            F.col("helpful_votes")
        )
        .where((F.col("star_rating") >= 4 ) & (F.col("helpful_votes") >= 10 ))
        .groupBy(["product_id","review_date"])
        .count()
        .write.parquet(data_write,mode = "overwrite")
)


# df.createOrReplaceTempView("df")
query = """

SELECT product_id,review_date,COUNT(star_rating)
FROM df 
WHERE star_rating >=4 and helpful_votes >= 10
GROUP BY product_id
""" 

spark.sql(query).write.parquet(data_write,
                        mode = "overwrite")

                                                                                

In [133]:
result_df = spark.read.parquet("gs://aws-review-data/write/report-count")

In [136]:
result_df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- review_date: date (nullable = true)
 |-- count: long (nullable = true)



In [138]:
result_df.orderBy(F.desc("count")).show()

+----------+-----------+-----+
|product_id|review_date|count|
+----------+-----------+-----+
|B00W0R8FYE| 2015-04-13|    7|
|B00KRBY8YQ| 2014-06-10|    5|
|B00JJORFVK| 2014-05-14|    4|
|B00K2WN38M| 2014-07-02|    4|
|B00NLXYCUW| 2014-11-27|    2|
|B00PUP46UU| 2015-02-19|    2|
|B00N7TIIKK| 2014-11-01|    2|
|B00NLMZ86A| 2014-09-18|    2|
|B00RNBXZQU| 2015-03-26|    2|
|B00P0DQ0EW| 2015-05-04|    2|
|B00Q6SJCGI| 2014-11-29|    2|
|B00RMFFH3Q| 2015-01-29|    2|
|B00UID20P2| 2015-06-19|    2|
|B00UFMFADA| 2015-05-14|    2|
|B00RVUQJNO| 2015-07-17|    2|
|B00XK273PK| 2015-08-18|    2|
|B00WXPCWY2| 2015-06-01|    2|
|B00OE80FIG| 2014-10-17|    2|
|B00NOFDSSO| 2015-03-16|    2|
|B011MAHVY4| 2015-08-12|    2|
+----------+-----------+-----+
only showing top 20 rows



# RDD

In [139]:
df.select( 
    F.col('review_headline'),
    F.col('review_body')
).limit(10).show()

+--------------------+--------------------+
|     review_headline|         review_body|
+--------------------+--------------------+
|★ THESE REALLY DO...|These Really Do W...|
|Favorite for wint...|I love this dress...|
|Great Socks for t...|Nice socks, great...|
|          Slick hat!|I bought this for...|
|I would do it again!|Perfect dress and...|
|          Five Stars|Excellent for my ...|
|            Love it!|Raw is the only w...|
|         Three Stars|        A bit large.|
|          Five Stars|          Great fit!|
|    Not my favorite.|Shirt a bit too l...|
+--------------------+--------------------+




[Stage 101:>                                                        (0 + 1) / 1]

                                                                                

In [146]:
rdd_review_body = df.select("product_id","review_body").rdd

In [147]:
rdd_review_body

MapPartitionsRDD[359] at javaToPython at NativeMethodAccessorImpl.java:0

In [148]:
rdd_review_body.take(1)


[Stage 104:>                                                        (0 + 1) / 1]

                                                                                

[Row(product_id='B01KL6O72Y', review_body="These Really Do Work Great, But You Do Need To Know a Few Things.  I've Been Using Mine For a Few Years Now.  First, I Paid a Few Dollars Less For Mine and The Price Has Jumped.  They're All Imported, so Try to Find Either a Cheaper One or One That's Extremely Well Made.  This One is Made Well Enough, If You're Careful.  This Thing Can Cut You, So Don't Let Some Kid Use It, &#34;Because It Looks Like Fun!!&#34;  You Need A Pineapple That's Big Enough.  I Can't Tell You How Many Times I Went to Wal-Mart or the Grocery Store and Their Pineapples were Just Too Small of a Diameter.  It HAS to Be Big Enough.  It's Better To Have Some Waste on The Inside Of The Husk.<br /><br />When I'm Finished Using The Pineapple Corer, Then I Cut Up the Husk Of The Pineapple To Get The Rest Of The Pineapple Cut Up and I Save The Core of the Pineapple To Go Into My Iced Tea Pitcher or Drink Pitcher (That's How They Do It In Hawaii)  When I Was In Hawaii, They Neve

In [162]:
all_data = rdd_review_body.collect()

                                                                                

In [153]:
rdd_review_body.getNumPartitions()

2

In [151]:
rdd_review_body.repartition(4).getNumPartitions()

4

In [154]:
rdd_review_body.getNumPartitions()

2

### Map

In [159]:
def count_len(row):
    review = row.review_body
    product_id = row.product_id
    return (product_id,len(review))
        
rdd_review_body.map(count_len).take(10)

[('B01KL6O72Y', 2911),
 ('B01ID3ZS5W', 371),
 ('B01I497BGY', 87),
 ('B01HDXFZK6', 183),
 ('B01G6MBEBY', 51),
 ('B01FWRXN0Y', 48),
 ('B01EXNH1HE', 97),
 ('B01E7OL09O', 12),
 ('B01DXHX81O', 10),
 ('B01DDULIJK', 197)]

In [145]:
def count_word(row):
    review = row.review_body
    for word in review.split(" "):
        yield (word,1)
        
rdd_review_body.flatMap(count_word).take(10)

[('These', 1),
 ('Really', 1),
 ('Do', 1),
 ('Work', 1),
 ('Great,', 1),
 ('But', 1),
 ('You', 1),
 ('Do', 1),
 ('Need', 1),
 ('To', 1)]

In [161]:
def filter_product(row):
    product_id = row.product_id
    return product_id != "B01KL6O72Y"
        
( 
    rdd_review_body
        .filter(filter_product)
        .map(count_len)
        .take(10)
)

[('B01ID3ZS5W', 371),
 ('B01I497BGY', 87),
 ('B01HDXFZK6', 183),
 ('B01G6MBEBY', 51),
 ('B01FWRXN0Y', 48),
 ('B01EXNH1HE', 97),
 ('B01E7OL09O', 12),
 ('B01DXHX81O', 10),
 ('B01DDULIJK', 197),
 ('B01BOKOL4A', 129)]

In [164]:
def sort_by_value(record):
    return record[1]

(
    rdd_review_body
        .filter(filter_product)
        .map(count_len)
        .sortBy(sort_by_value,ascending = False)
        .take(10)
)


                                                                                

[('B00WDZ81JC', 16332),
 ('B00O7AS0MY', 14273),
 ('B00VZTJPRY', 12340),
 ('B00XHRL53O', 11895),
 ('B00OQXJXY6', 11769),
 ('B00JF0KBBO', 8038),
 ('B00JAN2N2G', 7719),
 ('B00TF7GW5Q', 7458),
 ('B00NR1Y30C', 7454),
 ('B011SQ0ERI', 7239)]

### Reduce

In [172]:
def my_sum(x,y):
    return x + y

In [168]:
(
    rdd_review_body
        .map(count_len)
        .reduceByKey(my_sum)
        .sortBy(sort_by_value,ascending = False)
        .take(10)
)

                                                                                

[('B00LMI9A6Y', 181960),
 ('B00ORZIYBQ', 177020),
 ('B00K5AFQ22', 148052),
 ('B00JSJHQP6', 91681),
 ('B00N9OPF3Q', 63939),
 ('B00N9OPMMU', 59295),
 ('B00LDUSX78', 54388),
 ('B00MAVN0R2', 50601),
 ('B00MWH3RNG', 50072),
 ('B00LLIVQNU', 45985)]

In [181]:
from collections import namedtuple
ProductWordCountRow = namedtuple("ProductWordCountRow",["product_id","len_count"])

def return_to_row(record):
    return ProductWordCountRow(record[0],record[1])

df = (
    rdd_review_body
        .map(count_len)
        .reduceByKey(my_sum)
        .map(return_to_row)
        .toDF()
)

                                                                                

In [182]:
df.show()

+----------+---------+
|product_id|len_count|
+----------+---------+
|B01I497BGY|       87|
|B01G6MBEBY|       51|
|B01FWRXN0Y|       48|
|B01DXHX81O|       10|
|B01DDULIJK|      197|
|B01B3Q4Q0O|       93|
|B01ADDSL9U|       10|
|B019MDXIXG|       10|
|B019438FEG|       59|
|B0178HGNIA|       62|
|B016VIU0QI|      366|
|B016PUU3VO|      231|
|B016AQNDM4|       19|
|B01694YS8K|      135|
|B015YCHLHS|       83|
|B014WCV7JY|       21|
|B014PKNCGE|       38|
|B014MSSP66|       40|
|B014L79H8I|      461|
|B014K3PHXW|       33|
+----------+---------+
only showing top 20 rows



# World Count  bằng rdd

In [184]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

def count_word(row):
    review = row.review_body
    for word in review.split(" "):
        yield (word,1)

def my_sum(x,y):
    return x + y

spark = ( 
    SparkSession.builder
            .master("yarn") 
            .appName('Aggreate Good Comment')
            .getOrCreate()
)

schema = types.StructType([
    types.StructField('marketplace', types.StringType(), True),
    types.StructField('customer_id', types.StringType(), True),
    types.StructField('review_id', types.StringType(), True),
    types.StructField('product_id', types.StringType(), True),
    types.StructField('product_parent', types.StringType(), True),
    types.StructField('product_category', types.StringType(), True),
    types.StructField('star_rating', types.IntegerType(), True),
    types.StructField('helpful_votes', types.IntegerType(), True),
    types.StructField('total_votes', types.IntegerType(), True),
    types.StructField('vine', types.LongType(), True),
    types.StructField('verified_purchase', types.LongType(), True),
    types.StructField('review_headline', types.StringType(), True),
    types.StructField('review_body', types.StringType(), True),
    types.StructField('review_date', types.StringType(), True)
])

data_storage = "gs://aws-review-data/amazon_us_reviews-train-00000-of-00005.parquet"
data_write = "gs://aws-review-data/write/world-count"

df = spark.read.schema(schema).parquet(data_storage)
rdd_review_body = df.select("review_body").rdd
(
    rdd_review_body
        .flatMap(count_word)
        .reduceByKey(my_sum)
        .saveAsTextFile(data_write)
)


                                                                                