# Init Spark

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = ( 
    SparkSession.builder
            .appName('test').master("yarn")
            .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/19 14:27:14 INFO SparkEnv: Registering MapOutputTracker
24/03/19 14:27:14 INFO SparkEnv: Registering BlockManagerMaster
24/03/19 14:27:14 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/03/19 14:27:14 INFO SparkEnv: Registering OutputCommitCoordinator


.master("local[*]").  Nếu chạy local. Dấu * tượng trưng cho ý spark được phép dùng hết tài nguyên của máy ( CPU/RAM ) để xử lý 

.master("yarn"). Nếu chạy trên cluster YARN


Có thể thêm config vào. Một số config để connect với s3

  - .config("spark.jars", "aws-sdk-java-2.17.81.jar") 

  - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
  
  - .config("spark.hadoop.fs.s3a.access.key", "<your_access_key_id>")
  
  - .config("spark.hadoop.fs.s3a.secret.key", "<your_secret_access_key>")

In [3]:
spark

In [4]:
!pyspark --version and spark-shell --version 

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.3.2
      /_/
                        
Using Scala version 2.12.18, OpenJDK 64-Bit Server VM, 11.0.20.1
Branch dataproc-branch-3.3.2
Compiled by user  on 2024-03-05T22:27:36Z
Revision 8a05f8da2bcd58acfc0b8d97000abb2c4a6b8f59
Url https://bigdataoss-internal.googlesource.com/third_party/apache/spark
Type --help for more information.


# Read File

In [5]:
data_storage = "gs://course_crawl/2024/03/19/LeVanDuy_First.parquet"

df = spark.read.parquet(data_storage)

                                                                                

In [6]:
df.printSchema()

root
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- language: string (nullable = true)
 |-- description: string (nullable = true)
 |-- instructors: string (nullable = true)
 |-- learns: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- level: string (nullable = true)
 |-- time: string (nullable = true)



In [7]:
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+------------------+--------------------+
|                 url|                name|         language|         description|         instructors|              learns|             level|                time|
+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+------------------+--------------------+
|https://www.cours...|Google Cybersecur...|Taught in English|This is your path...|Google Career Cer...|[Understand the i...|    Beginner level|6 months at 7 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|[Learn the skills...|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|[Learn the skills...|Intermediate level|1 month at 10 hou...|
|https://w

                                                                                

# Read Spark with Schema

In [8]:
from pyspark.sql import types

In [9]:
schema = types.StructType([
    types.StructField('url', types.StringType(), True),
    types.StructField('name', types.StringType(), True),
    types.StructField('language', types.StringType(), True),
    types.StructField('description', types.StringType(), True),
    types.StructField('instructors', types.StringType(), True),
    types.StructField('learn', types.StringType(), True),
    types.StructField('level', types.StringType(), True),
    types.StructField('time', types.StringType(), True)
])

In [10]:
df = spark.read \
    .schema(schema) \
    .parquet(data_storage)

In [11]:
df.show()

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-----------------+--------------------+--------------------+-----+------------------+--------------------+
|                 url|                name|         language|         description|         instructors|learn|             level|                time|
+--------------------+--------------------+-----------------+--------------------+--------------------+-----+------------------+--------------------+
|https://www.cours...|Google Cybersecur...|Taught in English|This is your path...|Google Career Cer...| null|    Beginner level|6 months at 7 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...| null|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...| null|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai.

                                                                                

# Làm việc với cột ( columns ) 

In [12]:
df = df.withColumnRenamed("url", "url_course")

In [13]:
df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-----------------+--------------------+--------------------+-----+------------------+--------------------+
|          url_course|                name|         language|         description|         instructors|learn|             level|                time|
+--------------------+--------------------+-----------------+--------------------+--------------------+-----+------------------+--------------------+
|https://www.cours...|Google Cybersecur...|Taught in English|This is your path...|Google Career Cer...| null|    Beginner level|6 months at 7 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...| null|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...| null|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai.

                                                                                

In [14]:
df.select("language","level").describe().show()

                                                                                

+-------+-----------------+------------------+
|summary|         language|             level|
+-------+-----------------+------------------+
|  count|               34|                34|
|   mean|             null|              null|
| stddev|             null|              null|
|    min|Taught in English|    Beginner level|
|    max|Taught in English|Intermediate level|
+-------+-----------------+------------------+



In [15]:
df.printSchema()

root
 |-- url_course: string (nullable = true)
 |-- name: string (nullable = true)
 |-- language: string (nullable = true)
 |-- description: string (nullable = true)
 |-- instructors: string (nullable = true)
 |-- learn: string (nullable = true)
 |-- level: string (nullable = true)
 |-- time: string (nullable = true)



# Pyspark và SQL

### SELECT

In [16]:
df.select("*").show()

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-----------------+--------------------+--------------------+-----+------------------+--------------------+
|          url_course|                name|         language|         description|         instructors|learn|             level|                time|
+--------------------+--------------------+-----------------+--------------------+--------------------+-----+------------------+--------------------+
|https://www.cours...|Google Cybersecur...|Taught in English|This is your path...|Google Career Cer...| null|    Beginner level|6 months at 7 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...| null|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...| null|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai.

                                                                                

In [17]:
df.select("*").head()

                                                                                

Row(url_course='https://www.coursera.org/professional-certificates/google-cybersecurity', name='Google Cybersecurity Professional Certificate', language='Taught in English', description='This is your path to a career in cybersecurity. In this certificate program, you’ll learn in-demand skills that can have you job-ready in less than 6 months. No degree or experience required. ', instructors='Google Career Certificates', learn=None, level='Beginner level', time='6 months at 7 hours a week')

In [18]:
df.select(
    "name",
    "language",
    "description",
    "instructors",
    "level",
    "time").show()

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+------------------+--------------------+
|                name|         language|         description|         instructors|             level|                time|
+--------------------+-----------------+--------------------+--------------------+------------------+--------------------+
|Google Cybersecur...|Taught in English|This is your path...|Google Career Cer...|    Beginner level|6 months at 7 hou...|
|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|Intermediate level|1 month at 10 hou...|
|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|Intermediate level|1 month at 10 hou...|
|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|Intermediate level|1 month at 10 hou...|
|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|    Beginner level|1 month at 10 hou...|
|Preparing for G

                                                                                

In [19]:
df.select("instructors").distinct().show()

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+
|         instructors|
+--------------------+
|Taught by Meta Staff|
|     Leslie Reynolds|
|IBM Skills Networ...|
|Google Cloud Trai...|
|           Microsoft|
|       Morgan Willis|
|      Shadow Farrell|
|        John Rofrano|
|Google Career Cer...|
|        James Dalton|
+--------------------+



                                                                                

In [20]:
from pyspark.sql import functions as F

In [21]:
df.select(
    F.col("instructors"),
    F.col("level"),
).distinct().show()

[Stage 13:>                                                         (0 + 1) / 1]

+--------------------+------------------+
|         instructors|             level|
+--------------------+------------------+
|           Microsoft|    Beginner level|
|     Leslie Reynolds|    Beginner level|
|Taught by Meta Staff|Intermediate level|
|       Morgan Willis|Intermediate level|
|      Shadow Farrell|    Beginner level|
|       Morgan Willis|    Beginner level|
|        John Rofrano|    Beginner level|
|Taught by Meta Staff|    Beginner level|
|Google Cloud Trai...|Intermediate level|
|Google Career Cer...|    Beginner level|
|        James Dalton|    Beginner level|
|        John Rofrano|Intermediate level|
|Google Cloud Trai...|    Beginner level|
|IBM Skills Networ...|    Beginner level|
+--------------------+------------------+



                                                                                

### WHERE

In [22]:
df.select(
    F.col("name"),
    F.col("language"),
    F.col("description"),
    F.col("instructors"),
    F.col("level"),
    F.col("time")
).filter(F.col("instructors").contains("Taught by Meta Staff")).show()

[Stage 16:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+------------------+--------------------+
|                name|         language|         description|         instructors|             level|                time|
+--------------------+-----------------+--------------------+--------------------+------------------+--------------------+
|Meta AR Developer...|Taught in English|Prepare for a car...|Taught by Meta Staff|Intermediate level|3 months at 10 ho...|
|Meta Database Eng...|Taught in English|Launch your caree...|Taught by Meta Staff|    Beginner level|6 months at 6 hou...|
|Meta iOS Develope...|Taught in English|Launch your caree...|Taught by Meta Staff|    Beginner level|8 months at 7 hou...|
|Meta Android Deve...|Taught in English|Launch your caree...|Taught by Meta Staff|    Beginner level|8 months at 7 hou...|
|Meta Back-End Dev...|Taught in English|Launch your caree...|Taught by Meta Staff|    Beginner level|8 months at 6 hou...|
|Meta Front-End 

                                                                                

In [23]:
df.select(
    F.col("name"),
    F.col("language"),
    F.col("description"),
    F.col("instructors"),
    F.col("level"),
    F.col("time")
).filter(F.col("level").contains("Beginner level")).show()

[Stage 17:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|                name|         language|         description|         instructors|         level|                time|
+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|Google Cybersecur...|Taught in English|This is your path...|Google Career Cer...|Beginner level|6 months at 7 hou...|
|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|Beginner level|1 month at 10 hou...|
|Google IT Automat...|Taught in English|Learn in-demand s...|Google Career Cer...|Beginner level|6 months at 10 ho...|
|Akamai Customer C...|Taught in English|Start Your Career...|      Shadow Farrell|Beginner level|6 months at 8 hou...|
|Palo Alto Network...|Taught in English|Palo Alto Network...|        James Dalton|Beginner level|1 month at 10 hou...|
|Akamai Network En...|Taught in English|Launch Y

                                                                                

In [24]:
condition = (F.col("time").contains("3 months at 10 hours a week")) & (F.col("level").contains("Beginner level"))

df.select(
    F.col("name"),
    F.col("language"),
    F.col("description"),
    F.col("instructors"),
    F.col("level"),
    F.col("time")
).filter(condition).show()

[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|                name|         language|         description|         instructors|         level|                time|
+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|IBM Front-End Dev...|Taught in English|Prepare for a car...|IBM Skills Networ...|Beginner level|3 months at 10 ho...|
|IBM IT Support  P...|Taught in English|Prepare for a car...|IBM Skills Networ...|Beginner level|3 months at 10 ho...|
| IBM and ISC2 Cyb...|Taught in English|Launch your caree...|IBM Skills Networ...|Beginner level|3 months at 10 ho...|
|IBM DevOps and So...|Taught in English|Launch your DevOp...|        John Rofrano|Beginner level|3 months at 10 ho...|
+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+



                                                                                

### ORDER BY

In [25]:
df.select(
    F.col("name"),
    F.col("language"),
    F.col("description"),
    F.col("instructors"),
    F.col("level"),
    F.col("time")
).filter(F.col("level").contains("Beginner level")).orderBy("name").show()

[Stage 19:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|                name|         language|         description|         instructors|         level|                time|
+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
| IBM and ISC2 Cyb...|Taught in English|Launch your caree...|IBM Skills Networ...|Beginner level|3 months at 10 ho...|
|AWS Cloud Technol...|Taught in English|Start a career as...|       Morgan Willis|Beginner level|4 months at 10 ho...|
|Akamai Customer C...|Taught in English|Start Your Career...|      Shadow Farrell|Beginner level|6 months at 8 hou...|
|Akamai Network En...|Taught in English|Launch Your Caree...|      Shadow Farrell|Beginner level|5 months at 8 hou...|
|Google Cybersecur...|Taught in English|This is your path...|Google Career Cer...|Beginner level|6 months at 7 hou...|
|Google IT Automat...|Taught in English|Learn in

                                                                                

In [26]:
df.select(
    F.col("name"),
    F.col("language"),
    F.col("description"),
    F.col("instructors"),
    F.col("level"),
    F.col("time")
).filter(F.col("level").contains("Beginner level")).sort("time").show()

[Stage 20:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|                name|         language|         description|         instructors|         level|                time|
+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|Beginner level|1 month at 10 hou...|
|Palo Alto Network...|Taught in English|Palo Alto Network...|        James Dalton|Beginner level|1 month at 10 hou...|
|IBM Back-End Deve...|Taught in English|Prepare for a car...|        John Rofrano|Beginner level|12 months at 10 h...|
| IBM and ISC2 Cyb...|Taught in English|Launch your caree...|IBM Skills Networ...|Beginner level|3 months at 10 ho...|
|IBM DevOps and So...|Taught in English|Launch your DevOp...|        John Rofrano|Beginner level|3 months at 10 ho...|
|IBM Front-End Dev...|Taught in English|Prepare 

                                                                                

In [27]:
df.select(
    F.col("name"),
    F.col("language"),
    F.col("description"),
    F.col("instructors"),
    F.col("level"),
    F.col("time")
).filter(F.col("level").contains("Beginner level")).sort(F.col("time").desc()).show()

[Stage 21:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|                name|         language|         description|         instructors|         level|                time|
+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|Meta iOS Develope...|Taught in English|Launch your caree...|Taught by Meta Staff|Beginner level|8 months at 7 hou...|
|Meta Android Deve...|Taught in English|Launch your caree...|Taught by Meta Staff|Beginner level|8 months at 7 hou...|
|Meta Back-End Dev...|Taught in English|Launch your caree...|Taught by Meta Staff|Beginner level|8 months at 6 hou...|
|Meta Front-End De...|Taught in English|Launch your caree...|Taught by Meta Staff|Beginner level|7 months at 6 hou...|
|SAP Technology Co...|Taught in English|Unlock your poten...|     Leslie Reynolds|Beginner level|7 months at 3 hou...|
|Akamai Customer C...|Taught in English|Start Yo

                                                                                

In [28]:
(
    df.select(
        F.col("name"),
        F.col("language"),
        F.col("description"),
        F.col("instructors"),
        F.col("level"),
        F.col("time")
    )
    .filter(F.col("level").contains("Beginner level"))
    .sort(F.desc("time"))
    .show()
)

[Stage 22:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|                name|         language|         description|         instructors|         level|                time|
+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|Meta iOS Develope...|Taught in English|Launch your caree...|Taught by Meta Staff|Beginner level|8 months at 7 hou...|
|Meta Android Deve...|Taught in English|Launch your caree...|Taught by Meta Staff|Beginner level|8 months at 7 hou...|
|Meta Back-End Dev...|Taught in English|Launch your caree...|Taught by Meta Staff|Beginner level|8 months at 6 hou...|
|Meta Front-End De...|Taught in English|Launch your caree...|Taught by Meta Staff|Beginner level|7 months at 6 hou...|
|SAP Technology Co...|Taught in English|Unlock your poten...|     Leslie Reynolds|Beginner level|7 months at 3 hou...|
|Akamai Customer C...|Taught in English|Start Yo

                                                                                

###  Limit 

In [29]:
(
    df.select(
        F.col("name"),
        F.col("language"),
        F.col("description"),
        F.col("instructors"),
        F.col("level"),
        F.col("time")
    )
    .filter(F.col("level").contains("Beginner level"))
    .sort(F.col("time"))
    .limit(5)
    .show()
)

[Stage 23:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|                name|         language|         description|         instructors|         level|                time|
+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+
|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...|Beginner level|1 month at 10 hou...|
|Palo Alto Network...|Taught in English|Palo Alto Network...|        James Dalton|Beginner level|1 month at 10 hou...|
|IBM Back-End Deve...|Taught in English|Prepare for a car...|        John Rofrano|Beginner level|12 months at 10 h...|
|IBM IT Support  P...|Taught in English|Prepare for a car...|IBM Skills Networ...|Beginner level|3 months at 10 ho...|
|IBM Front-End Dev...|Taught in English|Prepare for a car...|IBM Skills Networ...|Beginner level|3 months at 10 ho...|
+--------------------+-----------------+--------

                                                                                

### GROUP BY 

In [30]:
 (
    df.select(
        F.col("instructors"),
        F.col("time")
    )
     .groupBy("instructors")
     .count()
     .show()
 )

[Stage 24:>                                                         (0 + 1) / 1]

+--------------------+-----+
|         instructors|count|
+--------------------+-----+
|Taught by Meta Staff|    6|
|     Leslie Reynolds|    1|
|IBM Skills Networ...|    6|
|Google Cloud Trai...|    7|
|           Microsoft|    1|
|       Morgan Willis|    2|
|      Shadow Farrell|    2|
|        John Rofrano|    4|
|Google Career Cer...|    4|
|        James Dalton|    1|
+--------------------+-----+



                                                                                

# User Define Function ( UDF )

In [31]:
def convert_case(string):
    return string.upper()

convert_case_udf = F.udf(convert_case, returnType=types.StringType())

In [32]:
( 
    df
    .withColumn('instructors_uppercase', convert_case_udf(F.col("instructors")))
    .select('instructors_uppercase','instructors') 
    .show()
)

[Stage 27:>                                                         (0 + 1) / 1]

+---------------------+--------------------+
|instructors_uppercase|         instructors|
+---------------------+--------------------+
| GOOGLE CAREER CER...|Google Career Cer...|
| GOOGLE CLOUD TRAI...|Google Cloud Trai...|
| GOOGLE CLOUD TRAI...|Google Cloud Trai...|
| GOOGLE CLOUD TRAI...|Google Cloud Trai...|
| GOOGLE CLOUD TRAI...|Google Cloud Trai...|
| GOOGLE CLOUD TRAI...|Google Cloud Trai...|
| GOOGLE CLOUD TRAI...|Google Cloud Trai...|
| GOOGLE CAREER CER...|Google Career Cer...|
| GOOGLE CLOUD TRAI...|Google Cloud Trai...|
|        MORGAN WILLIS|       Morgan Willis|
|         JOHN ROFRANO|        John Rofrano|
| TAUGHT BY META STAFF|Taught by Meta Staff|
|       SHADOW FARRELL|      Shadow Farrell|
|         JAMES DALTON|        James Dalton|
|       SHADOW FARRELL|      Shadow Farrell|
| IBM SKILLS NETWOR...|IBM Skills Networ...|
|         JOHN ROFRANO|        John Rofrano|
| IBM SKILLS NETWOR...|IBM Skills Networ...|
| IBM SKILLS NETWOR...|IBM Skills Networ...|
| TAUGHT B

                                                                                

# Spark SQL

In [33]:
df.createOrReplaceTempView("df")
# df.registerTempTable("df")

In [34]:
spark.sql(" SELECT * from df ").show()

[Stage 28:>                                                         (0 + 1) / 1]

+--------------------+--------------------+-----------------+--------------------+--------------------+-----+------------------+--------------------+
|          url_course|                name|         language|         description|         instructors|learn|             level|                time|
+--------------------+--------------------+-----------------+--------------------+--------------------+-----+------------------+--------------------+
|https://www.cours...|Google Cybersecur...|Taught in English|This is your path...|Google Career Cer...| null|    Beginner level|6 months at 7 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...| null|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai...| null|Intermediate level|1 month at 10 hou...|
|https://www.cours...|Preparing for Goo...|Taught in English|Advance your care...|Google Cloud Trai.

                                                                                

In [35]:
query = """
SELECT name, language, instructors, description, time
FROM df WHERE level == 'Beginner level'

""" 
spark.sql(query).show()


[Stage 29:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------------+
|                name|         language|         instructors|         description|                time|
+--------------------+-----------------+--------------------+--------------------+--------------------+
|Google Cybersecur...|Taught in English|Google Career Cer...|This is your path...|6 months at 7 hou...|
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Advance your care...|1 month at 10 hou...|
|Google IT Automat...|Taught in English|Google Career Cer...|Learn in-demand s...|6 months at 10 ho...|
|Akamai Customer C...|Taught in English|      Shadow Farrell|Start Your Career...|6 months at 8 hou...|
|Palo Alto Network...|Taught in English|        James Dalton|Palo Alto Network...|1 month at 10 hou...|
|Akamai Network En...|Taught in English|      Shadow Farrell|Launch Your Caree...|5 months at 8 hou...|
|IBM Full-Stack Ja...|Taught in English|IBM Skills Networ...|Lau

                                                                                

# Ghi kết quả ra file parquet 

In [38]:
query = """

SELECT name, language, instructors, description, time
FROM df WHERE level == 'Beginner level'

""" 

spark.sql(query).write.parquet("gs://course_crawl/2024/03/19/LeVanDuy_Last.parquet",
                               mode = "overwrite")

#Dùng  write.partitionBy(col).parquet để partition

                                                                                

In [39]:
result_df = spark.read.parquet("gs://course_crawl/2024/03/19/LeVanDuy_Last.parquet")

                                                                                

In [40]:
result_df.show()

[Stage 33:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------------+
|                name|         language|         instructors|         description|                time|
+--------------------+-----------------+--------------------+--------------------+--------------------+
|Google Cybersecur...|Taught in English|Google Career Cer...|This is your path...|6 months at 7 hou...|
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Advance your care...|1 month at 10 hou...|
|Google IT Automat...|Taught in English|Google Career Cer...|Learn in-demand s...|6 months at 10 ho...|
|Akamai Customer C...|Taught in English|      Shadow Farrell|Start Your Career...|6 months at 8 hou...|
|Palo Alto Network...|Taught in English|        James Dalton|Palo Alto Network...|1 month at 10 hou...|
|Akamai Network En...|Taught in English|      Shadow Farrell|Launch Your Caree...|5 months at 8 hou...|
|IBM Full-Stack Ja...|Taught in English|IBM Skills Networ...|Lau

                                                                                

# Ví dụ thực tế.

In [63]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types

spark = ( 
    SparkSession.builder
            .master("yarn") 
            .appName('Google Courses')
            .getOrCreate()
)

schema = types.StructType([
    types.StructField('url', types.StringType(), True),
    types.StructField('name', types.StringType(), True),
    types.StructField('language', types.StringType(), True),
    types.StructField('description', types.StringType(), True),
    types.StructField('instructors', types.StringType(), True),
    types.StructField('learn', types.StringType(), True),
    types.StructField('level', types.StringType(), True),
    types.StructField('time', types.StringType(), True)
])

data_storage = "gs://course_crawl/2024/03/19/LeVanDuy_First.parquet"
data_write = "gs://course_crawl/2024/03/19/google_courses"

df = spark.read.parquet(data_storage)

df.createOrReplaceTempView("df")
query = """

SELECT name, language, instructors, description, time
FROM df WHERE instructors == 'Google Cloud Training'

""" 

spark.sql(query).write.parquet(data_write,
                        mode = "overwrite")

24/03/19 14:40:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [42]:
result_df = spark.read.parquet("gs://course_crawl/2024/03/19/google_courses")

                                                                                

In [43]:
result_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- language: string (nullable = true)
 |-- instructors: string (nullable = true)
 |-- description: string (nullable = true)
 |-- time: string (nullable = true)



In [44]:
result_df.show()

[Stage 37:>                                                         (0 + 1) / 1]

+--------------------+-----------------+--------------------+--------------------+--------------------+
|                name|         language|         instructors|         description|                time|
+--------------------+-----------------+--------------------+--------------------+--------------------+
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Advance your care...|1 month at 10 hou...|
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Advance your care...|1 month at 10 hou...|
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Advance your care...|1 month at 10 hou...|
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Advance your care...|1 month at 10 hou...|
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Advance your care...|3 months at 10 ho...|
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Start your career...|1 month at 10 hou...|
|Preparing for Goo...|Taught in English|Google Cloud Trai...|Dev

                                                                                

# RDD

In [47]:
df.select( 
    F.col('name'),
    F.col('instructors')
).limit(10).show()

[Stage 39:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|                name|         instructors|
+--------------------+--------------------+
|Google Cybersecur...|Google Career Cer...|
|Preparing for Goo...|Google Cloud Trai...|
|Preparing for Goo...|Google Cloud Trai...|
|Preparing for Goo...|Google Cloud Trai...|
|Preparing for Goo...|Google Cloud Trai...|
|Preparing for Goo...|Google Cloud Trai...|
|Preparing for Goo...|Google Cloud Trai...|
|Google IT Automat...|Google Career Cer...|
|Preparing for Goo...|Google Cloud Trai...|
|AWS Cloud Solutio...|       Morgan Willis|
+--------------------+--------------------+



                                                                                

In [48]:
rdd_course_body = df.select("name","instructors").rdd

In [49]:
rdd_course_body

MapPartitionsRDD[149] at javaToPython at NativeMethodAccessorImpl.java:0

In [50]:
rdd_course_body.take(1)

                                                                                

[Row(name='Google Cybersecurity Professional Certificate', instructors='Google Career Certificates')]

In [51]:
all_data = rdd_course_body.collect()

                                                                                

In [52]:
rdd_course_body.getNumPartitions()

1

In [53]:
rdd_course_body.repartition(4).getNumPartitions()

4

In [54]:
rdd_course_body.getNumPartitions()

1

### Map

In [55]:
def count_len(row):
    name = row.name
    instructors = row.instructors
    return (name, instructors)
        
rdd_course_body.map(count_len).take(10)

                                                                                

[('Google Cybersecurity Professional Certificate',
  'Google Career Certificates'),
 ('Preparing for Google Cloud Certification: Cloud Architect Professional Certificate',
  'Google Cloud Training'),
 ('Preparing for Google Cloud Certification: Cloud Security Engineer Professional Certificate',
  'Google Cloud Training'),
 ('Preparing for Google Cloud Certification: Cloud Network Engineer Professional Certificate',
  'Google Cloud Training'),
 ('Preparing for Google Cloud Certification: Cloud DevOps Engineer Professional Certificate',
  'Google Cloud Training'),
 ('Preparing for Google Cloud Certification: Machine Learning Engineer Professional Certificate',
  'Google Cloud Training'),
 ('Preparing for Google Cloud Certification: Cloud Engineer Professional Certificate',
  'Google Cloud Training'),
 ('Google IT Automation with Python Professional Certificate',
  'Google Career Certificates'),
 ('Preparing for Google Cloud Certification: Cloud Developer Professional Certificate',
  'Goo

In [56]:
def count_word(row):
    name = row.name
    for word in name.split(" "):
        yield (word,1)
        
rdd_course_body.flatMap(count_word).take(10)

                                                                                

[('Google', 1),
 ('Cybersecurity', 1),
 ('Professional', 1),
 ('Certificate', 1),
 ('Preparing', 1),
 ('for', 1),
 ('Google', 1),
 ('Cloud', 1),
 ('Certification:', 1),
 ('Cloud', 1)]

In [57]:
def filter_product(row):
    instructors = row.instructors
    return instructors != "Google Cloud Training"
        
( 
    rdd_course_body
        .filter(filter_product)
        .map(count_len)
        .take(10)
)

                                                                                

[('Google Cybersecurity Professional Certificate',
  'Google Career Certificates'),
 ('Google IT Automation with Python Professional Certificate',
  'Google Career Certificates'),
 ('AWS Cloud Solutions Architect  Professional Certificate', 'Morgan Willis'),
 ('IBM Applied DevOps Engineering Professional Certificate', 'John Rofrano'),
 ('Meta AR Developer  Professional Certificate', 'Taught by Meta Staff'),
 ('Akamai Customer Consulting and Support Professional Certificate',
  'Shadow Farrell'),
 ('Palo Alto Networks Cybersecurity Professional Certificate', 'James Dalton'),
 ('Akamai Network Engineering Professional Certificate', 'Shadow Farrell'),
 ('IBM Full-Stack JavaScript Developer Professional Certificate',
  'IBM Skills Network Team'),
 ('IBM Back-End Development Professional Certificate', 'John Rofrano')]

In [58]:
def sort_by_value(record):
    return record[1]

(
    rdd_course_body
        .filter(filter_product)
        .map(count_len)
        .sortBy(sort_by_value,ascending = False)
        .take(10)
)


                                                                                

[('Meta AR Developer  Professional Certificate', 'Taught by Meta Staff'),
 ('Meta Database Engineer Professional Certificate', 'Taught by Meta Staff'),
 ('Meta iOS Developer Professional Certificate', 'Taught by Meta Staff'),
 ('Meta Android Developer Professional Certificate', 'Taught by Meta Staff'),
 ('Meta Back-End Developer Professional Certificate', 'Taught by Meta Staff'),
 ('Meta Front-End Developer Professional Certificate', 'Taught by Meta Staff'),
 ('Akamai Customer Consulting and Support Professional Certificate',
  'Shadow Farrell'),
 ('Akamai Network Engineering Professional Certificate', 'Shadow Farrell'),
 ('AWS Cloud Solutions Architect  Professional Certificate', 'Morgan Willis'),
 ('AWS Cloud Technology Consultant Professional Certificate', 'Morgan Willis')]

### Reduce

In [59]:
def my_sum(x,y):
    return x + y

In [60]:
(
    rdd_course_body
        .map(count_len)
        .reduceByKey(my_sum)
        .sortBy(sort_by_value,ascending = False)
        .take(10)
)

                                                                                

[('Meta AR Developer  Professional Certificate', 'Taught by Meta Staff'),
 ('Meta Database Engineer Professional Certificate', 'Taught by Meta Staff'),
 ('Meta iOS Developer Professional Certificate', 'Taught by Meta Staff'),
 ('Meta Android Developer Professional Certificate', 'Taught by Meta Staff'),
 ('Meta Back-End Developer Professional Certificate', 'Taught by Meta Staff'),
 ('Meta Front-End Developer Professional Certificate', 'Taught by Meta Staff'),
 ('Akamai Customer Consulting and Support Professional Certificate',
  'Shadow Farrell'),
 ('Akamai Network Engineering Professional Certificate', 'Shadow Farrell'),
 ('AWS Cloud Solutions Architect  Professional Certificate', 'Morgan Willis'),
 ('AWS Cloud Technology Consultant Professional Certificate', 'Morgan Willis')]

# World Count  bằng rdd

In [61]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

def count_word(row):
    name_count = row.name
    for word in name_count.split(" "):
        yield (word,1)

def my_sum(x,y):
    return x + y

spark = ( 
    SparkSession.builder
            .master("yarn") 
            .appName('Word Count')
            .getOrCreate()
)

schema = types.StructType([
    types.StructField('url', types.StringType(), True),
    types.StructField('name', types.StringType(), True),
    types.StructField('language', types.StringType(), True),
    types.StructField('description', types.StringType(), True),
    types.StructField('instructors', types.StringType(), True),
    types.StructField('learn', types.StringType(), True),
    types.StructField('level', types.StringType(), True),
    types.StructField('time', types.StringType(), True)
])

data_storage = "gs://course_crawl/2024/03/19/LeVanDuy_First.parquet"
data_write = "gs://course_crawl/2024/03/19/word_count"

df = spark.read.schema(schema).parquet(data_storage)
rdd_course_body = df.select("name").rdd
(
    rdd_course_body
        .flatMap(count_word)
        .reduceByKey(my_sum)
        .saveAsTextFile(data_write)
)


24/03/19 14:33:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [62]:
from google.cloud import storage

def download_blob(bucket_name, source_blob_name):
    """Downloads a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    print(blob.download_as_text())

bucket_name = "course_crawl"
source_blob_name = "2024/03/19/word_count/part-00000"
download_blob(bucket_name, source_blob_name)


('Google', 11)
('Cybersecurity', 5)
('Professional', 34)
('Certificate', 34)
('Preparing', 7)
('for', 7)
('Cloud', 15)
('Certification:', 7)
('Architect', 2)
('Security', 1)
('Engineer', 6)
('Network', 2)
('DevOps', 3)
('Machine', 1)
('Learning', 1)
('IT', 4)
('Automation', 1)
('with', 1)
('Python', 1)
('Developer', 9)
('AWS', 2)
('Solutions', 1)
('', 5)
('IBM', 10)
('Applied', 1)
('Engineering', 3)
('Meta', 6)
('AR', 1)
('Akamai', 2)
('Customer', 1)
('Consulting', 1)
('and', 3)
('Support', 3)
('Palo', 1)
('Alto', 1)
('Networks', 1)
('Full-Stack', 1)
('JavaScript', 1)
('Back-End', 2)
('Development', 1)
('Front-End', 2)
('Database', 1)
('Project', 1)
('Manager', 1)
('ISC2', 1)
('Specialist', 1)
('iOS', 1)
('Android', 1)
('Technology', 2)
('Consultant', 2)
('SAP', 1)
('Software', 2)
('Microsoft', 1)
('Analyst', 2)
('Full', 1)
('Stack', 1)
('UX', 1)
('Design', 1)

