In [None]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

In [None]:
import os
import findspark
# Point the environment at the installed JDK and the unpacked Spark
# distribution before findspark is initialised.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.4.1-bin-hadoop3",
)

# Must run after the variables above are set and before any pyspark import.
findspark.init()

In [None]:
# Mount Google Drive at /content/gdrive so the dataset stored there can be
# read through an ordinary filesystem path.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
from pyspark import SparkConf
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Spark configuration: local master, application name 'lab53'.
config = (
    SparkConf()
    .setMaster('local')
    .setAppName('lab53')
)

# Reuse an existing session if one is already running in this kernel.
spark = SparkSession.builder.config(conf=config).getOrCreate()
sc = spark.sparkContext

# Survey CSV stored on the mounted Google Drive.
DATASET_PATH = '/content/gdrive/MyDrive/survey.csv'

In [None]:
# Read the survey CSV with a header row and let Spark infer column types.
# NOTE(review): samplingRatio is very small; confirm the inferred schema
# printed below is the one the rest of the lab expects.
survey_df = (
    spark.read
    .format('csv')
    .options(header="true", inferSchema="true", samplingRatio="0.0001")
    .load(DATASET_PATH)
)

# Inspect the resulting schema and the first few rows.
survey_df.printSchema()
survey_df.show(5)

Bạn sẽ cần tạo một hàm để đồng nhất giá trị cho trường `Gender`; hàm này chỉ trả về một trong ba giá trị: `Female`, `Male` hoặc `Unknown`.

In [None]:
import re
def parse_gender(gender):
    """Normalize a free-text gender answer to "Female", "Male" or "Unknown".

    Matching is case-insensitive. The female patterns are tried first so
    that answers such as "female" or "woman", which also contain "ma", are
    not classified as male.

    Args:
        gender: Raw answer string, or None when the value is missing.

    Returns:
        "Female" or "Male" when the corresponding pattern matches, otherwise
        "Unknown" (including for a missing value).
    """
    # A null column value reaches a Python UDF as None; calling .lower() on
    # it would raise AttributeError and abort the whole Spark job.
    if gender is None:
        return "Unknown"

    female_pattern = r"^f$|f.m|w.m"
    male_pattern = r"^m$|ma|m.l"

    # Strip padding so single-letter answers such as "M " still satisfy the
    # anchored ^m$ / ^f$ alternatives.
    normalized = gender.strip().lower()

    if re.search(female_pattern, normalized):
        return "Female"
    if re.search(male_pattern, normalized):
        return "Male"
    return "Unknown"

Sử dụng **Object Expression** để áp dụng UDF cho dữ liệu.

In [None]:
# Wrap the Python function as a UDF object usable in column expressions.
parse_gender_udf = udf(parse_gender, returnType=StringType())

# Show every catalog function whose name contains "parse_gender".
print("Catalog Entry:")
for catalog_fn in spark.catalog.listFunctions():
    if "parse_gender" in catalog_fn.name:
        print(catalog_fn)

# Object expression: call the UDF object directly with the column name.
survey_df2 = survey_df.withColumn("Gender", parse_gender_udf("Gender"))
survey_df2.show(10)

Sử dụng **String Expression** để áp dụng UDF cho dữ liệu.

In [None]:
# Register the function under a SQL-visible name so it can be referenced
# from string expressions.
spark.udf.register("parse_gender_udf", parse_gender, StringType())

# Show every catalog function whose name contains "parse_gender".
print("Catalog Entry:")
for catalog_fn in spark.catalog.listFunctions():
    if "parse_gender" in catalog_fn.name:
        print(catalog_fn)

# String expression: the UDF is invoked by name inside expr().
survey_df3 = survey_df.withColumn("Gender", expr("parse_gender_udf(Gender)"))
survey_df3.show(10)