In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, when, sum, trim
from pyspark.sql.types import StringType
from data_class.Jesmine_fetchWordData import getWordData

# Create (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("DataProcessing")
    .getOrCreate()
)

24/12/21 18:46:31 WARN Utils: Your hostname, MSI. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/12/21 18:46:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/21 18:46:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/21 18:46:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/21 18:46:32 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Read the cleaned word list; the first line of the file is the header row
# and column types are inferred from the data.
raw_word_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("cleaned_word.csv")
)

In [3]:
# Preview the first rows, then report how many words were loaded.
raw_word_df.show()
total_rows = raw_word_df.count()
print(f"Total number of rows: {total_rows}")

+--------------+
|          Kata|
+--------------+
|             a|
|            aa|
|           aan|
|         aaron|
|     aaronwooi|
|            ab|
|          abad|
|       abcclio|
|           abd|
|         abdul|
|      abdullah|
|      abkhazia|
|           abm|
|        abodes|
|         about|
|      aboutcom|
|            ac|
|      academic|
|       academy|
|acanthocephala|
+--------------+
only showing top 20 rows

Total number of rows: 7924


# Fetch Meaning

In [4]:
# Wrap the project helper as a Spark UDF that yields a string per word.
fetch_meaning_udf = F.udf(getWordData.fetch_meaning, returnType=StringType())

# Add the "Maksud" column by applying the UDF to every "Kata" value.
# The captured output shows NULL for words where the helper returned nothing.
meaning_df = raw_word_df.withColumn(
    "Maksud",
    fetch_meaning_udf(F.col("Kata")),
)
meaning_df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+--------------+--------------------+
|          Kata|              Maksud|
+--------------+--------------------+
|             a|(sistem SI) atto-...|
|            aa|tidak.\nAa oku el...|
|           aan|            berhenti|
|         aaron|                NULL|
|     aaronwooi|                NULL|
|            ab|                NULL|
|          abad|Tempoh masa selam...|
|       abcclio|                NULL|
|           abd|                NULL|
|         abdul|                NULL|
|      abdullah|                NULL|
|      abkhazia|                NULL|
|           abm|                NULL|
|        abodes|                NULL|
|         about|                NULL|
|      aboutcom|                NULL|
|            ac|                NULL|
|      academic|                NULL|
|       academy|                NULL|
|acanthocephala|                NULL|
+--------------+--------------------+
only showing top 20 rows



                                                                                

In [5]:
# Keep only the words that have a usable meaning: "Maksud" must be non-null
# and must not be empty once surrounding whitespace is stripped.
has_meaning = col("Maksud").isNotNull() & (trim(col("Maksud")) != "")
filtered_df = meaning_df.where(has_meaning)
filtered_df.show()

[Stage 7:>                                                          (0 + 1) / 1]

+----------+--------------------+
|      Kata|              Maksud|
+----------+--------------------+
|         a|(sistem SI) atto-...|
|        aa|tidak.\nAa oku el...|
|       aan|            berhenti|
|      abad|Tempoh masa selam...|
|     acara|susunan perkara u...|
|    action|perbuatan, tindak...|
|activities|jamak kepada acti...|
|     acuan|bekas yang mempun...|
|        ad|di.\nAd saaw da r...|
|       ada|boleh didapati, w...|
|      adab|tingkah laku sert...|
|    adakah|Suatu kata soal y...|
|    adalah|sesungguhnya (dig...|
|  adaptasi|Penyesuaian denga...|
|      adat|peraturan atau tr...|
|       adi|(Sanskrit) yang p...|
|      adil|yang atau dengan ...|
|  adjacent|bersebelahan, ber...|
|  adjektif|         Kata sifat.|
|        ae|(Kelantan-Pattani...|
+----------+--------------------+
only showing top 20 rows



                                                                                

# Fetch Translation

In [6]:
# Wrap the project helper as a Spark UDF that yields a string per word.
fetch_translation_udf = udf(getWordData.fetch_translation, StringType())

# Build the "Terjemahan" column: the UDF result when "Kata" is present,
# otherwise a null.
terjemahan_expr = (
    when(col("Kata").isNotNull(), fetch_translation_udf(col("Kata")))
    .otherwise(None)
)
translation_df = filtered_df.withColumn("Terjemahan", terjemahan_expr)
translation_df.show()

[Stage 8:>                                                          (0 + 1) / 1]

+----------+--------------------+----------+
|      Kata|              Maksud|Terjemahan|
+----------+--------------------+----------+
|         a|(sistem SI) atto-...|         a|
|        aa|tidak.\nAa oku el...|       aaa|
|       aan|            berhenti|       aan|
|      abad|Tempoh masa selam...|   century|
|     acara|susunan perkara u...|     event|
|    action|perbuatan, tindak...|    action|
|activities|jamak kepada acti...|activities|
|     acuan|bekas yang mempun...|      cast|
|        ad|di.\nAd saaw da r...|        ad|
|       ada|boleh didapati, w...| available|
|      adab|tingkah laku sert...|   reading|
|    adakah|Suatu kata soal y...|        do|
|    adalah|sesungguhnya (dig...|        am|
|  adaptasi|Penyesuaian denga...|adaptation|
|      adat|peraturan atau tr...|    custom|
|       adi|(Sanskrit) yang p...|       adi|
|      adil|yang atau dengan ...|      fair|
|  adjacent|bersebelahan, ber...|  adjacent|
|  adjektif|         Kata sifat.| adjective|
|        a

                                                                                

# Fetch Part of Speech

In [7]:
# Wrap the project helper as a Spark UDF that yields a string per word.
part_of_speech_udf = F.udf(getWordData.fetch_part_of_speech, returnType=StringType())

# Add the "Golongan_Kata" (part of speech) column derived from each "Kata".
part_of_speech_df = translation_df.withColumn(
    "Golongan_Kata",
    part_of_speech_udf(F.col("Kata")),
)
part_of_speech_df.show()

[Stage 9:>                                                          (0 + 1) / 1]

+----------+--------------------+----------+--------------+
|      Kata|              Maksud|Terjemahan| Golongan_Kata|
+----------+--------------------+----------+--------------+
|         a|(sistem SI) atto-...|         a|  Kata Nama Am|
|        aa|tidak.\nAa oku el...|       aaa|Kata Singkatan|
|       aan|            berhenti|       aan|  Kata Nama Am|
|      abad|Tempoh masa selam...|   century|  Kata Nama Am|
|     acara|susunan perkara u...|     event|  Kata Nama Am|
|    action|perbuatan, tindak...|    action|  Kata Nama Am|
|activities|jamak kepada acti...|activities|  Kata Nama Am|
|     acuan|bekas yang mempun...|      cast|  Kata Nama Am|
|        ad|di.\nAd saaw da r...|        ad|  Kata Nama Am|
|       ada|boleh didapati, w...| available|    Kata Kerja|
|      adab|tingkah laku sert...|   reading|  Kata Nama Am|
|    adakah|Suatu kata soal y...|        do|    Kata Tanya|
|    adalah|sesungguhnya (dig...|        am|   Kata Hubung|
|  adaptasi|Penyesuaian denga...|adaptat

                                                                                

# Adjust Kata Singkatan Based on Singkatan Data in Redis - Gan Khai Li

In [8]:
import redis

# Connect to the local Redis instance. decode_responses=True makes redis-py
# return keys as str rather than bytes, so they can be compared with "Kata".
r = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)

# Treat every key in the selected Redis database as a known abbreviation.
# scan_iter() walks the keyspace incrementally with SCAN instead of issuing the
# single blocking KEYS command; building a dict also drops any key that SCAN
# happens to return more than once.
# NOTE(review): assumes this database holds only abbreviation keys - confirm.
abbreviations = {key: True for key in r.scan_iter()}

# Broadcast variable kept so the name stays available to other cells.
# Note that the isin() condition below reads .value on the driver, so the keys
# are embedded in the column expression itself rather than looked up on workers.
broadcast_abbreviations = spark.sparkContext.broadcast(abbreviations)

# Force "Golongan_Kata" to "Kata Singkatan" for every word that is a known
# abbreviation; all other rows keep the value they already had.
part_of_speech_df = part_of_speech_df.withColumn(
    "Golongan_Kata",
    when(col("Kata").isin(*broadcast_abbreviations.value.keys()), "Kata Singkatan")
    .otherwise(col("Golongan_Kata"))
)
part_of_speech_df.show()

[Stage 10:>                                                         (0 + 1) / 1]

+----------+--------------------+----------+--------------+
|      Kata|              Maksud|Terjemahan| Golongan_Kata|
+----------+--------------------+----------+--------------+
|         a|(sistem SI) atto-...|         a|  Kata Nama Am|
|        aa|tidak.\nAa oku el...|       aaa|Kata Singkatan|
|       aan|            berhenti|       aan|  Kata Nama Am|
|      abad|Tempoh masa selam...|   century|  Kata Nama Am|
|     acara|susunan perkara u...|     event|  Kata Nama Am|
|    action|perbuatan, tindak...|    action|  Kata Nama Am|
|activities|jamak kepada acti...|activities|  Kata Nama Am|
|     acuan|bekas yang mempun...|      cast|  Kata Nama Am|
|        ad|di.\nAd saaw da r...|        ad|  Kata Nama Am|
|       ada|boleh didapati, w...| available|    Kata Kerja|
|      adab|tingkah laku sert...|   reading|  Kata Nama Am|
|    adakah|Suatu kata soal y...|        do|    Kata Tanya|
|    adalah|sesungguhnya (dig...|        am|   Kata Hubung|
|  adaptasi|Penyesuaian denga...|adaptat

                                                                                

In [9]:
# Print the column names and types of the assembled lexicon before it is written out.
part_of_speech_df.printSchema()

root
 |-- Kata: string (nullable = true)
 |-- Maksud: string (nullable = true)
 |-- Terjemahan: string (nullable = true)
 |-- Golongan_Kata: string (nullable = true)



In [10]:
output_csv_path = "lexicon.csv"

# DataFrames are immutable: coalesce() returns a new DataFrame instead of
# changing the one it is called on, so it has to be chained into the write.
# Calling it as a bare statement (as before) has no effect on the output.
# coalesce(1) collapses the data to one partition so Spark writes a single
# part file inside the output directory.
(
    part_of_speech_df
    .coalesce(1)
    .write
    .mode("overwrite")          # replace any previous export
    .option("header", "true")
    .option("quote", "\"")
    .option("escape", "\"")     # embedded quotes are escaped by doubling them
    .option("delimiter", ",")
    .option("nullValue", "")    # nulls are written as empty fields
    .csv(output_csv_path)
)

print(f"Lexicon saved to directory: {output_csv_path}")


[Stage 11:>                                                         (0 + 1) / 1]

Lexicon saved to directory: lexicon.csv


                                                                                

# Viewing Lexicon

In [11]:
# Read the exported lexicon back to verify it. The quote/escape/sep settings
# mirror the ones used when writing, and multiLine=True lets quoted fields
# (e.g. "Maksud" values containing line breaks) span several physical lines.
lexicon_df = spark.read.csv(
    "lexicon.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    sep=",",
    quote='"',
    escape='"',
)

# Show only the default preview (20 rows), like the other cells in this
# notebook. The previous hard-coded show(2054) printed thousands of rows into
# the notebook output and did not match the actual row count anyway.
lexicon_df.show()

+---------------+--------------------+--------------------+--------------------+
|           Kata|              Maksud|          Terjemahan|       Golongan_Kata|
+---------------+--------------------+--------------------+--------------------+
|              a|(sistem SI) atto-...|                   a|        Kata Nama Am|
|             aa|tidak.\nAa oku el...|                 aaa|      Kata Singkatan|
|            aan|            berhenti|                 aan|        Kata Nama Am|
|           abad|Tempoh masa selam...|             century|        Kata Nama Am|
|          acara|susunan perkara u...|               event|        Kata Nama Am|
|         action|perbuatan, tindak...|              action|        Kata Nama Am|
|     activities|jamak kepada acti...|          activities|        Kata Nama Am|
|          acuan|bekas yang mempun...|                cast|        Kata Nama Am|
|             ad|di.\nAd saaw da r...|                  ad|        Kata Nama Am|
|            ada|boleh didap

In [12]:
# Count the rows read back from the exported lexicon and report the total.
count = lexicon_df.count()
print(f"Total words in lexicon: {count}")

Total words in lexicon: 2077


In [13]:
# Shut down the Spark session now that the lexicon has been written and checked.
spark.stop()