In [1]:
import findspark
findspark.init('/home/ductien/spark-3.3.2-bin-hadoop3')
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, udf
import json
# use explode() function to handle array datatype
from pyspark.sql.functions import explode
from pyspark.sql.types import StringType

# Find Count of Null, None, NaN of All DataFrame Columns
from pyspark.sql.functions import isnan, isnull, when, count
from pyspark.sql.functions import split

<h1>Create Spark Session

In [2]:
spark = SparkSession.builder.appName('uniCleaner').getOrCreate()

23/04/27 04:44:06 WARN Utils: Your hostname, DT-Kubuntu resolves to a loopback address: 127.0.1.1; using 192.168.2.94 instead (on interface wlp3s0)
23/04/27 04:44:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/27 04:44:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/27 04:44:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


<h2>Read data

In [3]:
df = spark.read.json('./Datasets/university_diemchuan.json')
df.printSchema()



root
 |-- diemchuan_datas: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- major_code: string (nullable = true)
 |    |    |-- major_name: string (nullable = true)
 |    |    |-- note: string (nullable = true)
 |    |    |-- point: string (nullable = true)
 |    |    |-- subject_group: string (nullable = true)
 |    |    |-- year: long (nullable = true)
 |-- university_meta: struct (nullable = true)
 |    |-- university_code: string (nullable = true)
 |    |-- university_name: string (nullable = true)
 |    |-- url: string (nullable = true)



                                                                                

<h1> Reformating

In [4]:
formatedDF = df.withColumn('uni_code', col('university_meta.university_code'))\
        .withColumn('uni_name', col('university_meta.university_name'))\
        .drop('university_meta')\
        .withColumn('mark_ex', explode('diemchuan_datas'))\
        .withColumn('year', col('mark_ex.year'))\
        .withColumn('major_code', col('mark_ex.major_code'))\
        .withColumn('major_name', col('mark_ex.major_name'))\
        .withColumn('note', col('mark_ex.note'))\
        .withColumn('benchmark', col('mark_ex.point'))\
        .withColumn('subject_group', col('mark_ex.subject_group'))\
        .drop('diemchuan_datas', 'mark_ex')

formatedDF.printSchema()

root
 |-- uni_code: string (nullable = true)
 |-- uni_name: string (nullable = true)
 |-- year: long (nullable = true)
 |-- major_code: string (nullable = true)
 |-- major_name: string (nullable = true)
 |-- note: string (nullable = true)
 |-- benchmark: string (nullable = true)
 |-- subject_group: string (nullable = true)



In [5]:
formatedDF.filter(formatedDF.major_code == 'D01').count()

0

In [6]:
formatedDF.limit(10).show()

+--------+--------------------+----+----------+--------------------+--------------------+---------+-------------+
|uni_code|            uni_name|year|major_code|          major_name|                note|benchmark|subject_group|
+--------+--------------------+----+----------+--------------------+--------------------+---------+-------------+
|     DCH|Trường Sĩ Quan Đặ...|2017|   7860207|Chỉ huy tham mưu ...|                    |    23.25|          A00|
|     DCH|Trường Sĩ Quan Đặ...|2017|   7860207|Chỉ huy tham mưu ...|                    |    23.25|          A01|
|     DCH|Trường Sĩ Quan Đặ...|2017|   7860207|Chỉ huy tham mưu ...|Thí sinh mức 21,7...|    21.75|          A00|
|     DCH|Trường Sĩ Quan Đặ...|2017|   7860207|Chỉ huy tham mưu ...|Thí sinh mức 21,7...|    21.75|          A01|
|     DCH|Trường Sĩ Quan Đặ...|2018|   7860207|Thí sinh Nam miền...|                    |    21.05|          A00|
|     DCH|Trường Sĩ Quan Đặ...|2018|   7860207|Thí sinh Nam miền...|                    

<h2> Remove vietnamease accent

In [7]:
import unidecode

def remove_accent(text):
    return unidecode.unidecode(text).lower()

In [8]:
remove_accent_udf = udf(remove_accent, StringType())

In [9]:
formatedDF = formatedDF.withColumn('uni_name', remove_accent_udf(formatedDF['uni_name']))\
                       .withColumn('major_name', remove_accent_udf(formatedDF['major_name']))\
                       .withColumn('note', remove_accent_udf(formatedDF['note']))

In [10]:
formatedDF.show()

+--------+--------------------+----+----------+--------------------+--------------------+---------+---------------+
|uni_code|            uni_name|year|major_code|          major_name|                note|benchmark|  subject_group|
+--------+--------------------+----+----------+--------------------+--------------------+---------+---------------+
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|                    |    23.25|            A00|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|                    |    23.25|            A01|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|thi sinh muc 21,7...|    21.75|            A00|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|thi sinh muc 21,7...|    21.75|            A01|
|     DCH|truong si quan da...|2018|   7860207|thi sinh nam mien...|                    |    21.05|            A00|
|     DCH|truong si quan da...|2018|   7860207|thi sinh nam mien...|    

                                                                                

In [11]:
formatedDF.printSchema()

root
 |-- uni_code: string (nullable = true)
 |-- uni_name: string (nullable = true)
 |-- year: long (nullable = true)
 |-- major_code: string (nullable = true)
 |-- major_name: string (nullable = true)
 |-- note: string (nullable = true)
 |-- benchmark: string (nullable = true)
 |-- subject_group: string (nullable = true)



<h2> Refactor datatype and split subject_group

In [12]:
tmpDF = formatedDF.withColumn('list_subject_group', split('subject_group', ';'))

In [20]:
tmpDF = tmpDF.select("*", explode('list_subject_group').alias('single_subject_group'))

In [24]:
tmpDF = tmpDF.select("single_subject_group")

In [29]:
tmpDF.show()

+--------------------+
|single_subject_group|
+--------------------+
|                 A00|
|                 A01|
|                 A00|
|                 A01|
|                 A00|
|                 A01|
|                 A00|
|                 A01|
|                 A00|
|                 A01|
|                 A00|
|                 A01|
|                 A00|
|                 A01|
|                 A00|
|                 A01|
|                 A00|
|                 A01|
|                 A00|
|                 A01|
+--------------------+
only showing top 20 rows



In [30]:
subject_list = ['A00','A01','A02','A03','A04','A05','A06','A07','A08','A09','A10','A11',
                'B00','B01','B02','B03','B04','B08',
                'C00','C01','C02','C03','C04','C05','C06','C07','C08','C09',
                'D01','D07','D08','D09','D10','D11','D12','D13']

In [31]:
formatedDF = formatedDF.withColumn('benchmark', col('benchmark').cast('double'))\
                       .withColumn('year', col('year').cast('integer'))\
                       .withColumn('list_subject_group', split('subject_group', ';'))

In [32]:
formatedDF = formatedDF.select("*", explode('list_subject_group').alias('single_subject_group'))\
          .drop('list_subject_group')

In [33]:
formatedDF.printSchema()

root
 |-- uni_code: string (nullable = true)
 |-- uni_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- major_code: string (nullable = true)
 |-- major_name: string (nullable = true)
 |-- note: string (nullable = true)
 |-- benchmark: double (nullable = true)
 |-- subject_group: string (nullable = true)
 |-- single_subject_group: string (nullable = false)



In [39]:
formatedDF = formatedDF.filter(col('single_subject_group').isin(subject_list))

In [41]:
formatedDF.count()

38935

<h1> Check missing data and clean it

In [42]:
countNullDF = formatedDF.select([count(when(isnan(c) | isnull(c), c)).alias(c) for c in formatedDF.columns])

In [43]:
countNullDF.show()



+--------+--------+----+----------+----------+----+---------+-------------+--------------------+
|uni_code|uni_name|year|major_code|major_name|note|benchmark|subject_group|single_subject_group|
+--------+--------+----+----------+----------+----+---------+-------------+--------------------+
|       0|       0|   0|         0|         0|   0|     1642|            0|                   0|
+--------+--------+----+----------+----------+----+---------+-------------+--------------------+



                                                                                

In [44]:
(3546 / formatedDF.count()) * 100

9.107486837036086

In [45]:
finalDF = formatedDF.dropna(subset= formatedDF.columns)

In [46]:
finalDF.select([count(when(isnan(c) | isnull(c), c)).alias(c) for c in finalDF.columns]).show()



+--------+--------+----+----------+----------+----+---------+-------------+--------------------+
|uni_code|uni_name|year|major_code|major_name|note|benchmark|subject_group|single_subject_group|
+--------+--------+----+----------+----------+----+---------+-------------+--------------------+
|       0|       0|   0|         0|         0|   0|        0|            0|                   0|
+--------+--------+----+----------+----------+----+---------+-------------+--------------------+



                                                                                

In [47]:
finalDF.show()

+--------+--------------------+----+----------+--------------------+--------------------+---------+---------------+--------------------+
|uni_code|            uni_name|year|major_code|          major_name|                note|benchmark|  subject_group|single_subject_group|
+--------+--------------------+----+----------+--------------------+--------------------+---------+---------------+--------------------+
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|                    |    23.25|            A00|                 A00|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|                    |    23.25|            A01|                 A01|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|thi sinh muc 21,7...|    21.75|            A00|                 A00|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|thi sinh muc 21,7...|    21.75|            A01|                 A01|
|     DCH|truong si quan da...|2018|   78

<h2> Clean rows "Xet hoc ba"

In [48]:
finalDF = finalDF.filter(~finalDF.note.contains('xet hoc ba') &\
                       ~finalDF.note.contains('hoc ba') & \
                       ~finalDF.note.contains('hocba') &
                       ~finalDF.note.contains('xet tuyen hoc ba'))

In [49]:
finalDF.filter(finalDF.note.contains('hoc ba')).count()

0

In [50]:
finalDF.printSchema()

root
 |-- uni_code: string (nullable = true)
 |-- uni_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- major_code: string (nullable = true)
 |-- major_name: string (nullable = true)
 |-- note: string (nullable = true)
 |-- benchmark: double (nullable = true)
 |-- subject_group: string (nullable = true)
 |-- single_subject_group: string (nullable = false)



In [51]:
finalDF = finalDF.drop(finalDF.note)

In [52]:
finalDF.printSchema()

root
 |-- uni_code: string (nullable = true)
 |-- uni_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- major_code: string (nullable = true)
 |-- major_name: string (nullable = true)
 |-- benchmark: double (nullable = true)
 |-- subject_group: string (nullable = true)
 |-- single_subject_group: string (nullable = false)



In [53]:
finalDF.createOrReplaceTempView("TAB")
spark.sql("SELECT DISTINCT uni_code FROM TAB").count()

                                                                                

248

In [54]:
finalDF.show()

+--------+--------------------+----+----------+--------------------+---------+---------------+--------------------+
|uni_code|            uni_name|year|major_code|          major_name|benchmark|  subject_group|single_subject_group|
+--------+--------------------+----+----------+--------------------+---------+---------------+--------------------+
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|    23.25|            A00|                 A00|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|    23.25|            A01|                 A01|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|    21.75|            A00|                 A00|
|     DCH|truong si quan da...|2017|   7860207|chi huy tham muu ...|    21.75|            A01|                 A01|
|     DCH|truong si quan da...|2018|   7860207|thi sinh nam mien...|    21.05|            A00|                 A00|
|     DCH|truong si quan da...|2018|   7860207|thi sinh nam mien...|    

In [55]:
finalDF = finalDF.drop(col('subject_group'))

In [63]:
finalDF = finalDF.withColumnRenamed('single_subject_group', 'subject_group')

In [64]:
finalDF.printSchema()

root
 |-- uni_code: string (nullable = true)
 |-- uni_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- major_code: string (nullable = true)
 |-- major_name: string (nullable = true)
 |-- benchmark: double (nullable = true)
 |-- subject_group: string (nullable = false)



In [66]:
finalDF.count()

                                                                                

35126

In [69]:
finalDF.filter(col('major_code').isin(subject_list)).show()

+--------+--------+----+----------+----------+---------+-------------+
|uni_code|uni_name|year|major_code|major_name|benchmark|subject_group|
+--------+--------+----+----------+----------+---------+-------------+
+--------+--------+----+----------+----------+---------+-------------+



In [75]:
finalDF.filter(col('major_code') == '7349001') \
        .filter(col('uni_code') == 'DHK') \
        .filter(col('year') == 2022).show(5,False)

+--------+-----------------------------+----+----------+-------------------------------------------------------------------------------------------------+---------+-------------+
|uni_code|uni_name                     |year|major_code|major_name                                                                                       |benchmark|subject_group|
+--------+-----------------------------+----+----------+-------------------------------------------------------------------------------------------------+---------+-------------+
|DHK     |dai hoc kinh te - dai hoc hue|2022|7349001   |tai chinh - ngan hang\n(lien ket dao tao dong cap bang voi truong dai hoc rennes i, cong hoa phap|16.0     |A00          |
+--------+-----------------------------+----+----------+-------------------------------------------------------------------------------------------------+---------+-------------+



<h2> Remove \n\r

In [76]:
from pyspark.sql.functions import regexp_replace

In [77]:
finalDF.printSchema()

root
 |-- uni_code: string (nullable = true)
 |-- uni_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- major_code: string (nullable = true)
 |-- major_name: string (nullable = true)
 |-- benchmark: double (nullable = true)
 |-- subject_group: string (nullable = false)



In [78]:
finalDF = finalDF.withColumn("uni_code", regexp_replace(col("uni_code"), "\n|\r", "")) \
                 .withColumn("uni_name", regexp_replace(col("uni_name"), "\n|\r", "")) \
                 .withColumn("major_code", regexp_replace(col("major_code"), "\n|\r", "")) \
                 .withColumn("major_name", regexp_replace(col("major_name"), "\n|\r", ""))
                 

In [79]:
finalDF.count()

                                                                                

35126

<h1> Export to "CSV" file

In [80]:
# output = finalDF.coalesce(1)
finalDF.write.option('delimiter', ',').csv('./CleanedDatasets/cleaned_uni_mark', header = True)

                                                                                