In [None]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

In [None]:
import os
import findspark
# Point the process environment at the installed JDK and the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.4.1-bin-hadoop3",
})

# Locate Spark through SPARK_HOME and make the pyspark package importable.
findspark.init()

In [None]:
# Mount Google Drive at /content/gdrive so that files stored in Drive are
# reachable through the local filesystem (this only works inside Google Colab).
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DateType, StringType, IntegerType
from pyspark.sql.functions import spark_partition_id

# Spark configuration: 'local' master and application name 'lab22'.
config = SparkConf()
config.setMaster('local')
config.setAppName('lab22')

# Reuse an already running session if there is one, otherwise start one with this config.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)
sc = spark.sparkContext

# Location of the flight CSV on the mounted Google Drive.
DATASET_PATH = '/content/gdrive/MyDrive/flight-time.csv'

Đọc dữ liệu bằng cách sử dụng StructType Schema

In [None]:
# Schema declared programmatically: one StructField (name, type) per column.
flight_schema_struct = StructType([
    StructField('FL_DATE', DateType()),
    StructField('OP_CARRIER', StringType()),
    StructField('OP_CARRIER_FL_NUM', IntegerType()),
    StructField('ORIGIN', StringType()),
    StructField('ORIGIN_CITY_NAME', StringType()),
    StructField('DEST', StringType()),
    StructField('DEST_CITY_NAME', StringType()),
    StructField('CRS_DEP_TIME', IntegerType()),
    StructField('DEP_TIME', IntegerType()),
    StructField('WHEELS_ON', IntegerType()),
    StructField('TAXI_IN', IntegerType()),
    StructField('CRS_ARR_TIME', IntegerType()),
    StructField('ARR_TIME', IntegerType()),
    StructField('CANCELLED', IntegerType()),
    StructField('DISTANCE', IntegerType()),
])
print('Schema by StructType')
# Read the CSV with the explicit schema instead of inferring one.
# - header=true: the first line holds column names, not data.
# - mode=FAILFAST: raise on a malformed record instead of silently loading it.
# - dateFormat: pattern used to parse the FL_DATE column.
flight_time_csv_DF = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('mode', 'FAILFAST')
    .option('dateFormat', 'M/d/y')
    .schema(flight_schema_struct)
    .load(DATASET_PATH)
)
flight_time_csv_DF.show(5)

Schema by StrucType
+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|ORIGIN_CITY_NAME|DEST|DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|CANCELLED|DISTANCE|
+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|2000-01-01|        DL|             1451|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1115|    1113|     1343|      5|        1400|    1348|        0|     946|
|2000-01-01|        DL|             1479|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1315|    1311|     1536|      7|        1559|    1543|        0|     946|
|2000-01-01|        DL|             1857|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1415|    1414|     1642|      9|        1721|    165

Đọc dữ liệu bằng cách sử dụng String Schema

In [None]:
# Same schema as a DDL string ("<column> <TYPE>, ..."), which DataFrameReader.schema
# also accepts. The pieces below concatenate into a single-line string.
flight_schema_string = (
    'FL_DATE DATE, OP_CARRIER STRING, OP_CARRIER_FL_NUM INT, '
    'ORIGIN STRING, ORIGIN_CITY_NAME STRING, '
    'DEST STRING, DEST_CITY_NAME STRING, '
    'CRS_DEP_TIME INT, DEP_TIME INT, WHEELS_ON INT, TAXI_IN INT, '
    'CRS_ARR_TIME INT, ARR_TIME INT, CANCELLED INT, DISTANCE INT'
)
# Label this run as the string-schema variant (it previously repeated the
# StructType label from the cell it was copied from).
print('Schema by String')
# Read the CSV with the DDL schema; FAILFAST raises on a malformed record and
# dateFormat is the pattern used to parse the FL_DATE column.
flight_time_csv_DF = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('mode', 'FAILFAST')
    .option('dateFormat', 'M/d/y')
    .schema(flight_schema_string)
    .load(DATASET_PATH)
)
flight_time_csv_DF.show(5)

Schema by StrucType
+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|ORIGIN_CITY_NAME|DEST|DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|CANCELLED|DISTANCE|
+----------+----------+-----------------+------+----------------+----+--------------+------------+--------+---------+-------+------------+--------+---------+--------+
|2000-01-01|        DL|             1451|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1115|    1113|     1343|      5|        1400|    1348|        0|     946|
|2000-01-01|        DL|             1479|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1315|    1311|     1536|      7|        1559|    1543|        0|     946|
|2000-01-01|        DL|             1857|   BOS|      Boston, MA| ATL|   Atlanta, GA|        1415|    1414|     1642|      9|        1721|    165

Phân vùng lại dữ liệu và lưu dữ liệu đó ra file. Đầu tiên, hãy phân vùng lại dữ liệu thành 5 phân vùng, sau đó ghi kết quả ra file JSON.

In [None]:
# Row count per partition of the DataFrame as loaded.
flight_time_csv_DF.groupBy(spark_partition_id()).count().show()
print(f"Num Partitions before: {flight_time_csv_DF.rdd.getNumPartitions()}")

# Redistribute the rows over 5 partitions, then show the new per-partition counts.
partitionedDF = flight_time_csv_DF.repartition(5)
print(f"Num Partitions after: {partitionedDF.rdd.getNumPartitions()}")
partitionedDF.groupBy(spark_partition_id()).count().show()

# Write the repartitioned data as JSON, replacing anything already at the target path.
(
    partitionedDF.write
    .format('json')
    .mode('overwrite')
    .option('path', '/content/gdrive/MyDrive/flights')
    .save()
)

+--------------------+------+
|SPARK_PARTITION_ID()| count|
+--------------------+------+
|                   0|470477|
+--------------------+------+

Num Partitions before: 1
Num Partitions after: 5
+--------------------+-----+
|SPARK_PARTITION_ID()|count|
+--------------------+-----+
|                   0|94096|
|                   1|94095|
|                   2|94095|
|                   3|94095|
|                   4|94096|
+--------------------+-----+



Phân vùng lại theo các trường. Hãy hoàn thiện đoạn code để khi ghi ra file, dữ liệu được phân vùng theo hai trường `"OP_CARRIER"` và `"ORIGIN"`.

In [None]:
# Disabled code: writes partitionedDF as JSON, partitioned on disk by OP_CARRIER and ORIGIN.
# NOTE(review): it targets '/content/gdrive/MyDrive/flights' with mode('overwrite'), the same
# path the repartitioned write uses, so enabling it would replace that output. Confirm first.
# partitionedDF.write.format('json').mode('overwrite').option('path', '/content/gdrive/MyDrive/flights').partitionBy('OP_CARRIER', 'ORIGIN').save()