# Lectura y Escritura de Archivos con PySpark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, DateType, IntegerType, StringType, DoubleType, BooleanType

from hdfs import InsecureClient

In [3]:
# Create an HdfsCLI client (no authentication) for the given endpoint,
# acting as user "jmanuelc87".
# NOTE(review): the URL carries no explicit port -- confirm that the WebHDFS
# endpoint is really reachable on the default HTTP port.
# NOTE(review): `client` is not referenced in the cells visible here -- confirm
# it is used further down, otherwise this cell can be dropped.
client = InsecureClient("http://172.16.200.3", user="jmanuelc87")

In [4]:
# Directory that holds the yearly flight CSV files (local copy).
# HDFS alternative: "hdfs://172.16.200.3/user/jmanuelc87/data/flights"
filepath = "./data/flights"

# One CSV per year in the half-open range [2018, 2019), i.e. only 2018 for now.
filenames = [
    f"Combined_Flights_{year}.csv"
    for year in range(2018, 2019)
]

In [5]:
# Build the "flights" Spark session, or reuse one that is already active,
# requesting 5g of executor memory and 2g of driver memory.
spark = (
    SparkSession.builder
    .appName("flights")
    .config("spark.executor.memory", "5g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

25/04/29 19:14:15 WARN Utils: Your hostname, onyx-pro-m4.local resolves to a loopback address: 127.0.0.1; using 192.168.0.66 instead (on interface en0)
25/04/29 19:14:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/29 19:14:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/29 19:14:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# Explicit schema for the flights CSV. Columns are listed as (name, type) pairs
# in the order the schema declares them; every field is nullable.
schema = StructType(
    [
        StructField(column_name, column_type(), True)
        for column_name, column_type in [
            ("FlightDate", DateType),
            ("Airline", StringType),
            ("Origin", StringType),
            ("Dest", StringType),
            ("Cancelled", BooleanType),
            ("Diverted", BooleanType),
            ("CRSDepTime", IntegerType),
            ("DepTime", DoubleType),
            ("DepDelayMinutes", DoubleType),
            ("DepDelay", DoubleType),
            ("ArrTime", DoubleType),
            ("ArrDelayMinutes", DoubleType),
            ("AirTime", DoubleType),
            ("CRSElapsedTime", DoubleType),
            ("ActualElapsedTime", DoubleType),
            ("Distance", DoubleType),
            ("Year", IntegerType),
            ("Quarter", IntegerType),
            ("Month", IntegerType),
            ("DayofMonth", IntegerType),
            ("DayOfWeek", IntegerType),
            ("Marketing_Airline_Network", StringType),
            ("Operated_or_Branded_Code_Share_Partners", StringType),
            ("DOT_ID_Marketing_Airline", IntegerType),
            ("IATA_Code_Marketing_Airline", StringType),
            ("Flight_Number_Marketing_Airline", IntegerType),
            ("Operating_Airline", StringType),
            ("DOT_ID_Operating_Airline", IntegerType),
            ("IATA_Code_Operating_Airline", StringType),
            ("Tail_Number", StringType),
            ("Flight_Number_Operating_Airline", IntegerType),
            ("OriginAirportID", IntegerType),
            ("OriginAirportSeqID", IntegerType),
            ("OriginCityMarketID", IntegerType),
            ("OriginCityName", StringType),
            ("OriginState", StringType),
            ("OriginStateFips", IntegerType),
            ("OriginStateName", StringType),
            ("OriginWac", IntegerType),
            ("DestAirportID", IntegerType),
            ("DestAirportSeqID", IntegerType),
            ("DestCityMarketID", IntegerType),
            ("DestCityName", StringType),
            ("DestState", StringType),
            ("DestStateFips", IntegerType),
            ("DestStateName", StringType),
            ("DestWac", IntegerType),
            ("DepDel15", DoubleType),
            ("DepartureDelayGroups", DoubleType),
            ("DepTimeBlk", StringType),
            ("TaxiOut", DoubleType),
            ("WheelsOff", DoubleType),
            ("WheelsOn", DoubleType),
            ("TaxiIn", DoubleType),
            ("CRSArrTime", IntegerType),
            ("ArrDelay", DoubleType),
            ("ArrDel15", DoubleType),
            ("ArrivalDelayGroups", DoubleType),
            ("ArrTimeBlk", StringType),
            ("DistanceGroup", IntegerType),
            ("DivAirportLandings", DoubleType),
        ]
    ]
)

In [None]:
# Load the first yearly CSV with the explicit schema; the first line of the
# file is treated as a header row rather than data.
df = spark.read.csv(
    f"{filepath}/{filenames[0]}",
    schema=schema,
    header=True,
)

Show descriptive statistics for the numeric (integer and double) columns.

In [7]:
# Restrict the summary to columns whose declared type is integer or double.
df.describe(
    [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, (DoubleType, IntegerType))
    ]
).show()

25/04/29 17:33:22 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+-------+------------------+------------------+------------------+------------------+------------------------+-------------------------------+------------------------+-------------------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+-------------------+--------------------+-----------------+------------------+-----------------+-----------------+-----------------+------------------+-------------------+-------------------+------------------+--------------------+
|summary|        CRSDepTime|           DepTime|   DepDelayMinutes|          DepDelay|          ArrTime|   ArrDelayMinutes|           AirTime|    CRSElapsedTime|ActualElapsedTime|         D

In [8]:
# Count the rows of the DataFrame and keep the result in `df_size`.
df_size = df.count()
# Print the count with thousands separators.
print(f"Size: {df_size:,}")

Size: 5,689,512


                                                                                

In [9]:
# Null count per column: when() produces a non-null marker only for rows where
# the column is null, and count() tallies just those non-null markers.
df.select(
    *[
        F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in df.columns
    ]
).show()



+----------+-------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+
|FlightDate|Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|

                                                                                