# Exploring Columns

In [1]:
# Importing the necessary libraries.
# findspark.init() must run before the first `import pyspark`: it is the call
# that puts the Spark installation's Python packages on sys.path. Importing
# pyspark first only works when pyspark is already installed in the kernel's
# own environment.
import findspark

findspark.init()

import pyspark

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import to_date, col, column, expr, dayofmonth, month, year
from pyspark.sql.types import StructType, StructField, DateType, StringType

In [2]:
# Let findspark locate the Spark installation and expose its Python packages to this kernel
findspark.init()

In [3]:
# Reuse the active SparkSession if there is one, otherwise start a new one
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Read the whole flight_data_tbl directory as CSV.
# The first line of each file is treated as the header, and column types are
# inferred using the configured samplingRatio.
airlineDF = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True, samplingRatio=0.0001)
    .load('Spark-warehouse/airline_db.db/flight_data_tbl/')
)
airlineDF.show(5)

+-------------------+----------+-----------------+------+----------------+----+--------------------+------------+--------+---------+-------+------------+--------+---------+--------+
|            FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|ORIGIN_CITY_NAME|DEST|      DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|CANCELLED|DISTANCE|
+-------------------+----------+-----------------+------+----------------+----+--------------------+------------+--------+---------+-------+------------+--------+---------+--------+
|2000-01-01 00:00:00|        AA|             1407|   BHM|  Birmingham, AL| DFW|Dallas/Fort Worth...|        1256|    1252|     1447|     13|        1500|    1500|        0|     597|
|2000-01-01 00:00:00|        AA|             1689|   BHM|  Birmingham, AL| DFW|Dallas/Fort Worth...|        1556|    1556|     1745|     12|        1800|    1757|        0|     597|
|2000-01-01 00:00:00|        AA|             1939|   BHM|  Birmingham, AL| DFW|Dallas/Fort

In [5]:
# Read a single part file of the table instead of the whole directory.
# Same reader settings as the full load: header row, inferred schema, samplingRatio.
# Note: this rebinds airlineDF, so every later cell works on this one partition.
airlineDF = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True, samplingRatio=0.0001)
    .load('Spark-warehouse/airline_db.db/flight_data_tbl/part-00000-64b3c6f7-d6b0-4d42-9dfb-90ab8a7423f8_00000.c000.csv')
)

In [6]:
# Check what kind of object was loaded and how many columns it has
type(airlineDF), len(airlineDF.columns)

(pyspark.sql.dataframe.DataFrame, 15)

In [7]:
# Preview the first two rows of the partition
airlineDF.show(2)

+-------------------+----------+-----------------+------+----------------+----+--------------------+------------+--------+---------+-------+------------+--------+---------+--------+
|            FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|ORIGIN_CITY_NAME|DEST|      DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|CANCELLED|DISTANCE|
+-------------------+----------+-----------------+------+----------------+----+--------------------+------------+--------+---------+-------+------------+--------+---------+--------+
|2000-01-01 00:00:00|        AA|              438|   ABQ| Albuquerque, NM| DFW|Dallas/Fort Worth...|         914|     911|     1133|     12|        1157|    1145|        0|     569|
|2000-01-01 00:00:00|        AA|             1166|   ABQ| Albuquerque, NM| DFW|Dallas/Fort Worth...|        1650|    1643|     1915|     13|        1931|    1928|        0|     569|
+-------------------+----------+-----------------+------+----------------+----+-----------

## Accessing the columns

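* Column string: refer to a column by its name, passed as a plain string (e.g. `'Origin'`)
* Column object: refer to a column through a `Column` object, built with `col()`, `column()` or `airlineDF['...']`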

In [8]:
# DataFrame.columns gives the column names as a plain Python list of strings
print(airlineDF.columns)

['FL_DATE', 'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'ORIGIN_CITY_NAME', 'DEST', 'DEST_CITY_NAME', 'CRS_DEP_TIME', 'DEP_TIME', 'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'CANCELLED', 'DISTANCE']


In [9]:
# Show each column's name, inferred data type and nullability
airlineDF.printSchema()

root
 |-- FL_DATE: timestamp (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- ARR_TIME: integer (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)



In [10]:
# Simplest way to pick columns: pass their names as plain strings
(
    airlineDF
    .select('Origin', 'Dest', 'Distance')
    .show(2)
)

+------+----+--------+
|Origin|Dest|Distance|
+------+----+--------+
|   ABQ| DFW|     569|
|   ABQ| DFW|     569|
+------+----+--------+
only showing top 2 rows



In [11]:
# Pick the same columns through Column objects instead of strings.
# Three ways of building the reference are shown side by side:
# column(), col() and indexing the DataFrame by name.
airlineDF.select(
    column('Origin'),
    col('Dest'),
    airlineDF['Distance'],
).show(2)

+------+----+--------+
|Origin|Dest|Distance|
+------+----+--------+
|   ABQ| DFW|     569|
|   ABQ| DFW|     569|
+------+----+--------+
only showing top 2 rows



## Column expressions

* String Expressions or SQL Expressions
* Column Object Expressions

Help: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html

In [12]:
# Derive day / month / year from FL_DATE with SQL expression strings;
# expr() parses each string into a column, and `as` names the result
airlineDF.select(
    'Origin',
    'Dest',
    'Distance',
    'FL_DATE',
    expr('day(FL_DATE) as Day'),
    expr('month(FL_DATE) as Month'),
    expr('year(FL_DATE) as Year'),
).show(10)

+------+----+--------+-------------------+---+-----+----+
|Origin|Dest|Distance|            FL_DATE|Day|Month|Year|
+------+----+--------+-------------------+---+-----+----+
|   ABQ| DFW|     569|2000-01-01 00:00:00|  1|    1|2000|
|   ABQ| DFW|     569|2000-01-01 00:00:00|  1|    1|2000|
|   ABQ| DFW|     569|2000-01-01 00:00:00|  1|    1|2000|
|   ABQ| DFW|     569|2000-01-01 00:00:00|  1|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
+------+----+--------+-------------------+---+-----+----+
only showing top 10 rows



In [13]:
# Same derivation as the SQL-string version, built from Column objects:
# the pyspark.sql.functions helpers take a Column and .alias() names the result
airlineDF.select(
    column('Origin'),
    col('Dest'),
    airlineDF['Distance'],
    airlineDF['FL_DATE'],
    dayofmonth(airlineDF['FL_DATE']).alias('Day'),
    month(airlineDF['FL_DATE']).alias('Month'),
    year(airlineDF['FL_DATE']).alias('Year'),
).show(10)

+------+----+--------+-------------------+---+-----+----+
|Origin|Dest|Distance|            FL_DATE|Day|Month|Year|
+------+----+--------+-------------------+---+-----+----+
|   ABQ| DFW|     569|2000-01-01 00:00:00|  1|    1|2000|
|   ABQ| DFW|     569|2000-01-01 00:00:00|  1|    1|2000|
|   ABQ| DFW|     569|2000-01-01 00:00:00|  1|    1|2000|
|   ABQ| DFW|     569|2000-01-01 00:00:00|  1|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
|   ABQ| DFW|     569|2000-01-02 00:00:00|  2|    1|2000|
+------+----+--------+-------------------+---+-----+----+
only showing top 10 rows



In [14]:
# Stop the Spark session now that the notebook is finished
spark.stop()