Read the Data File and Create the Data Schema

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Start (or reuse) the Spark session that the rest of the notebook reads data through.
spark = (
    SparkSession.builder
    .appName("Data Processing")
    .getOrCreate()
)

In [4]:
# First load of the flight CSV: the header row names the columns and
# Spark is asked to infer each column's type from the data.
df = spark.read.csv(
    "C:/Users/User/Desktop/SparkFolder/Data/raw-flight-data.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Print a tabular preview of the flight rows.
df.show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
|        19|        5|     DL|          10397|        15016|      -1|     -19|
|        19|        5|     DL|          15016|        10397|       0|      -1|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|      

In [6]:
# Print the column names, types and nullability of the flight DataFrame.
df.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)



Redefine Schema for Flights

In [15]:
from pyspark.sql.types import *
# Explicit schema for the flight CSV: six integer columns plus the carrier code.
# Every field is declared with nullable=False.
# NOTE(review): the airports schema, built the same way, still prints
# `nullable = true` after the CSV read -- presumably the reader relaxes
# nullability; confirm before relying on these flags.
flightschema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in [
        ("DayofMonth", IntegerType()),
        ("DayOfWeek", IntegerType()),
        ("Carrier", StringType()),
        ("OriginAirportID", IntegerType()),
        ("DestAirportID", IntegerType()),
        ("DepDelay", IntegerType()),
        ("ArrDelay", IntegerType()),
    ]
])

In [16]:
# Reload the flight CSV, this time applying the hand-written flightschema
# instead of letting Spark infer the column types.
df = spark.read.csv(
    "C:/Users/User/Desktop/SparkFolder/Data/raw-flight-data.csv",
    header=True,
    schema=flightschema,
)

In [17]:
# Preview the flight rows as loaded with the explicit schema.
df.show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
|        19|        5|     DL|          10397|        15016|      -1|     -19|
|        19|        5|     DL|          15016|        10397|       0|      -1|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|      

Load the Airport Data

In [18]:
# First load of the airport lookup CSV: header row gives the column names,
# column types are inferred from the data.
df1 = spark.read.csv(
    "C:/Users/User/Desktop/SparkFolder/Data/airports.csv",
    header=True,
    inferSchema=True,
)

In [19]:
# Preview the first five airport rows.
df1.show(5)

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
|     10304|      Aniak|   AK|       Aniak Airport|
|     10754|     Barrow|   AK|Wiley Post/Will R...|
|     10551|     Bethel|   AK|      Bethel Airport|
+----------+-----------+-----+--------------------+
only showing top 5 rows



In [21]:
# Print the column names, types and nullability of the airport DataFrame.
df1.printSchema()

root
 |-- airport_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- name: string (nullable = true)



Redefine Schema for Airports

In [22]:
# Explicit schema for the airport CSV: an integer id followed by three
# string columns. Every field is declared with nullable=False; note that the
# schema printed after the CSV read still reports `nullable = true`.
airportschema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in [
        ("airport_id", IntegerType()),
        ("city", StringType()),
        ("state", StringType()),
        ("name", StringType()),
    ]
])

In [23]:
# Reload the airport CSV, applying the hand-written airportschema
# instead of inferring the column types.
df1 = spark.read.csv(
    "C:/Users/User/Desktop/SparkFolder/Data/airports.csv",
    header=True,
    schema=airportschema,
)

In [25]:
# Check the schema Spark actually applied to the reloaded airport data.
df1.printSchema()

root
 |-- airport_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- name: string (nullable = true)



In [26]:
# Preview the first five airport rows after the reload.
df1.show(5)

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
|     10304|      Aniak|   AK|       Aniak Airport|
|     10754|     Barrow|   AK|Wiley Post/Will R...|
|     10551|     Bethel|   AK|      Bethel Airport|
+----------+-----------+-----+--------------------+
only showing top 5 rows



Merge the two datasets to show the number of flights from each city

In [33]:
# Match each flight to the airport it departs from, then count flights per city.
# NOTE(review): grouping on "city" alone merges same-named cities that sit in
# different states -- confirm that is intended.
flightbyOrigin = (
    df1
    .join(df, on=df["OriginAirportID"] == df1["airport_id"])
    .groupBy("city")
    .count()
)

In [35]:
# Display the per-city flight counts (the output is capped at the top 20 rows).
flightbyOrigin.show()

+-----------------+------+
|             city| count|
+-----------------+------+
|          Phoenix| 90281|
|            Omaha| 13537|
|   Raleigh/Durham| 28436|
|        Anchorage|  7777|
|           Dallas| 19503|
|          Oakland| 25503|
|      San Antonio| 23090|
|     Philadelphia| 47659|
|       Louisville| 10953|
|Dallas/Fort Worth|105024|
|      Los Angeles|118684|
|       Sacramento| 25193|
|     Indianapolis| 18099|
|        Cleveland| 25261|
|        San Diego| 45783|
|    San Francisco| 84675|
|        Nashville| 34927|
|    Oklahoma City| 13967|
|          Detroit| 62879|
|         Portland| 30640|
+-----------------+------+
only showing top 20 rows



Handling Duplicate Data
* Drop duplicate rows and count how many duplicates were removed

In [39]:
# Count the flight rows before and after de-duplication; the difference is
# the number of redundant rows that dropDuplicates() removes.

# Total number of rows, duplicates included.
n1 = df.count()
print("The number of rows is :", n1)

# Rows remaining once duplicates are dropped. Called without a column
# subset, so rows are compared on all columns.
# (n2 is what is LEFT, not what was removed -- the label says so.)
n2 = df.dropDuplicates().count()
print("The number of rows after dropping duplicates :", n2)

# Rows that were removed as duplicates.
n3 = n1 - n2
print("The number of duplicated data :", n3)

The number of rows is : 2719418
The number of duplicates dropped : 2696983
The number of duplicated data : 22435


Create DataFrame from Tuples

In [52]:
from pyspark.sql.functions import *
# Small hand-made frame for the de-duplication demo: the first and third
# rows are identical, and all three rows share the same Name.
df2 = spark.createDataFrame(
    data=[
        ("Sol", 54, 58),
        ("Sol", 57, 55),
        ("Sol", 54, 58),
    ],
    schema=["Name", "age", "height"],
)

In [53]:
# df2=df2.withColumn("ID",monotonically_increasing_id())

In [54]:
# Show the demo rows as created, then the same frame with duplicate rows removed.
df2.show()
df2.dropDuplicates().show()

+----+---+------+
|Name|age|height|
+----+---+------+
| Sol| 54|    58|
| Sol| 57|    55|
| Sol| 54|    58|
+----+---+------+

+----+---+------+
|Name|age|height|
+----+---+------+
| Sol| 54|    58|
| Sol| 57|    55|
+----+---+------+



In [59]:
# De-duplicate on the Name column only: a single row is kept per distinct Name.
# Which of the matching rows survives is not controlled by this cell.
df2.dropDuplicates(["Name"]).show()

+----+---+------+
|Name|age|height|
+----+---+------+
| Sol| 54|    58|
+----+---+------+



Handling Missing Data
* Delete a row if (and only if) at least one of its columns has missing data