# 02: Basic Transformations in PySpark

This notebook covers the following:
|#|Basics|
|--|---|
|1|Filtering Columns|
|2|Showing Columns|
|3|Showing head of dataframe|
|4|Filtering Columns|
|5|Dropping Columns|


### Imports

In [11]:
# Imports
import pyspark
import numpy as np
import pandas as pd
import os
from pyspark.sql import SparkSession

# Creating the spark session
spark = SparkSession.builder.appName("Practice").getOrCreate()


### Loading Dataframe and Creating Session

In [12]:
# This needs a new import. The datatypes you'll use are places after 'import'.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Example schema definition (you need to adjust this to your actual CSV columns)
schema = StructType([
    StructField("index", IntegerType(), True),
    StructField("airline", StringType(), True),
    StructField("flight", StringType(), True),
    StructField("source_city", StringType(), True),
    StructField("departure_time", StringType(), True),
    StructField("stops", StringType(), True),
    StructField("arrival_time", StringType(), True),
    StructField("destination_city", StringType(), True),
    StructField("class", StringType(), True),
    StructField("duration", DoubleType(), True),
    StructField("days_left", IntegerType(), True),  
    StructField("price", IntegerType(), True),   
])

# Load with predefined schema
df = spark.read.option("header", "true").schema(schema).csv("./datasets/airlines_flights_data.csv")

# Printing Schema
df.printSchema()

# If any start showing as null, it's probably because you skipped a column.

root
 |-- index: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)



---
## Basic Commands

#### Printing all Column Names

In [13]:
# Printing all column names
df.columns

['index',
 'airline',
 'flight',
 'source_city',
 'departure_time',
 'stops',
 'arrival_time',
 'destination_city',
 'class',
 'duration',
 'days_left',
 'price']

#### Head 

In [14]:
# Printing first 3 rows
df.head(5)

[Row(index=0, airline='SpiceJet', flight='SG-8709', source_city='Delhi', departure_time='Evening', stops='zero', arrival_time='Night', destination_city='Mumbai', class='Economy', duration=2.17, days_left=1, price=5953),
 Row(index=1, airline='SpiceJet', flight='SG-8157', source_city='Delhi', departure_time='Early_Morning', stops='zero', arrival_time='Morning', destination_city='Mumbai', class='Economy', duration=2.33, days_left=1, price=5953),
 Row(index=2, airline='AirAsia', flight='I5-764', source_city='Delhi', departure_time='Early_Morning', stops='zero', arrival_time='Early_Morning', destination_city='Mumbai', class='Economy', duration=2.17, days_left=1, price=5956),
 Row(index=3, airline='Vistara', flight='UK-995', source_city='Delhi', departure_time='Morning', stops='zero', arrival_time='Afternoon', destination_city='Mumbai', class='Economy', duration=2.25, days_left=1, price=5955),
 Row(index=4, airline='Vistara', flight='UK-963', source_city='Delhi', departure_time='Morning', s

#### Filtering columns

In [None]:
# Filtering a column
df.select('airline').show(5)

+--------+
| airline|
+--------+
|SpiceJet|
|SpiceJet|
| AirAsia|
| Vistara|
| Vistara|
+--------+
only showing top 5 rows


In [None]:
# Filtering columns -- Another way:

# Filtering multiple columns
df.select(['airline', 'flight']).show(5)

+--------+-------+
| airline| flight|
+--------+-------+
|SpiceJet|SG-8709|
|SpiceJet|SG-8157|
| AirAsia| I5-764|
| Vistara| UK-995|
| Vistara| UK-963|
+--------+-------+
only showing top 5 rows


#### Renaming Columns

In [17]:
# Renaming Columns
df_renamed = df.withColumnRenamed("FlightNum", "NewName").show(5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

#### Adding Columns

In [18]:
# Dependency for using literals
from pyspark.sql.functions import lit

# Adding columns to dataframe
df.withColumn('NewName', lit("Sup")).show(5)

# ''lit'' will create the same value for each row.

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+-------+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|NewName|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+-------+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|    Sup|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|    Sup|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|    Sup|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|    Sup|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Ec

In [None]:
# Adding columns -- Another way:

# You can do simple arithmetic when adding columns
df.withColumn('Duration Plus 2 hours', df['duration'] + 2 ).show(5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+---------------------+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|Duration Plus 2 hours|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+---------------------+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|                 4.17|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|                 4.33|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|                 4.17|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|               

#### Dropping Columns

In [20]:
# Dropping a column
df.drop('stops').show(5)

+-----+--------+-------+-----------+--------------+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+-----------+--------------+-------------+-------