In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 46 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 72.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=9edda67c90d1a85bb38e4c5ed280d9748a5267b26f3ce3bae6e654c933a1041c
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
from google.colab import files
uploads = files.upload()

Saving FarePrediction.csv to FarePrediction.csv


In [3]:
from pyspark.sql import SparkSession

In [4]:
# creating a spark session
sc = SparkSession.builder.appName('Fare Prediction').getOrCreate()

In [5]:
# reading the excel file
df = sc.read.csv('/content/FarePrediction.csv', header = True, inferSchema = True)
df.show(20)

+-----------------+---------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+
|          Airline|Date_of_Journey|  Source|Destination|               Route|           Dep_Time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|
+-----------------+---------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+
|           IndiGo|     24/03/2019|Banglore|  New Delhi|           BLR ? DEL|2022-12-30 22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|
|        Air India|      1/05/2019| Kolkata|   Banglore|CCU ? IXR ? BBI ?...|2022-12-30 05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|
|      Jet Airways|      9/06/2019|   Delhi|     Cochin|DEL ? LKO ? BOM ?...|2022-12-30 09:25:00|04:25 10 Jun|     19h|    2 stops|             No info|13882|
|           IndiGo|     12/05/2019| Kolkata|  

In [6]:
#df.printSchema()

### Data Cleaning and Feature Engineering

In [7]:
# converting string date to datetype
from datetime import datetime
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType

str_date = udf(lambda x: datetime.strptime(x, '%d/%m/%Y'), DateType())
df1 = df.withColumn('Journey_date', str_date('Date_of_Journey')).drop(col('Date_of_Journey'))
df1.show()

+-----------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+------------+
|          Airline|  Source|Destination|               Route|           Dep_Time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|Journey_date|
+-----------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+------------+
|           IndiGo|Banglore|  New Delhi|           BLR ? DEL|2022-12-30 22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|  2019-03-24|
|        Air India| Kolkata|   Banglore|CCU ? IXR ? BBI ?...|2022-12-30 05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|  2019-05-01|
|      Jet Airways|   Delhi|     Cochin|DEL ? LKO ? BOM ?...|2022-12-30 09:25:00|04:25 10 Jun|     19h|    2 stops|             No info|13882|  2019-06-09|
|           IndiGo| Kolkata|   Banglore|     CCU ? NAG ? BLR|202

In [8]:
df1.printSchema()

root
 |-- Airline: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- Route: string (nullable = true)
 |-- Dep_Time: timestamp (nullable = true)
 |-- Arrival_Time: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Total_Stops: string (nullable = true)
 |-- Additional_Info: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Journey_date: date (nullable = true)



In [9]:
#Extact day, month, year from Journey_date feature and store them in new columns. 

from pyspark.sql.functions import *
from pyspark.sql.functions import year, month, dayofweek
df1 = df1.withColumn('day', dayofweek(col('Journey_date')))
df1 = df1.withColumn('month', month(col('Journey_date')))
df1 = df1.withColumn('year', year(col('Journey_date'))).drop(col('Journey_date'))

In [10]:
df1.show(20)

+-----------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+---+-----+----+
|          Airline|  Source|Destination|               Route|           Dep_Time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|day|month|year|
+-----------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+---+-----+----+
|           IndiGo|Banglore|  New Delhi|           BLR ? DEL|2022-12-30 22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|  1|    3|2019|
|        Air India| Kolkata|   Banglore|CCU ? IXR ? BBI ?...|2022-12-30 05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|  4|    5|2019|
|      Jet Airways|   Delhi|     Cochin|DEL ? LKO ? BOM ?...|2022-12-30 09:25:00|04:25 10 Jun|     19h|    2 stops|             No info|13882|  1|    6|2019|
|           IndiGo| Kolkata|   Banglore|     CCU ? N

In [11]:
# counting distinct values from year col
from pyspark.sql.functions import countDistinct
df2  =df1.select(countDistinct('year')).show()
# since year is the same for all the rows, we can drop it

+--------------------+
|count(DISTINCT year)|
+--------------------+
|                   1|
+--------------------+



In [12]:
df1 = df1.drop(col('year'))
df1.show()

+-----------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+---+-----+
|          Airline|  Source|Destination|               Route|           Dep_Time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|day|month|
+-----------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+---+-----+
|           IndiGo|Banglore|  New Delhi|           BLR ? DEL|2022-12-30 22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|  1|    3|
|        Air India| Kolkata|   Banglore|CCU ? IXR ? BBI ?...|2022-12-30 05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|  4|    5|
|      Jet Airways|   Delhi|     Cochin|DEL ? LKO ? BOM ?...|2022-12-30 09:25:00|04:25 10 Jun|     19h|    2 stops|             No info|13882|  1|    6|
|           IndiGo| Kolkata|   Banglore|     CCU ? NAG ? BLR|2022-12-30 18:05:00| 

In [13]:
from pyspark.sql.types import IntegerType
# Extracting hours and min from Arrival time, Departure time
df1 = df1.withColumn('Arrival_hour', split(df1['Arrival_Time'],':').getItem(0).cast('int'))\
                     .withColumn('dump', split(df1['Arrival_Time'],'\s\w*').getItem(0))\
                     .withColumn('time', split(df1['Dep_Time'],'\s').getItem(1))

df1 = df1.withColumn('dept_hour', split(df1['time'],':(\d+)').getItem(0))\
                    .withColumn('dept_min', split(df1['time'], '^\d+:').getItem(1))

df1 = df1.withColumn('dept_min', split(df1['dept_min'], ':(\d+)').getItem(0))      

df1 = df1.withColumn('Arrival_min', split(df1['dump'],':').getItem(1).cast('int')).drop('dump','time','Route','Additional_Info')                  



In [14]:
df1.show()

+-----------------+--------+-----------+-------------------+------------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+
|          Airline|  Source|Destination|           Dep_Time|Arrival_Time|Duration|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|
+-----------------+--------+-----------+-------------------+------------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+
|           IndiGo|Banglore|  New Delhi|2022-12-30 22:20:00|01:10 22 Mar|  2h 50m|   non-stop| 3897|  1|    3|           1|       22|      20|         10|
|        Air India| Kolkata|   Banglore|2022-12-30 05:50:00|       13:15|  7h 25m|    2 stops| 7662|  4|    5|          13|       05|      50|         15|
|      Jet Airways|   Delhi|     Cochin|2022-12-30 09:25:00|04:25 10 Jun|     19h|    2 stops|13882|  1|    6|           4|       09|      25|         25|
|           IndiGo| Kolkata|   Banglore|2022-12-30 18:05:00|       23:

In [15]:
#df1.printSchema()

In [16]:
df1 = df1.withColumn('Duration_hr', split(df1['Duration'], 'h').getItem(0).cast('int'))\
            .withColumn('dump', split(df1['Duration'], '\s').getItem(1)).drop(df1['Duration'])
df1 = df1.withColumn('Duration_min', split(df1['dump'],'m').getItem(0).cast('int')).drop(df1['dump'])
df1.show()

+-----------------+--------+-----------+-------------------+------------+-----------+-----+---+-----+------------+---------+--------+-----------+-----------+------------+
|          Airline|  Source|Destination|           Dep_Time|Arrival_Time|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_hr|Duration_min|
+-----------------+--------+-----------+-------------------+------------+-----------+-----+---+-----+------------+---------+--------+-----------+-----------+------------+
|           IndiGo|Banglore|  New Delhi|2022-12-30 22:20:00|01:10 22 Mar|   non-stop| 3897|  1|    3|           1|       22|      20|         10|          2|          50|
|        Air India| Kolkata|   Banglore|2022-12-30 05:50:00|       13:15|    2 stops| 7662|  4|    5|          13|       05|      50|         15|          7|          25|
|      Jet Airways|   Delhi|     Cochin|2022-12-30 09:25:00|04:25 10 Jun|    2 stops|13882|  1|    6|           4|       09|      25|         25|

In [17]:
type(df1)

pyspark.sql.dataframe.DataFrame

In [18]:
df1.filter(df1['Duration_min'].isNull()).count() # there are approx 1000 rows with null value, so here we will replace it to 0 min 

1032

In [19]:
df1.printSchema()

root
 |-- Airline: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- Dep_Time: timestamp (nullable = true)
 |-- Arrival_Time: string (nullable = true)
 |-- Total_Stops: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- Arrival_hour: integer (nullable = true)
 |-- dept_hour: string (nullable = true)
 |-- dept_min: string (nullable = true)
 |-- Arrival_min: integer (nullable = true)
 |-- Duration_hr: integer (nullable = true)
 |-- Duration_min: integer (nullable = true)



In [20]:
df1 = df1.fillna(value = 0, subset= ['Duration_min'])

In [21]:
df1.show()

+-----------------+--------+-----------+-------------------+------------+-----------+-----+---+-----+------------+---------+--------+-----------+-----------+------------+
|          Airline|  Source|Destination|           Dep_Time|Arrival_Time|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_hr|Duration_min|
+-----------------+--------+-----------+-------------------+------------+-----------+-----+---+-----+------------+---------+--------+-----------+-----------+------------+
|           IndiGo|Banglore|  New Delhi|2022-12-30 22:20:00|01:10 22 Mar|   non-stop| 3897|  1|    3|           1|       22|      20|         10|          2|          50|
|        Air India| Kolkata|   Banglore|2022-12-30 05:50:00|       13:15|    2 stops| 7662|  4|    5|          13|       05|      50|         15|          7|          25|
|      Jet Airways|   Delhi|     Cochin|2022-12-30 09:25:00|04:25 10 Jun|    2 stops|13882|  1|    6|           4|       09|      25|         25|

In [22]:
type(df1)

pyspark.sql.dataframe.DataFrame

In [23]:
df1 = df1.withColumn('Duration_(min)',(df1.Duration_hr * 60 + df1.Duration_min))
df1.show()

+-----------------+--------+-----------+-------------------+------------+-----------+-----+---+-----+------------+---------+--------+-----------+-----------+------------+--------------+
|          Airline|  Source|Destination|           Dep_Time|Arrival_Time|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_hr|Duration_min|Duration_(min)|
+-----------------+--------+-----------+-------------------+------------+-----------+-----+---+-----+------------+---------+--------+-----------+-----------+------------+--------------+
|           IndiGo|Banglore|  New Delhi|2022-12-30 22:20:00|01:10 22 Mar|   non-stop| 3897|  1|    3|           1|       22|      20|         10|          2|          50|           170|
|        Air India| Kolkata|   Banglore|2022-12-30 05:50:00|       13:15|    2 stops| 7662|  4|    5|          13|       05|      50|         15|          7|          25|           445|
|      Jet Airways|   Delhi|     Cochin|2022-12-30 09:25:00|04:25 10 J

In [24]:
df1 = df1.drop('Duration_hr','Duration_min','Dep_Time','Arrival_Time')

In [25]:
df1.show()

+-----------------+--------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|          Airline|  Source|Destination|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|
+-----------------+--------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|           IndiGo|Banglore|  New Delhi|   non-stop| 3897|  1|    3|           1|       22|      20|         10|           170|
|        Air India| Kolkata|   Banglore|    2 stops| 7662|  4|    5|          13|       05|      50|         15|           445|
|      Jet Airways|   Delhi|     Cochin|    2 stops|13882|  1|    6|           4|       09|      25|         25|          1140|
|           IndiGo| Kolkata|   Banglore|     1 stop| 6218|  1|    5|          23|       18|      05|         30|           325|
|           IndiGo|Banglore|  New Delhi|     1 stop|13302|  6|    3|          21|       16|      50|    

In [26]:

from pyspark.sql.functions import col,isnan,when,count
df_Columns=["Airline","Source","Destination","Total_Stops","Price","day","month","Arrival_hour","dept_hour","dept_min","Arrival_min","Duration_(min)"]
df1.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_Columns]).show()

+-------+------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|Airline|Source|Destination|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|
+-------+------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|      0|     0|          0|          1|    0|  0|    0|           0|        0|       0|          0|             1|
+-------+------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+



In [27]:
df1 = df1.na.drop('any')

In [28]:
df1.show()

+-----------------+--------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|          Airline|  Source|Destination|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|
+-----------------+--------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|           IndiGo|Banglore|  New Delhi|   non-stop| 3897|  1|    3|           1|       22|      20|         10|           170|
|        Air India| Kolkata|   Banglore|    2 stops| 7662|  4|    5|          13|       05|      50|         15|           445|
|      Jet Airways|   Delhi|     Cochin|    2 stops|13882|  1|    6|           4|       09|      25|         25|          1140|
|           IndiGo| Kolkata|   Banglore|     1 stop| 6218|  1|    5|          23|       18|      05|         30|           325|
|           IndiGo|Banglore|  New Delhi|     1 stop|13302|  6|    3|          21|       16|      50|    

In [29]:
df_Columns=["Airline","Source","Destination","Total_Stops","Price","day","month","Arrival_hour","dept_hour","dept_min","Arrival_min","Duration_(min)"]
df1.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_Columns]).show()

+-------+------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|Airline|Source|Destination|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|
+-------+------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|      0|     0|          0|          0|    0|  0|    0|           0|        0|       0|          0|             0|
+-------+------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+



In [30]:
# checking top 10 prefered airlines
res =df1.groupBy('Airline').count().alias('count').sort(desc('count')).limit(10)
res.show()

+--------------------+-----+
|             Airline|count|
+--------------------+-----+
|         Jet Airways| 3849|
|              IndiGo| 2053|
|           Air India| 1750|
|   Multiple carriers| 1196|
|            SpiceJet|  818|
|             Vistara|  479|
|            Air Asia|  319|
|               GoAir|  194|
|Multiple carriers...|   13|
|Jet Airways Business|    6|
+--------------------+-----+



In [31]:
# In SQL
df1.createOrReplaceTempView('dataframe')


In [32]:
result = sc.sql("SELECT Airline,\
COUNT(Airline)\
FROM dataframe \
GROUP BY Airline \
ORDER BY COUNT(Airline)\
DESC LIMIT 10 ").show()

+--------------------+--------------+
|             Airline|count(Airline)|
+--------------------+--------------+
|         Jet Airways|          3849|
|              IndiGo|          2053|
|           Air India|          1750|
|   Multiple carriers|          1196|
|            SpiceJet|           818|
|             Vistara|           479|
|            Air Asia|           319|
|               GoAir|           194|
|Multiple carriers...|            13|
|Jet Airways Business|             6|
+--------------------+--------------+



In [33]:
# extracting data with jetairways

jetAir = df1.filter(df1.Airline == 'Jet Airways')
jetAir.show()

+-----------+--------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|    Airline|  Source|Destination|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|
+-----------+--------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|Jet Airways|   Delhi|     Cochin|    2 stops|13882|  1|    6|           4|       09|      25|         25|          1140|
|Jet Airways|Banglore|  New Delhi|     1 stop|11087|  3|    3|          10|       18|      55|         25|           930|
|Jet Airways|Banglore|  New Delhi|     1 stop|22270|  6|    3|           5|       08|      00|          5|          1265|
|Jet Airways|Banglore|  New Delhi|     1 stop|11087|  3|    3|          10|       08|      55|         25|          1530|
|Jet Airways| Kolkata|   Banglore|     1 stop| 9663|  5|    5|           9|       21|      10|         20|           730|
|Jet Airways|   Delhi|  

In [34]:
from pyspark.sql import functions as F
from pyspark.sql.functions import rank, sum, col, desc
from pyspark.sql import Window

In [35]:
window = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

In [36]:
# price stats for different airlines
airways = df1.select(['Airline','Price']).\
            groupBy('Airline').\
            agg(
                F.min('Price').alias('min_price'),
                F.format_number(mean('Price'),2).alias('mean_price'),
                F.max('Price').alias('max_price'),
                F.format_number(stddev('Price'),2).alias('std_price')
            ).sort(desc(mean('Price')))

In [37]:
airways.toPandas()

Unnamed: 0,Airline,min_price,mean_price,max_price,std_price
0,Jet Airways Business,46490,58358.67,79512,11667.6
1,Jet Airways,1840,11643.92,54826,4258.94
2,Multiple carriers Premium economy,9845,11418.85,14629,1717.15
3,Multiple carriers,5797,10902.68,36983,3721.23
4,Air India,2050,9608.02,31945,3898.49
5,Vistara Premium economy,5969,8962.33,11793,2915.41
6,Vistara,3687,7796.35,21730,2914.3
7,GoAir,3398,5861.06,22794,2703.59
8,IndiGo,2227,5673.68,22153,2264.14
9,Air Asia,3383,5590.26,13774,2027.36


In [38]:
# groupby total_stops
result = sc.sql("SELECT Total_Stops,\
COUNT(Total_Stops)\
FROM dataframe \
GROUP BY Total_Stops \
ORDER BY COUNT(Total_Stops)\
DESC LIMIT 10 ").show()

+-----------+------------------+
|Total_Stops|count(Total_Stops)|
+-----------+------------------+
|     1 stop|              5625|
|   non-stop|              3491|
|    2 stops|              1519|
|    3 stops|                45|
|    4 stops|                 1|
+-----------+------------------+



In [39]:
# replace non-stop with 0 stops and remove stop and stops form the col

# df1 = df1.withColumn('Stops', when(df1.Total_Stops == 'non-stop', 0)
#                             .when(df1.Total_Stops == '1 stop', 1)
#                             .when(df1.Total_Stops == '2 stops', 2)
#                             .when(df1.Total_Stops == '3 stops', 3)
#                             .when(df1.Total_Stops == '4 stops', 4) )


In [40]:
df1.show()

+-----------------+--------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|          Airline|  Source|Destination|Total_Stops|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|
+-----------------+--------+-----------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+
|           IndiGo|Banglore|  New Delhi|   non-stop| 3897|  1|    3|           1|       22|      20|         10|           170|
|        Air India| Kolkata|   Banglore|    2 stops| 7662|  4|    5|          13|       05|      50|         15|           445|
|      Jet Airways|   Delhi|     Cochin|    2 stops|13882|  1|    6|           4|       09|      25|         25|          1140|
|           IndiGo| Kolkata|   Banglore|     1 stop| 6218|  1|    5|          23|       18|      05|         30|           325|
|           IndiGo|Banglore|  New Delhi|     1 stop|13302|  6|    3|          21|       16|      50|    

In [41]:
df1.createOrReplaceTempView('dataframe')

df1 = sc.sql("SELECT *,\
    CASE\
    WHEN Total_Stops = '1 stop' THEN 1 \
    WHEN Total_Stops = '2 stops' THEN 2 \
    WHEN Total_Stops = '3 stops' THEN 3 \
    WHEN Total_Stops = '4 stops' THEN 4 \
ELSE 0 END AS Stops FROM dataframe").drop('Total_Stops')

In [42]:
df1.show(3)

+-----------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+-----+
|    Airline|  Source|Destination|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|Stops|
+-----------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+-----+
|     IndiGo|Banglore|  New Delhi| 3897|  1|    3|           1|       22|      20|         10|           170|    0|
|  Air India| Kolkata|   Banglore| 7662|  4|    5|          13|       05|      50|         15|           445|    2|
|Jet Airways|   Delhi|     Cochin|13882|  1|    6|           4|       09|      25|         25|          1140|    2|
+-----------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+-----+
only showing top 3 rows



In [43]:
# price stats for different airlines with number of stops
stops = df1.select(['Airline','Price','Stops']).\
            groupBy('Airline','Stops').\
            agg(
                F.min('Price').alias('min_price'),
                F.format_number(mean('Price'),2).alias('mean_price'),
                F.max('Price').alias('max_price'),
                F.format_number(stddev('Price'),2).alias('std_price')
            ).sort(asc('Airline'))

In [44]:
stops.show(10)

+---------+-----+---------+----------+---------+---------+
|  Airline|Stops|min_price|mean_price|max_price|std_price|
+---------+-----+---------+----------+---------+---------+
| Air Asia|    0|     3383|  4,492.33|    10873| 1,161.58|
| Air Asia|    1|     5162|  7,078.33|    13774| 2,016.97|
| Air Asia|    2|     5192|  6,341.89|    11245| 2,077.51|
|Air India|    0|     2050|  5,605.33|    31945| 2,892.98|
|Air India|    1|     4227|  9,041.51|    28322| 3,189.03|
|Air India|    3|     8607| 12,208.05|    18293| 2,112.00|
|Air India|    4|    17686| 17,686.00|    17686|     null|
|Air India|    2|     4647| 12,085.85|    31783| 2,738.07|
|    GoAir|    0|     3398|  4,726.17|    18558| 2,207.45|
|    GoAir|    1|     3673|  6,884.68|    22794| 2,710.30|
+---------+-----+---------+----------+---------+---------+
only showing top 10 rows



In [45]:
# price stats for different airlines with number of stops in SQL
df1.createOrReplaceTempView('dataframe')
res = sc.sql(
            "SELECT Airline, Stops,\
                COUNT(Stops) as stop_count,MIN(Price) AS min_price,\
                MAX(Price) AS max_price, ROUND(AVG(Price),2) AS mean_price,\
                ROUND(STDDEV(Price),2) AS std_price\
            FROM dataframe\
            GROUP BY Airline, Stops\
                ORDER BY Airline\
            ASC"
             )

In [46]:
res.show(10)

+---------+-----+----------+---------+---------+----------+---------+
|  Airline|Stops|stop_count|min_price|max_price|mean_price|std_price|
+---------+-----+----------+---------+---------+----------+---------+
| Air Asia|    0|       181|     3383|    10873|   4492.33|  1161.58|
| Air Asia|    1|       129|     5162|    13774|   7078.33|  2016.97|
| Air Asia|    2|         9|     5192|    11245|   6341.89|  2077.51|
|Air India|    0|       417|     2050|    31945|   5605.33|  2892.98|
|Air India|    1|       540|     4227|    28322|   9041.51|  3189.03|
|Air India|    3|        37|     8607|    18293|  12208.05|   2112.0|
|Air India|    4|         1|    17686|    17686|   17686.0|     null|
|Air India|    2|       755|     4647|    31783|  12085.85|  2738.07|
|    GoAir|    0|        92|     3398|    18558|   4726.17|  2207.45|
|    GoAir|    1|       102|     3673|    22794|   6884.68|   2710.3|
+---------+-----+----------+---------+---------+----------+---------+
only showing top 10 

In [47]:
df1.show(3)

+-----------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+-----+
|    Airline|  Source|Destination|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|Stops|
+-----------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+-----+
|     IndiGo|Banglore|  New Delhi| 3897|  1|    3|           1|       22|      20|         10|           170|    0|
|  Air India| Kolkata|   Banglore| 7662|  4|    5|          13|       05|      50|         15|           445|    2|
|Jet Airways|   Delhi|     Cochin|13882|  1|    6|           4|       09|      25|         25|          1140|    2|
+-----------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+-----+
only showing top 3 rows



In [48]:
df1.createOrReplaceTempView('dataframe')

result = sc.sql(
            "SELECT Airline, Source, Destination, Price\
            FROM dataframe\
            WHERE\
            Price = (SELECT MAX(Price) AS max_price\
            FROM dataframe)")

result.show()

+--------------------+--------+-----------+-----+
|             Airline|  Source|Destination|Price|
+--------------------+--------+-----------+-----+
|Jet Airways Business|Banglore|  New Delhi|79512|
+--------------------+--------+-----------+-----+



In [49]:
# Extracting data for Jet Airways
df1.createOrReplaceTempView('dataframe')

result = sc.sql("SELECT * FROM dataframe WHERE Airline IN ('Jet Airways')")

In [50]:
result.show()

+-----------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+-----+
|    Airline|  Source|Destination|Price|day|month|Arrival_hour|dept_hour|dept_min|Arrival_min|Duration_(min)|Stops|
+-----------+--------+-----------+-----+---+-----+------------+---------+--------+-----------+--------------+-----+
|Jet Airways|   Delhi|     Cochin|13882|  1|    6|           4|       09|      25|         25|          1140|    2|
|Jet Airways|Banglore|  New Delhi|11087|  3|    3|          10|       18|      55|         25|           930|    1|
|Jet Airways|Banglore|  New Delhi|22270|  6|    3|           5|       08|      00|          5|          1265|    1|
|Jet Airways|Banglore|  New Delhi|11087|  3|    3|          10|       08|      55|         25|          1530|    1|
|Jet Airways| Kolkata|   Banglore| 9663|  5|    5|           9|       21|      10|         20|           730|    1|
|Jet Airways|   Delhi|     Cochin|10262|  4|    6|          12|       14

In [51]:
# descriptive stats for Jet Airways
result.createOrReplaceTempView('dataframe')

res = sc.sql(
            "SELECT Airline, Stops,\
                COUNT(Stops) as stop_count,MIN(Price) AS min_price,\
                MAX(Price) AS max_price, ROUND(AVG(Price),2) AS mean_price,\
                ROUND(STDDEV(Price),2) AS std_price\
            FROM dataframe\
            GROUP BY Airline, Stops\
                ORDER BY Airline\
            ASC"
             )

In [52]:
res.show()

+-----------+-----+----------+---------+---------+----------+---------+
|    Airline|Stops|stop_count|min_price|max_price|mean_price|std_price|
+-----------+-----+----------+---------+---------+----------+---------+
|Jet Airways|    1|      2535|     4757|    54826|  12512.74|  3957.63|
|Jet Airways|    2|       691|     6643|    24210|  13387.25|  2754.28|
|Jet Airways|    0|       623|     1840|    18308|   6175.08|  1892.08|
+-----------+-----+----------+---------+---------+----------+---------+



In [53]:
result.createOrReplaceTempView('dataframe')
res_new = sc.sql(
             "SELECT Airline, Stops,Source, Destination,\
                COUNT(Stops) as stop_count,MIN(Price) AS min_price,\
                MAX(Price) AS max_price, ROUND(AVG(Price),2) AS mean_price,\
                ROUND(STDDEV(Price),2) AS std_price\
            FROM dataframe\
            WHERE (Source = 'Banglore' AND (Destination = 'Delhi' OR Destination = 'New Delhi'))\
            GROUP BY Airline, Stops, Source, Destination\
                ORDER BY max_price\
            DESC"
             )

In [54]:
res_new.show()

+-----------+-----+--------+-----------+----------+---------+---------+----------+---------+
|    Airline|Stops|  Source|Destination|stop_count|min_price|max_price|mean_price|std_price|
+-----------+-----+--------+-----------+----------+---------+---------+----------+---------+
|Jet Airways|    1|Banglore|  New Delhi|       403|     5853|    54826|  15263.69|  7270.94|
|Jet Airways|    2|Banglore|  New Delhi|         3|    11245|    23001|  16861.33|  5895.45|
|Jet Airways|    0|Banglore|  New Delhi|        12|     7229|    18308|   9206.67|  4261.81|
|Jet Airways|    0|Banglore|      Delhi|       370|     3359|     8541|   6412.95|  1397.65|
+-----------+-----+--------+-----------+----------+---------+---------+----------+---------+



In [55]:
window = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

In [56]:
price_groubby = df1.select(['Airline','Price']).\
                groupBy('Airline').\
                agg(
                    F.count('Airline').alias('Airline_Count'),
                    F.mean('Price').alias('mea_price'),
                    F.min('Price').alias('min_price'),
                    F.max('Price').alias('max_Price')).\
                withColumn('total', sum(col('Airline_Count')).over(window)).\
                withColumn('Percent', col('Airline_Count')*100/col('total')).\
                drop(col('total')).sort(desc('Percent')
                )

In [57]:
price_groubby.toPandas()

Unnamed: 0,Airline,Airline_Count,mea_price,min_price,max_Price,Percent
0,Jet Airways,3849,11643.923357,1840,54826,36.035952
1,IndiGo,2053,5673.682903,2227,22153,19.221047
2,Air India,1750,9608.019429,2050,31945,16.384234
3,Multiple carriers,1196,10902.678094,5797,36983,11.197453
4,SpiceJet,818,4338.284841,1759,23267,7.658459
5,Vistara,479,7796.348643,3687,21730,4.484599
6,Air Asia,319,5590.260188,3383,13774,2.986612
7,GoAir,194,5861.056701,3398,22794,1.816309
8,Multiple carriers Premium economy,13,11418.846154,9845,14629,0.121711
9,Jet Airways Business,6,58358.666667,46490,79512,0.056175


In [58]:
df1.createOrReplaceTempView('dataframe')

newRes = sc.sql("SELECT Airline,COUNT(Airline) as AirlineCount,\
                    MIN(Price) AS min_price,\
                    ROUND(MEAN(Price),2) AS mean_Price,\
                    MAX(Price) AS max_price \
                    FROM dataframe\
                    GROUP BY Airline\
                    ORDER BY AirlineCount\
                    DESC")

In [59]:
newRes.show()

+--------------------+------------+---------+----------+---------+
|             Airline|AirlineCount|min_price|mean_Price|max_price|
+--------------------+------------+---------+----------+---------+
|         Jet Airways|        3849|     1840|  11643.92|    54826|
|              IndiGo|        2053|     2227|   5673.68|    22153|
|           Air India|        1750|     2050|   9608.02|    31945|
|   Multiple carriers|        1196|     5797|  10902.68|    36983|
|            SpiceJet|         818|     1759|   4338.28|    23267|
|             Vistara|         479|     3687|   7796.35|    21730|
|            Air Asia|         319|     3383|   5590.26|    13774|
|               GoAir|         194|     3398|   5861.06|    22794|
|Multiple carriers...|          13|     9845|  11418.85|    14629|
|Jet Airways Business|           6|    46490|  58358.67|    79512|
|Vistara Premium e...|           3|     5969|   8962.33|    11793|
|              Trujet|           1|     4140|    4140.0|     4