### 1. Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### 2. Reading the Data

In [2]:
proj_dir = r"Z:\AIML Projects\sagemaker-flight-prices-prediction"
data_dir = r"data"

In [3]:
def get_data(name):
    file_name = f'{name}.csv'
    file_path = os.path.join(proj_dir,data_dir,file_name)
    return pd.read_csv(file_path)

In [4]:
flights_data = get_data('flight_price')
flights_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [5]:
flights_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


- The dataset contains 10,683 rows and 11 features
- Col `Route` & `Total_Stops` have missing value each
- The data types of some features isn't appropriate

### 3. Preliminary Analysis

#### 3.1 Check Data Types

In [6]:
flights_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [7]:
flights_data.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [8]:
a_random_value = flights_data.Date_of_Journey.iloc[6]
a_random_value

'12/03/2019'

In [9]:
a_random_value = flights_data.Dep_Time.iloc[6]
a_random_value

'18:55'

In [10]:
a_random_value = flights_data.Arrival_Time.iloc[6]
a_random_value

'10:25 13 Mar'

#### 3.2 Check for Duplicates

In [11]:
flights_data.duplicated().sum()

np.int64(220)

In [12]:
(
    flights_data
    .loc[flights_data.duplicated(keep=False)]
    .sort_values(['Airline','Date_of_Journey','Source','Destination'])
).iloc[:]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6321,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
9848,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
572,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
8168,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
1495,Air India,1/04/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,01:20 02 Apr,15h 20m,2 stops,No info,10408
...,...,...,...,...,...,...,...,...,...,...,...
2692,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
2870,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
3711,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,20:30,23:20,2h 50m,non-stop,No check-in baggage included,3873
2634,Vistara,24/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,5403


#### 3.3 Observations

- The `Date_of_Journey`,`Dep_Time`,`Arrival_Time`is a string but it should be DateTime datatype
- The type of `Duration` should be either in min or hour choose one unit
- The `Total_Stops` is also mixed it should be {0,1,2 etc}

### 4. Detailed Analysis

#### Airline Col

In [13]:
flights_data.Airline.unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

- Some pf names of airlines have their class being included with the name of the airline therefore this should be resolved `'Jet Airways','Multiple carriers','Vistara','Vistara Premium economy', 'Jet Airways Business','Multiple carriers Premium economy'`

In [14]:
(
    flights_data
    .Airline
    .str.replace(' Premium economy','')
    .str.replace(' Business','')
    .str.title()
    .unique()
)

array(['Indigo', 'Air India', 'Jet Airways', 'Spicejet',
       'Multiple Carriers', 'Goair', 'Vistara', 'Air Asia', 'Trujet'],
      dtype=object)

In [15]:
(
    flights_data
    .Airline
    .str.replace(' Premium economy','')
    .str.replace(' Business','')
    .str.title()
)

0             Indigo
1          Air India
2        Jet Airways
3             Indigo
4             Indigo
            ...     
10678       Air Asia
10679      Air India
10680    Jet Airways
10681        Vistara
10682      Air India
Name: Airline, Length: 10683, dtype: object

#### Date_of_Journey Col

In [16]:
pd.to_datetime(flights_data.Date_of_Journey, dayfirst=True)

0       2019-03-24
1       2019-05-01
2       2019-06-09
3       2019-05-12
4       2019-03-01
           ...    
10678   2019-04-09
10679   2019-04-27
10680   2019-04-27
10681   2019-03-01
10682   2019-05-09
Name: Date_of_Journey, Length: 10683, dtype: datetime64[ns]

#### Source Col

In [17]:
flights_data.Source.unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [18]:
(
    flights_data
    .Source
    .str.replace('Delhi','New Delhi')
)

0         Banglore
1          Kolkata
2        New Delhi
3          Kolkata
4         Banglore
           ...    
10678      Kolkata
10679      Kolkata
10680     Banglore
10681     Banglore
10682    New Delhi
Name: Source, Length: 10683, dtype: object

#### Destination Col

In [19]:
flights_data.Destination.unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

#### Dep_Time Col

In [20]:
flights_data.Dep_Time

0        22:20
1        05:50
2        09:25
3        18:05
4        16:50
         ...  
10678    19:55
10679    20:45
10680    08:20
10681    11:30
10682    10:55
Name: Dep_Time, Length: 10683, dtype: object

In [21]:
(
    flights_data
    .Dep_Time
    # .loc[lambda ser: ser.str.contains("[0-9:]")]      without the carat it gives us which contains the 0 to 9 and : too  
    .loc[lambda ser: ser.str.contains("[^0-9:]")]    # with a carat it gives which doesn't contain 0 to 9 and : 
)

Series([], Name: Dep_Time, dtype: object)

- Series([], Name: Dep_Time, dtype: object)
this output means that it no element is there which is having anything apart from the 0 to 9 and :

In [22]:
pd.to_datetime(flights_data.Dep_Time,format="%H:%M")

0       1900-01-01 22:20:00
1       1900-01-01 05:50:00
2       1900-01-01 09:25:00
3       1900-01-01 18:05:00
4       1900-01-01 16:50:00
                ...        
10678   1900-01-01 19:55:00
10679   1900-01-01 20:45:00
10680   1900-01-01 08:20:00
10681   1900-01-01 11:30:00
10682   1900-01-01 10:55:00
Name: Dep_Time, Length: 10683, dtype: datetime64[ns]

In [23]:
pd.to_datetime(flights_data.Dep_Time,format="%H:%M").dt.time

0        22:20:00
1        05:50:00
2        09:25:00
3        18:05:00
4        16:50:00
           ...   
10678    19:55:00
10679    20:45:00
10680    08:20:00
10681    11:30:00
10682    10:55:00
Name: Dep_Time, Length: 10683, dtype: object

#### Arrival_Time Col

In [24]:
flights_data.Arrival_Time

0        01:10 22 Mar
1               13:15
2        04:25 10 Jun
3               23:30
4               21:35
             ...     
10678           22:25
10679           23:20
10680           11:20
10681           14:10
10682           19:15
Name: Arrival_Time, Length: 10683, dtype: object

In [25]:
(
    flights_data
    .Arrival_Time
    .loc[lambda ser: ser.str.contains("[^0-9:]")]
    .str.split(" ")
)

0        [01:10, 22, Mar]
2        [04:25, 10, Jun]
6        [10:25, 13, Mar]
7        [05:05, 02, Mar]
8        [10:25, 13, Mar]
               ...       
10666    [19:00, 13, Jun]
10667    [20:20, 13, Mar]
10672    [19:00, 28, Jun]
10673    [04:25, 28, May]
10674    [21:20, 13, Mar]
Name: Arrival_Time, Length: 4335, dtype: object

In [26]:
(
    flights_data
    .Arrival_Time
    .loc[lambda ser: ser.str.contains("[^0-9:]")]
    .str.split(" ",n=1)
    .str.get(0)
)

0        01:10
2        04:25
6        10:25
7        05:05
8        10:25
         ...  
10666    19:00
10667    20:20
10672    19:00
10673    04:25
10674    21:20
Name: Arrival_Time, Length: 4335, dtype: object

#### Duration Col

In [27]:
flights_data.Duration

0        2h 50m
1        7h 25m
2           19h
3        5h 25m
4        4h 45m
          ...  
10678    2h 30m
10679    2h 35m
10680        3h
10681    2h 40m
10682    8h 20m
Name: Duration, Length: 10683, dtype: object

In [28]:
(
    flights_data
    .Duration
    .loc[lambda ser: ~ser.str.contains("m")]
    .unique()
)

array(['19h', '23h', '22h', '12h', '3h', '5h', '10h', '18h', '24h', '15h',
       '16h', '8h', '14h', '20h', '13h', '11h', '9h', '27h', '26h', '4h',
       '7h', '30h', '21h', '28h', '47h', '6h', '25h', '38h', '34h'],
      dtype=object)

In [29]:
(
    flights_data
    .Duration
    .loc[lambda ser: ~ser.str.contains("h")]
    .unique()
)

array(['5m'], dtype=object)

In [30]:
(
    flights_data
    .Duration
    .loc[lambda ser: ~ser.str.contains("h")]
    # .unique()
)

6474    5m
Name: Duration, dtype: object

- The observation indexed 6474 has duration of 5 min. This seems wrong. Will delete this observation

In [31]:
(
    flights_data
    .Duration
    .str.split(" ")
)

0        [2h, 50m]
1        [7h, 25m]
2            [19h]
3        [5h, 25m]
4        [4h, 45m]
           ...    
10678    [2h, 30m]
10679    [2h, 35m]
10680         [3h]
10681    [2h, 40m]
10682    [8h, 20m]
Name: Duration, Length: 10683, dtype: object

In [32]:
(
    flights_data
    .Duration
    .str.split(" ",expand=True)
)

Unnamed: 0,0,1
0,2h,50m
1,7h,25m
2,19h,
3,5h,25m
4,4h,45m
...,...,...
10678,2h,30m
10679,2h,35m
10680,3h,
10681,2h,40m


In [33]:
(
    flights_data
    .Duration
    .str.split(" ",expand=True)
    .set_axis(["hour","minutes"],axis=1)
)

Unnamed: 0,hour,minutes
0,2h,50m
1,7h,25m
2,19h,
3,5h,25m
4,4h,45m
...,...,...
10678,2h,30m
10679,2h,35m
10680,3h,
10681,2h,40m


In [34]:
(
    flights_data
    .Duration
    .str.split(" ",expand=True)
    .set_axis(["hour","minutes"],axis=1)
    .assign(
        hour=lambda df_:(
            df_
            .hour
            .str.replace("h","")
        ),
        minutes=lambda df_:(
            df_
            .minutes
            .str.replace("m","")
        )
    )
)

Unnamed: 0,hour,minutes
0,2,50
1,7,25
2,19,
3,5,25
4,4,45
...,...,...
10678,2,30
10679,2,35
10680,3,
10681,2,40


In [35]:
(
    flights_data
    .Duration
    .str.split(" ",expand=True)
    .set_axis(["hour","minutes"],axis=1)
    .assign(
        hour=lambda df_:(
            df_
            .hour
            .str.replace("h","")
        ),
        minutes=lambda df_:(
            df_
            .minutes
            .str.replace("m","")
        )
    )
    .isna().sum()
)

hour          0
minutes    1032
dtype: int64

In [36]:
(
    flights_data
    .Duration
    .drop(index=6474)
    .str.split(" ", expand=True)
    .set_axis(["hour","minutes"], axis=1)
    .assign(
        hour=lambda df_:(
            df_
            .hour
            .str.replace("h","")
            .astype(int)
            .mul(60)
        ),
        minutes=lambda df_:(
            df_
            .minutes
            .str.replace("m","")
            .fillna("0")
            .astype(int)
        )
    )
)

Unnamed: 0,hour,minutes
0,120,50
1,420,25
2,1140,0
3,300,25
4,240,45
...,...,...
10678,120,30
10679,120,35
10680,180,0
10681,120,40


In [37]:
(
    flights_data
    .Duration
    .drop(index=6474)
    .str.split(" ", expand=True)
    .set_axis(["hour","minutes"], axis=1)
    .assign(
        hour=lambda df_:(
            df_
            .hour
            .str.replace("h","")
            .astype(int)
            .mul(60)
        ),
        minutes=lambda df_:(
            df_
            .minutes
            .str.replace("m","")
            .fillna("0")
            .astype(int)
        )
    )
    .dtypes
)

hour       int64
minutes    int64
dtype: object

In [38]:
(
    flights_data
    .Duration
    .drop(index=6474)
    .str.split(" ", expand=True)
    .set_axis(["hour","minutes"], axis=1)
    .assign(
        hour=lambda df_:(
            df_
            .hour
            .str.replace("h","")
            .astype(int)
            .mul(60)
        ),
        minutes=lambda df_:(
            df_
            .minutes
            .str.replace("m","")
            .fillna("0")
            .astype(int)
        )
    )
    .sum(axis=1)
    .rename("duration_minutes")
)

0         170
1         445
2        1140
3         325
4         285
         ... 
10678     150
10679     155
10680     180
10681     160
10682     500
Name: duration_minutes, Length: 10682, dtype: int64

In [39]:
(
    flights_data
    .Duration
    .drop(index=6474)
    .str.split(" ", expand=True)
    .set_axis(["hour","minutes"], axis=1)
    .assign(
        hour=lambda df_:(
            df_
            .hour
            .str.replace("h","")
            .astype(int)
            .mul(60)
        ),
        minutes=lambda df_:(
            df_
            .minutes
            .str.replace("m","")
            .fillna("0")
            .astype(int)
        )
    )
    .sum(axis=1)
    .rename("duration_minutes")
    .to_frame()
    .join(flights_data.Duration.drop(index=6474))
)

Unnamed: 0,duration_minutes,Duration
0,170,2h 50m
1,445,7h 25m
2,1140,19h
3,325,5h 25m
4,285,4h 45m
...,...,...
10678,150,2h 30m
10679,155,2h 35m
10680,180,3h
10681,160,2h 40m


#### Total_Stops Col

In [40]:
(
    flights_data
    .Total_Stops
    .replace("non-stop",'0')
    .str.replace(" stops?","",regex=True) # " stop" " stops"  since the s after stop is optional for regex command
    .unique()
)

array(['0', '2', '1', '3', nan, '4'], dtype=object)

- nan is a float value it can't be converted to the integer this easy therefore will use pipe()

In [41]:
(
    flights_data
    .Total_Stops
    .replace("non-stop",'0')
    .str.replace(" stops?","",regex=True) # " stop" " stops"  since the s after stop is optional for regex command
    .pipe(lambda ser: pd.to_numeric(ser))
)

0        0.0
1        2.0
2        2.0
3        1.0
4        1.0
        ... 
10678    0.0
10679    0.0
10680    0.0
10681    0.0
10682    2.0
Name: Total_Stops, Length: 10683, dtype: float64

#### Additional_Info Col

In [42]:
flights_data.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

- Here the `No info` and `No Info` typo

### 5 Cleaning Operations

In [43]:
def convert_htom(ser):
    return (
    ser
    .str.split(" ", expand=True)
    .set_axis(["hour","minutes"], axis=1)
    .assign(
        hour=lambda df_:(
            df_
            .hour
            .str.replace("h","")
            .astype(int)
            .mul(60)
        ),
        minutes=lambda df_:(
            df_
            .minutes
            .str.replace("m","")
            .fillna("0")
            .astype(int)
        )
    )
    .sum(axis=1)
)

In [44]:
def clean_data(df):
    #this approach is called method chaining returns the copy doesn't change the original stuff
    return(
        df
        .drop(index=[6474])
        .drop_duplicates()
        # strip method removes leading or trailing whitespaces
        # assign creates or updates 
        .assign(**{
            col: df[col].str.strip()
            for col in df.select_dtypes(include='O').columns
        })
        .rename(columns=str.lower)
        .assign(
            airline=lambda df_:(
                df_
                .airline
                .str.replace(' Premium economy','')
                .str.replace(' Business','')
                .str.title()       
            ),  
            date_of_journey= lambda df_: pd.to_datetime(df_.date_of_journey, dayfirst=True),
            source = lambda df_: (
                df_
                .source
                .str.replace('Delhi','New Delhi')
            ),
            destination = lambda df_: (
                df_
                .destination
                .str.replace('New Delhi','Delhi')
                .str.replace('Delhi','New Delhi')
            ),
            dep_time = lambda df_:pd.to_datetime(df_.dep_time,format="%H:%M").dt.time,
            arrival_time = lambda df_:pd.to_datetime(df_.arrival_time).dt.time,
            duration = lambda df_: df_.duration.pipe(convert_htom),
            total_stops=lambda df_:(
                df_
                .total_stops
                .replace("non-stop","0")
                .str.replace(" stops?","", regex=True)
                .pipe(lambda ser: pd.to_numeric(ser))
            ),
            additional_info = lambda df_:(
                df_
                .additional_info
                .replace("No info","No Info")
            )
            
        )
        # route is kind of redundant
        .drop(columns='route')
    )
    

In [45]:
flights_data_cleaned = clean_data(flights_data)
flights_data_cleaned

  arrival_time = lambda df_:pd.to_datetime(df_.arrival_time).dt.time,


Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-03-24,Banglore,New Delhi,22:20:00,01:10:00,170,0.0,No Info,3897
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,445,2.0,No Info,7662
2,Jet Airways,2019-06-09,New Delhi,Cochin,09:25:00,04:25:00,1140,2.0,No Info,13882
3,Indigo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,325,1.0,No Info,6218
4,Indigo,2019-03-01,Banglore,New Delhi,16:50:00,21:35:00,285,1.0,No Info,13302
...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,2019-04-09,Kolkata,Banglore,19:55:00,22:25:00,150,0.0,No Info,4107
10679,Air India,2019-04-27,Kolkata,Banglore,20:45:00,23:20:00,155,0.0,No Info,4145
10680,Jet Airways,2019-04-27,Banglore,New Delhi,08:20:00,11:20:00,180,0.0,No Info,7229
10681,Vistara,2019-03-01,Banglore,New Delhi,11:30:00,14:10:00,160,0.0,No Info,12648


In [46]:
clean_data(flights_data).dtypes

  arrival_time = lambda df_:pd.to_datetime(df_.arrival_time).dt.time,


airline                    object
date_of_journey    datetime64[ns]
source                     object
destination                object
dep_time                   object
arrival_time               object
duration                    int64
total_stops               float64
additional_info            object
price                       int64
dtype: object

### 6. Split the Data

In [47]:
flights_final_data = flights_data_cleaned.sample(1000)

In [48]:
X = flights_final_data.drop(columns="price")
y = flights_final_data.price.copy()

In [49]:
X_,X_test,y_,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train,X_val,y_train,y_val = train_test_split(X_,y_,test_size=0.2,random_state=42)

print(X_train.shape, y_train.shape)
print(X_val.shape,y_val.shape)
print(X_test.shape,y_test.shape)

(640, 9) (640,)
(160, 9) (160,)
(200, 9) (200,)


### 7. Export the Subsets

In [50]:
def export_data(X,y,name):
    file_name = f"{name}.csv"
    file_path = os.path.join(proj_dir,data_dir,file_name)

    X.join(y).to_csv(file_path, index=False)

    return pd.read_csv(file_path).head()

In [51]:
export_data(X_train,y_train,"train")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-06,Kolkata,Banglore,08:25:00,04:40:00,1215,1.0,In-flight meal not included,8586
1,Jet Airways,2019-06-09,Kolkata,Banglore,21:10:00,09:20:00,730,1.0,In-flight meal not included,11841
2,Jet Airways,2019-05-21,New Delhi,Cochin,21:50:00,04:25:00,395,1.0,No Info,16079
3,Jet Airways,2019-03-18,Banglore,New Delhi,05:45:00,18:25:00,760,1.0,No Info,12284
4,Multiple Carriers,2019-05-27,New Delhi,Cochin,07:10:00,22:30:00,920,1.0,No Info,7878


In [52]:
export_data(X_val,y_val,"val")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-03-09,New Delhi,Cochin,10:55:00,19:15:00,1940,2.0,No Info,17476
1,Indigo,2019-06-03,New Delhi,Cochin,16:00:00,21:00:00,300,1.0,No Info,6593
2,Air Asia,2019-05-12,Kolkata,Banglore,07:35:00,23:30:00,955,1.0,No Info,5320
3,Jet Airways,2019-03-09,New Delhi,Cochin,13:45:00,18:50:00,1745,2.0,No Info,18752
4,Spicejet,2019-05-18,Kolkata,Banglore,14:55:00,17:25:00,150,0.0,No check-in baggage included,3841


In [53]:
export_data(X_test,y_test,"test")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-06,Banglore,New Delhi,19:50:00,22:50:00,180,0.0,In-flight meal not included,7229
1,Air Asia,2019-04-09,Kolkata,Banglore,06:50:00,10:30:00,220,1.0,No Info,5162
2,Indigo,2019-06-12,Banglore,New Delhi,06:05:00,08:50:00,165,0.0,No Info,3943
3,Indigo,2019-03-21,New Delhi,Cochin,14:30:00,17:35:00,185,0.0,No Info,5406
4,Multiple Carriers,2019-03-09,New Delhi,Cochin,07:00:00,19:45:00,765,1.0,No Info,15237
