### Project Name: Flight Prediction Analysis

### Project Objective:
##### The goal of this project is to analyse and clean the data so that it can be used to predict flight prices from a variety of variables. The number of people who fly has increased dramatically in recent years. Prices change dynamically owing to many variables, which makes it difficult for airlines to maintain them. We therefore clean and analyse the data so that machine learning models can use it for flight fare prediction.

### Analysis Phases:
1. Data collection
2. Data Cleaning and Preparation

In [4]:
# importing Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Data Collection/Reading

In [5]:
# Load the flight data set from the CSV file in the working directory.
final_df = pd.read_csv(filepath_or_buffer="flight_prediction.csv")

### Data Understanding

In [9]:
# Display the column labels of the DataFrame.
final_df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [10]:
# Lower-case the column labels. The labels are listed explicitly so that
# only these columns are renamed.
final_df = final_df.rename(
    columns={
        label: label.lower()
        for label in (
            'Airline',
            'Date_of_Journey',
            'Source',
            'Destination',
            'Route',
            'Dep_Time',
            'Arrival_Time',
            'Duration',
            'Total_Stops',
            'Additional_Info',
            'Price',
        )
    }
)

In [11]:
# Print the dtype and non-null count of every column.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13354 entries, 0 to 13353
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          13354 non-null  object 
 1   date_of_journey  13354 non-null  object 
 2   source           13354 non-null  object 
 3   destination      13354 non-null  object 
 4   route            13353 non-null  object 
 5   dep_time         13354 non-null  object 
 6   arrival_time     13354 non-null  object 
 7   duration         13354 non-null  object 
 8   total_stops      13353 non-null  object 
 9   additional_info  13354 non-null  object 
 10  price            10683 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.1+ MB


In [8]:
# Display the (rows, columns) size of the DataFrame.
final_df.shape

(13354, 11)

### Observations:
- Number of variables = 11
- Number of rows = 13354
- Number of categorical (object-dtype) features = 10
- Number of numerical features = 1 (`price`, float64)

### Data Preparation/Cleaning

In [10]:
def data_cleaning(dataframe):
    """Split the journey date, clock times and duration text into integer columns.

    The frame is modified in place and the same object is returned.

    Input columns are looked up case-insensitively, so the function works
    with the raw CSV headers (``Date_of_Journey``) as well as with
    lower-cased headers (``date_of_journey``).

    Columns added (all integers): ``Date``, ``Month``, ``Year``,
    ``Arrival_Hour``, ``Arrival_Min``, ``Dept_Hour``, ``Dept_Min`` and
    ``Duration_hour``.  The journey-date, arrival-time and departure-time
    columns are dropped; the duration column is kept.

    Rows whose duration has no hour component (for example ``'5m'``) are
    treated as data-entry errors and dropped.

    Args:
        dataframe: DataFrame holding the journey date (``dd/mm/yyyy``), the
            arrival time (``HH:MM``, optionally followed by a date), the
            departure time (``HH:MM``) and the duration (``'2h 50m'``).

    Returns:
        The same ``dataframe`` object after cleaning.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Map lower-cased label -> actual label so either header style is accepted.
    actual = {str(name).lower(): name for name in dataframe.columns}
    journey_col = actual.get('date_of_journey', 'Date_of_Journey')
    arrival_col = actual.get('arrival_time', 'Arrival_Time')
    departure_col = actual.get('dep_time', 'Dep_Time')
    duration_col = actual.get('duration', 'Duration')

    # Journey date "dd/mm/yyyy" -> Date, Month, Year.
    journey_parts = dataframe[journey_col].str.split('/')
    print("Split of date_of_journey - done")
    dataframe['Date'] = journey_parts.str[0].astype(int)
    dataframe['Month'] = journey_parts.str[1].astype(int)
    dataframe['Year'] = journey_parts.str[2].astype(int)
    print("Convert date, month and year into integer - done")
    dataframe.drop(journey_col, axis=1, inplace=True)

    # Arrival time may carry a trailing date ("01:10 22 Mar"); only the
    # clock part before the first space is of interest.
    arrival_parts = dataframe[arrival_col].str.split(' ').str[0].str.split(':')
    dataframe['Arrival_Hour'] = arrival_parts.str[0].astype(int)
    dataframe['Arrival_Min'] = arrival_parts.str[1].astype(int)
    dataframe.drop(arrival_col, axis=1, inplace=True)

    # Departure time "HH:MM" -> Dept_Hour, Dept_Min.
    departure_parts = dataframe[departure_col].str.split(':')
    dataframe['Dept_Hour'] = departure_parts.str[0].astype(int)
    dataframe['Dept_Min'] = departure_parts.str[1].astype(int)
    dataframe.drop(departure_col, axis=1, inplace=True)

    # Hour component of the duration ("2h 50m" -> "2", "19h" -> "19").
    hours = dataframe[duration_col].str.extract(r'(\d+)\s*h', expand=False)
    dataframe['Duration_hour'] = hours

    # A duration without an hour part (e.g. "5m") is not a plausible flight
    # time.  Drop such rows by content rather than by hard-coded row label,
    # so the function works whatever the frame's index looks like.
    invalid_rows = dataframe.index[hours.isna().to_numpy()]
    dataframe.drop(index=invalid_rows, inplace=True)
    dataframe['Duration_hour'] = dataframe['Duration_hour'].astype(int)

    return dataframe

In [None]:
def data_preperation(dataframe):
    """Turn the remaining text columns of the flight data into integers.

    The frame is modified in place and the same object is returned.

    Input columns are looked up case-insensitively, so the function works
    with the raw CSV headers (``Total_Stops``) as well as with lower-cased
    headers (``total_stops``).

    Steps:
        * ``Duration_min`` is added with the minute part of the duration
          (0 when the duration has no minute part); the duration column is
          then dropped.
        * The total-stops labels are recoded to the number of stops; a
          missing value is filled with 1.
        * The route column is dropped.
        * Airline, source, destination and additional-info are label
          encoded: each value is replaced by its position among the
          column's sorted unique values (a missing value becomes -1).

    Args:
        dataframe: DataFrame holding the duration, total-stops, route,
            airline, source, destination and additional-info columns.

    Returns:
        The same ``dataframe`` object after preparation.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If the total-stops column holds a label that is not in
            the recode table.
    """
    # Map lower-cased label -> actual label so either header style is accepted.
    actual = {str(name).lower(): name for name in dataframe.columns}
    duration_col = actual.get('duration', 'Duration')
    stops_col = actual.get('total_stops', 'Total_Stops')
    route_col = actual.get('route', 'Route')

    # Minute component of the duration ("2h 50m" -> 50, "19h" -> 0).
    minutes = dataframe[duration_col].str.extract(r'(\d+)\s*m', expand=False)
    dataframe['Duration_min'] = minutes.fillna('0').astype(int)
    dataframe.drop(duration_col, axis=1, inplace=True)

    # Recode the stop labels to the number of stops.
    recode = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
    raw_stops = dataframe[stops_col]
    stops = raw_stops.map(recode)
    unknown = raw_stops[stops.isna() & raw_stops.notna()]
    if not unknown.empty:
        # Fail loudly rather than silently treating an unknown label as 1 stop.
        raise ValueError(
            f"Unrecognised Total_Stops labels: {unknown.unique().tolist()}"
        )
    # A missing total-stops value is filled with 1 stop.
    dataframe[stops_col] = stops.fillna(1).astype(int)

    dataframe.drop(route_col, axis=1, inplace=True)

    # Label-encode the categorical columns: codes follow the sorted unique
    # values of each column.
    for label in ('Airline', 'Source', 'Destination', 'Additional_Info'):
        column = actual.get(label.lower(), label)
        codes, _ = dataframe[column].factorize(sort=True)
        dataframe[column] = codes

    # Summarise the resulting dtypes.
    dataframe.info()

    return dataframe
        
        
        
        
        
        
        