[![Open In Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/jia-wei-teh/flight-fare-prediction/blob/master/unfilled_demo.ipynb) 


In [None]:
import numpy as np
import pandas as pd

# plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

# handle warnings
import warnings
warnings.filterwarnings('ignore')

### Import Dataset

In [None]:
# read dataset from csv file


# Can also use .read_excel(). To use read_excel, it requires openpyxl (make sure you have it installed)
# df = pd.read_excel('flights_dataset.xlsx') # uncomment if you want to use read_excel

In [None]:
# inspecting the data (first 5 rows)


### Data Inspection (EDA)

In [None]:
# inspecting the columns in dataframe


In [None]:
# inspect info about dataframe


In [None]:
# check for missing values


In [None]:
# check rows and columns (shape)
# ( # rows, # columns)

In [None]:
# drop all rows with NaN values
# df = df.dropna() # uncomment if you want to use re-assignment


In [None]:
# inspect info about dataframe again


### Feature Engineering


In [None]:
# view first 5 rows


In [None]:
# get metadata (day, month, day_of_week) from the column "Date_of_Journey"


In [None]:
# Quick demo on how `to_datetime` works
# Parse a day/month/year string into a pandas Timestamp. The explicit `format`
# fixes the field order, so "31/08" is read as 31 August.
demo_date = pd.to_datetime("31/08/1957", format="%d/%m/%Y")

# The lines below are bare expressions: in a notebook cell only the value of
# the last one is displayed by default; wrap a line in print(...) to see it.
demo_date.day # day of month -> 31
demo_date.month # month -> 8
demo_date.year # year -> 1957
demo_date.day_of_week # Monday is 0, Sunday is 6 -> 5
demo_date.day_name() # day of week (string) -> "Saturday"

In [None]:
# drop the column Date_of_Journey


In [None]:
# Similar to Date_of_Journey we can extract values (hour, minute) from Dep_Time
# Extracting Hours


In [None]:
# drop Dep Time column


In [None]:
# inspect current dataframe with .head()


In [None]:
# create a function to convert the Duration column to minutes
def convert_duration_to_minute(duration_string):
    """
    Converts a duration string to minutes

    Accepted shapes are "<H>h <M>m", "<H>h" and "<M>m" (e.g. "2h 30m", "1h",
    "30m"). Parts may be separated or surrounded by any amount of whitespace.

    Parameters
    ----------
    duration_string : str
        The duration text to convert.

    Returns
    -------
    int
        Total number of minutes; 0 for an empty or whitespace-only string.

    Raises
    ------
    ValueError
        If the hour or minute part does not start with an integer.
    """
    # split() with no argument splits on any run of whitespace and ignores
    # leading/trailing blanks, so " 2h  30m " parses exactly like "2h 30m"
    duration_list = duration_string.split()  # e.g. [ "1h", "30m" ]

    if not duration_list:
        return 0

    if len(duration_list) != 2:
        # not exactly two parts (typically only one unit was given):
        # pad the missing unit with zero so both lookups below succeed
        if "h" in duration_list[0]:
            duration_list.append("0m")
        else:
            duration_list.insert(0, "0h")

    hours = int(duration_list[0].split("h")[0])
    minutes = int(duration_list[1].split("m")[0])

    return hours * 60 + minutes

In [None]:
# unit tests to ensure the function works
assert convert_duration_to_minute("2h 30m") == 150  # hours and minutes
assert convert_duration_to_minute("1h 30m") == 90
assert convert_duration_to_minute("1h") == 60  # hours only: minutes default to 0
assert convert_duration_to_minute("30m") == 30  # minutes only: hours default to 0
assert convert_duration_to_minute("") == 0  # empty string counts as zero minutes

# assert convert_duration_to_minute("1h 30m") == 60 # False assertion (uncomment to see error raised)

In [None]:
# convert the column Duration to minutes (new column Duration_min)
# using .apply

# list comprehension
# df["Duration_min"] = [ convert_duration_to_minute(duration) for duration in df["Duration"] ] 

In [None]:
# drop Duration column


In [None]:
# head


### Handling Categorical Data

In [None]:
# count the number of occurrences of each airline


In [None]:
# Quick plot price distribution for each airline 


In [None]:
# convert Airline to dummy variables (OnehotEncoding)


In [None]:
# determine count of each "Source"


In [None]:
# plot distribution of price by Source


In [None]:
# convert Source to dummy variables


In [None]:
# convert Destination to dummy variables


In [None]:
# inspect Additional_Info from dataframe


# we observed majority of Additional_Info is 'No info'
# Route is also too complicated to be useful (correlated with Total_Stops)


In [None]:
# drop the columns "Airline", "Source", "Destination", "Additional_Info", "Route"



In [None]:
# head 


In [None]:
# check out sample values (unique) in Total_Stops


In [None]:
# using .map to convert Total_Stops to count of stops (int) - N_stop
# mapping used: {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}


In [None]:
# drop the columns "Total_Stops", "Arrival_Time"


In [None]:
# concatenate the dataframes (df , airlines, source, destination)


# and check out the dataframe


In [None]:
# Raise pandas' display limits to 500 rows and 500 columns so that more of
# the DataFrame is visible when it is printed
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

### Handling Outliers (Subjective)

In [None]:
# boxplot on target variable- Price


In [None]:
# if price > 40k, then assign to median price


# using list comprehension to assign median price to all prices > 40k


In [None]:
# boxplot for Target column (expected to see no more values > 40k)


In [None]:
# drop Price column


In [None]:
# (Optional) Write dataframe into a csv file

 

In [None]:
# see all data


### Modelling

In [None]:
# split data into train and test 
# test_size: proportion of data to be used for testing
# random_state: seed for random number generator

from sklearn.model_selection import train_test_split


In [None]:
## Linear Regression
from sklearn.linear_model import LinearRegression

# instantiate the model and fit the model

# calc training score


# print training score


In [None]:
# Evaluation of model
from sklearn.metrics import r2_score



In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# print('MAE: {}'.format(mean_absolute_error(y_test, y_pred)))
# print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
# print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred))))

In [None]:
# combine into a function with model as input 
def train_and_predict(model, X_train, X_test, y_train, y_test):
    """
    Fit `model` on the training split, then report how it does on the test split.

    Prints the model, the value of `model.score` on the training data, and
    finally the test-set metrics printed by `predict_with_metrics`.
    The model is fitted in place; nothing is returned.
    """
    print("Model: ", model)

    # fit first: the score below is taken from the fitted model
    model.fit(X_train, y_train)
    training_score = model.score(X_train, y_train)
    print("Training Score: ", training_score)

    # hand over to the shared helper for test-set prediction and metrics
    predict_with_metrics(model, X_test, y_test)


def predict_with_metrics(model, X_test, y_test):
    """
    Predict on test data and print metrics

    Calls `model.predict(X_test)` and prints four scores of the predictions
    against `y_test`: R2, MAE, MSE and RMSE (the square root of MSE).

    Relies on `r2_score`, `mean_absolute_error`, `mean_squared_error` and
    `np` having been imported by earlier cells. Returns nothing.
    """

    y_pred = model.predict(X_test)
    r2score = r2_score(y_test, y_pred)
    print("R2 Score: ", r2score)

    mae = mean_absolute_error(y_test, y_pred)
    # compute MSE once and reuse it for the RMSE line
    mse = mean_squared_error(y_test, y_pred)

    print('MAE: {}'.format(mae))
    print('MSE: {}'.format(mse))
    print('RMSE: {}'.format(np.sqrt(mse)))


In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor



In [None]:
# get feature importance
# Label each importance score with its feature name by using the training
# columns as the Series index.
# NOTE(review): `reg_rf` and `X_train` are not assigned anywhere in this
# unfilled notebook; presumably `reg_rf` is the Random Forest fitted in the
# previous cell and `X_train` comes from the train/test split - confirm both
# cells have been run before this one.
feat_importance = pd.Series(reg_rf.feature_importances_ , index = X_train.columns)

# plot top 20 features


In [None]:
## XGBoost Regressor 
from xgboost import XGBRegressor


In [None]:
# plot feature importance (with xgboost)
from xgboost import plot_importance 



### Hyperparameter Tuning

Generally, there are two types of hyperparameter tuning methods:
- RandomizedSearchCV (Faster)
- GridSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV 

# Candidate values for each hyperparameter of the search
# (3 * 7 * 3 * 3 * 3 = 567 possible combinations in total)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=list(range(2, 9)),  # 2, 3, ..., 8
    learning_rate=[0.01, 0.1, 1.0],
    subsample=[0.5, 0.75, 0.9],
    colsample_bytree=[0.5, 0.75, 0.9],
)

# estimator : The estimator being fit, here, it's the XGBoost
# param_distributions : distribution of the possible hyperparameters
# cv : number of cross-validation folds
# n_iter : number of hyperparameter combinations to sample
# verbose: (2) print more output 

# xgb_model_tuned = RandomizedSearchCV( .... )

In [None]:
# fit the model to the training data (with hyperparameters)


In [None]:
# view best param based on the combination above


In [None]:
# predict on test data and its metrics
