In [None]:
import numpy as np
import pandas as pd

# plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

# handle warnings
import warnings
warnings.filterwarnings('ignore')

### Import Dataset

In [None]:
# read dataset from csv file
df = pd.read_csv('flights_dataset.csv')

# Can also use .read_excel(). To use read_excel, it requires openpyxl (make sure you have it installed)
# df = pd.read_excel('flights_dataset.xlsx') # uncomment if you want to use read_excel

In [None]:
# inspecting the data (first 5 rows)
df.head()

### Data Inspection (EDA)

In [None]:
# inspecting the columns in dataframe
df.columns

In [None]:
# inspect info about dataframe
df.info()

In [None]:
# check for missing values
df.isnull().sum() 

In [None]:
# check rows and columns (shape)
df.shape # ( # rows, # columns)

In [None]:
# drop all rows with NaN values
# df = df.dropna() # uncomment if you want to use re-assignment
df.dropna(inplace= True)

In [None]:
# inspect info about dataframe again
df.info()

### Feature Engineering


In [None]:
# view first 5 rows
df.head()

In [None]:
# get metadata (day, month, day_of_week) from the column "Date_of_Journey"
# parse the column once and reuse the result, instead of re-parsing it for every feature
journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")

df["Journey_day"] = journey_date.dt.day
df["Journey_month"] = journey_date.dt.month
df["Journey_dow"] = journey_date.dt.day_of_week  # Monday is 0, Sunday is 6

In [None]:
# Quick demo on how `to_datetime` works
demo_date = pd.to_datetime("31/08/1957", format="%d/%m/%Y")

demo_date.day # day of month 
demo_date.month # month     
demo_date.year # year
demo_date.day_of_week # Monday is 0, Sunday is 6
demo_date.day_name() # day of week (string)

In [None]:
# drop the column Date_of_Journey
df.drop(columns=["Date_of_Journey"], inplace=True) 

In [None]:
# Similar to Date_of_Journey we can extract values (hour, minute) from Dep_Time

# parse the column once, then extract both the hour and the minute of departure
dep_time = pd.to_datetime(df["Dep_Time"])

df["Dep_hour"] = dep_time.dt.hour
df["Dep_min"] = dep_time.dt.minute

In [None]:
# drop Dep Time column
df.drop(columns=["Dep_Time"], inplace=True) 

In [None]:
# inspect current dataframe with .head()
df.head()

In [None]:
# create a function to convert a Duration string (e.g. "2h 30m") to minutes
def convert_duration_to_minute(duration_string):
    """
    Converts a duration string to minutes

    The string is a whitespace-separated list of tokens, each one an integer
    followed by "h" (hours) or "m" (minutes), e.g. "2h 30m", "1h" or "30m".
    Leading, trailing and repeated whitespace is ignored.

    Returns the total number of minutes as an int; an empty or
    whitespace-only string gives 0.

    Raises ValueError if a token does not end in "h" or "m", or if the part
    before the unit is not an integer.
    """
    total_minutes = 0

    # split() with no argument splits on any run of whitespace and never yields
    # empty tokens, so "2h  30m" and " 2h 30m " parse the same way as "2h 30m"
    for token in duration_string.split():
        amount, unit = token[:-1], token[-1]

        if unit == "h":
            total_minutes += int(amount) * 60
        elif unit == "m":
            total_minutes += int(amount)
        else:
            raise ValueError("Unrecognised duration token: {!r}".format(token))

    return total_minutes

In [None]:
# unit tests to ensure the function works
assert convert_duration_to_minute("2h 30m") == 150
assert convert_duration_to_minute("1h 30m") == 90
assert convert_duration_to_minute("1h") == 60   
assert convert_duration_to_minute("30m") == 30
assert convert_duration_to_minute("") == 0 

# assert convert_duration_to_minute("1h 30m") == 60 # False assertion (uncomment to see error raised)

In [None]:
# convert the column Duration to minutes (new column Duration_min)
df["Duration_min"] = df["Duration"].apply(convert_duration_to_minute) # using .apply

# list comprehension
# df["Duration_min"] = [ convert_duration_to_minute(duration) for duration in df["Duration"] ] 

In [None]:
# drop Duration column
df.drop(columns=["Duration"], inplace=True)

In [None]:
# head
df.head()

### Handling Categorical Data

In [None]:
# count the number of occurrences of each airline
df.Airline.value_counts()

In [None]:
# Quick plot price distribution for each airline 
sns.boxplot(x="Airline", y="Price", data=df.sort_values(by='Price', ascending=False)) # boxplot of Price by Airline
plt.show()

In [None]:
# convert Airline to dummy variables (OnehotEncoding)
airlines=  pd.get_dummies(df.Airline, prefix= "Airline", drop_first= True) 
airlines

In [None]:
# determine count of each "Source"
df.Source.value_counts()

In [None]:
# plot distribution of price by Source
sns.boxplot(x="Source", y="Price", data=df.sort_values(by='Price', ascending=False)) # boxplot of Price by Source
plt.show()

In [None]:
# convert Source to dummy variables
source = pd.get_dummies(df.Source , prefix="Source", drop_first= True)
source.head()

In [None]:
# convert Destination to dummy variables
destination = pd.get_dummies(df.Destination , prefix="Dest", drop_first= True)
destination.head()

In [None]:
# inspect Additional_info from dataframe 
df.Additional_Info.value_counts() 

# we observed that the majority of Additional_Info values are 'No info'
# Route is also too complicated to be useful (and it is correlated with Total_Stops)


In [None]:
# drop the columns "Airline", "Source", "Destination", "Additional_Info", "Route"
df.drop(columns=["Airline", "Source", "Destination", "Additional_Info", "Route"], inplace=True) 


In [None]:
# head 
df.head()

In [None]:
# check out sample values (unique) in Total_Stops
df.Total_Stops.unique()

In [None]:
# using .map to convert Total_Stops to count of stops (int) - N_stop
stops_to_count = {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}

df["N_stop"] = df["Total_Stops"].map(stops_to_count)

# .map silently produces NaN for any label that is missing from the mapping;
# fail here, where the cause is obvious, rather than later during model fitting
unmapped_stops = df.loc[df["N_stop"].isna(), "Total_Stops"].unique()
assert len(unmapped_stops) == 0, "Unmapped Total_Stops values: {}".format(unmapped_stops)


In [None]:
# drop the columns "Total_Stops", "Arrival_Time"
df.drop(columns=["Total_Stops", "Arrival_Time"], inplace=True) 

In [None]:
# concatenate the dataframes (df , airlines, source, destination)
data = pd.concat([df, airlines, source, destination], axis=1) 

# and check out the dataframe
data.head()


In [None]:
# Allows us to see more information regarding the DataFrame
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

data

### Handling Outliers (Subjective)

In [None]:
# boxplot on target variable- Price
sns.boxplot(x ="Price", data=data)
plt.show()

In [None]:
# if price >= 40k, then assign to median price
PRICE_OUTLIER_THRESHOLD = 40000
price_median = data["Price"].median()

# Series.mask replaces every price at or above the threshold with the median and
# keeps all other prices unchanged (vectorized, no Python-level loop);
# astype(float) keeps the column dtype the same whether or not the median is a whole number
data["Target"] = (
    data["Price"]
    .mask(data["Price"] >= PRICE_OUTLIER_THRESHOLD, price_median)
    .astype(float)
)

In [None]:
# boxplot for Target column (expected to see no more values >= 40k)
sns.boxplot(x ="Target", data=data)
plt.show()

In [None]:
# drop Price column
data.drop(columns=["Price"], inplace=True)

In [None]:
# (Optional) Write dataframe into a csv file
data.to_csv("clean_data.csv", index=False)
 

In [None]:
# see all data
data

### Modelling

In [None]:
# split data into train and test 
# test_size: proportion of data to be used for testing
# random_state: seed for random number generator

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["Target"]), data["Target"], test_size=0.2, random_state=123)

In [None]:
## Linear Regression
from sklearn.linear_model import LinearRegression

# instantiate the model and fit the model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train) # train model

training_score = linear_reg.score(X_train, y_train) # 0.6113
print("Training Score: ", training_score)

In [None]:
# Evaluation of model
from sklearn.metrics import r2_score

y_pred = linear_reg.predict(X_test)
r2score = r2_score(y_test, y_pred) 

print("R2 Score: ", r2score)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# compute the MSE once and reuse it for the RMSE (root mean squared error)
mse = mean_squared_error(y_test, y_pred)

print('MAE: {}'.format(mean_absolute_error(y_test, y_pred)))
print('MSE: {}'.format(mse))
print('RMSE: {}'.format(np.sqrt(mse)))

In [None]:
# combine into a function with model as input 
def train_and_predict(model, X_train, X_test, y_train, y_test):
    """
    Fit `model` on the training split and report how it performs.

    Prints the model, the value of `model.score` on the training data,
    and then the test-set metrics printed by `predict_with_metrics`.
    """
    # show which model is being trained before fitting it
    print("Model: ", model)

    model.fit(X_train, y_train)
    training_score = model.score(X_train, y_train)
    print("Training Score: ", training_score)

    # evaluate the fitted model on the held-out test data
    predict_with_metrics(model, X_test, y_test)


def predict_with_metrics(model, X_test, y_test):
    """
    Predict on test data and print metrics

    Calls `model.predict(X_test)` and prints the R2 score, MAE, MSE and RMSE
    of the predictions against `y_test`. Nothing is returned.
    """

    y_pred = model.predict(X_test)
    r2score = r2_score(y_test, y_pred)
    print("R2 Score: ", r2score)

    # compute the MSE once and reuse it for the RMSE (root mean squared error)
    mse = mean_squared_error(y_test, y_pred)

    print('MAE: {}'.format(mean_absolute_error(y_test, y_pred)))
    print('MSE: {}'.format(mse))
    print('RMSE: {}'.format(np.sqrt(mse)))


In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

reg_rf = RandomForestRegressor(n_estimators=100, random_state=123)
train_and_predict(reg_rf, X_train, X_test, y_train, y_test)

In [None]:
# plot feature importance
feat_importance = pd.Series(reg_rf.feature_importances_ , index = X_train.columns)

feat_importance.nlargest(20).plot(kind= 'barh')
plt.show()

In [None]:
## XGBoost Regressor 
from xgboost import XGBRegressor
xgboost_model = XGBRegressor(n_estimators=100, random_state=123)

train_and_predict(xgboost_model, X_train, X_test, y_train, y_test)

In [None]:
# plot feature importance (with xgboost)
from xgboost import plot_importance 

plot_importance(xgboost_model)
plt.show()

### Hyperparameter Tuning

Generally, there are two common hyperparameter tuning methods:
- RandomizedSearchCV (Faster)
- GridSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV 

param_grid= {
    'n_estimators': [100, 200, 300],
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.1, 1.0],
    'subsample': [ 0.5, 0.75 , 0.9] , 
    'colsample_bytree': [ 0.5, 0.75, 0.9 ] ,
}

# estimator : the estimator being tuned; here, it's the XGBoost regressor
# param_distributions : the candidate values to sample each hyperparameter from
# cv : number of cross-validation folds
# n_iter : number of hyperparameter combinations to sample and evaluate
# verbose : (2) print more output

xgb_model_tuned = RandomizedSearchCV(estimator=xgboost_model, param_distributions=param_grid, n_iter= 10, cv=3, verbose=2, random_state=123)

In [None]:
# fit the model to the training data (with hyperparameters)
xgb_model_tuned.fit(X_train, y_train)

In [None]:
# view best param based on the combination above
xgb_model_tuned.best_params_

In [None]:
# predict on test data and print its metrics
predict_with_metrics(xgb_model_tuned, X_test, y_test)