In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

sns.set_style("whitegrid")

In [4]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.cluster import KMeans

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.preprocessing import OneHotEncoder


from scipy import stats

In [5]:
def preprocessing(df):
  df["class"] = df["class"].replace({'Economy':0,'Business':1}).astype(int)
  df["stops"] = df["stops"].replace({'zero':0,'one':1,'two_or_more':2}).astype(int)

  dummies_variables = ["airline","source_city","destination_city","departure_time","arrival_time"]
  dummies = pd.get_dummies(df[dummies_variables], drop_first= True)
  df = pd.concat([df,dummies],axis=1)
    
  df = df.drop(["flight","airline","source_city","destination_city","departure_time","arrival_time"],axis=1)
    
  return df

In [38]:
df = pd.read_csv("/content/drive/MyDrive/Dataset/dataset.csv")

In [6]:
def load_data():
    # Read data
    df = pd.read_csv("/content/drive/MyDrive/Dataset/dataset.csv")
    # Preprocessing the data
    
    df = preprocessing(df)
    
    X = df.copy()
    y = X.pop("price")
    
    xtrain,xtest,ytrain,ytest = train_test_split(X,y,random_state = 1,test_size=0.3, shuffle=True)
    
    return xtrain,xtest,ytrain,ytest

In [7]:
xtrain,xtest,ytrain,ytest = load_data()

In [14]:
xtrain.shape

(210107, 30)

In [17]:
xtest.shape

(90046, 30)

In [18]:
xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210107 entries, 107174 to 128037
Data columns (total 30 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Unnamed: 0                    210107 non-null  int64  
 1   stops                         210107 non-null  int64  
 2   class                         210107 non-null  int64  
 3   duration                      210107 non-null  float64
 4   days_left                     210107 non-null  int64  
 5   airline_Air_India             210107 non-null  uint8  
 6   airline_GO_FIRST              210107 non-null  uint8  
 7   airline_Indigo                210107 non-null  uint8  
 8   airline_SpiceJet              210107 non-null  uint8  
 9   airline_Vistara               210107 non-null  uint8  
 10  source_city_Chennai           210107 non-null  uint8  
 11  source_city_Delhi             210107 non-null  uint8  
 12  source_city_Hyderabad         210107 no

In [19]:
xtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90046 entries, 135562 to 248798
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    90046 non-null  int64  
 1   stops                         90046 non-null  int64  
 2   class                         90046 non-null  int64  
 3   duration                      90046 non-null  float64
 4   days_left                     90046 non-null  int64  
 5   airline_Air_India             90046 non-null  uint8  
 6   airline_GO_FIRST              90046 non-null  uint8  
 7   airline_Indigo                90046 non-null  uint8  
 8   airline_SpiceJet              90046 non-null  uint8  
 9   airline_Vistara               90046 non-null  uint8  
 10  source_city_Chennai           90046 non-null  uint8  
 11  source_city_Delhi             90046 non-null  uint8  
 12  source_city_Hyderabad         90046 non-null  uint8  


In [20]:
def score_dataset(X, y, model=XGBRegressor()):
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    
    score_r2 = cross_val_score(model, X, y, cv=5, scoring="r2")
    score_r2 = score_r2.mean()
    
    return score_r2

In [21]:
models = {}

models["LinearRegression"]={"model":LinearRegression()}
models["KNeighborsRegressor"]={"model":KNeighborsRegressor(n_neighbors=50)}
# models["BaggingRegressor"]={"model":BaggingRegressor()}
models["XGBRegressor"]={"model":XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)}
# models["ExtraTreeRegressor"]={"model":ExtraTreesRegressor()}


models

{'LinearRegression': {'model': LinearRegression()},
 'KNeighborsRegressor': {'model': KNeighborsRegressor(n_neighbors=50)},
 'XGBRegressor': {'model': XGBRegressor(max_depth=10, n_jobs=5, random_state=1)}}

In [22]:
def get_scores(models,xtrain,ytrain):
    for name,model in models.items():
        model["model"].fit(xtrain,ytrain)

        score_r2 = score_dataset(xtrain, ytrain, model=model["model"])
        score = {'model':"Linear regression", 'score_r2':score_r2}
        print("--- "+name+" ---")
        print("Score r2: {}".format(score_r2))
        print("\n")

In [23]:
get_scores(models,xtrain,ytrain)

--- LinearRegression ---
Score r2: 0.910291518573523


--- KNeighborsRegressor ---
Score r2: 0.896140193921626


--- XGBRegressor ---
Score r2: 0.9821189244856147




In [25]:
models_2 = {}

models_2["LinearRegression"]={"model":LinearRegression()}
models_2["BaggingRegressor"]={"model":BaggingRegressor()}
models_2["ExtraTreeRegressor"]={"model":ExtraTreesRegressor()}

def get_scores_2(models,xtrain,ytrain):
    for name,model in models.items():
        model["model"].fit(xtrain,ytrain)

        score_r2 = score_dataset(xtrain, ytrain, model=model["model"])
        # score = {'model':"Linear regression", 'score_r2':score_r2}
        print("--- "+name+" ---")
        print("Score r2: {}".format(score_r2))
        print("\n")

In [26]:
get_scores_2(models_2, xtrain, ytrain)

--- LinearRegression ---
Score r2: 0.910291518573523


--- BaggingRegressor ---
Score r2: 0.9856655556163554


--- ExtraTreeRegressor ---
Score r2: 0.9849984948635588




In [39]:
x=df.drop(['price'],axis=1)
y=df['price']

In [33]:
def MAPE (ytest, ypred):
    ytest, ypred = np.array(ytest), np.array(ypred)
    return np.mean(np.abs((ytest - ypred) / ytest)) * 100

In [27]:
from sklearn import metrics

In [47]:
def metric(models, xtrain, ytrain, xtest, ytest):
    for name,model in models.items():
        model["model"].fit(xtrain,ytrain)
        
        score_r2 = score_dataset(xtrain, ytrain, model=model["model"])
        print("--- "+name+" ---")
        print("R2 Score [Trained]: {}".format(score_r2))
        print("\n")


        ypred = model["model"].predict(xtest)



        result = MAPE(ytest, ypred)
    

        print("--- "+name+" ---")
        print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(ytest, ypred),3))  
        print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(ytest, ypred),3))  
        print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(ytest, ypred)),3))
        print('R2 Score:', round(metrics.r2_score(ytest, ypred),6))
        print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(ytest, ypred))),3))
        print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
        
        r_squared = round(metrics.r2_score(ytest, ypred),6)
        adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
        print('Adjusted R Square: ', adjusted_r_squared)

        print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')

In [48]:
models_3 = {}

models_3["LinearRegression"]={"model":LinearRegression()}
models_3["KNeighborsRegressor"]={"model":KNeighborsRegressor(n_neighbors=50)}
models_3["BaggingRegressor"]={"model":BaggingRegressor()}
models_3["ExtraTreeRegressor"]={"model":ExtraTreesRegressor()}

metric(models_3, xtrain, ytrain, xtest, ytest)

--- LinearRegression ---
R2 Score [Trained]: 0.910291518573523


--- LinearRegression ---
Mean Absolute Error (MAE): 4470.912
Mean Squared Error (MSE): 45332990.205
Root Mean Squared Error (RMSE): 6732.978
R2 Score: 0.911736
Root Mean Squared Log Error (RMSLE): 8.815
Mean Absolute Percentage Error (MAPE): 42.42 %
Adjusted R Square:  0.911733
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

--- KNeighborsRegressor ---
R2 Score [Trained]: 0.896140193921626


--- KNeighborsRegressor ---
Mean Absolute Error (MAE): 3730.621
Mean Squared Error (MSE): 49019919.562
Root Mean Squared Error (RMSE): 7001.423
R2 Score: 0.904557
Root Mean Squared Log Error (RMSLE): 8.854
Mean Absolute Percentage Error (MAPE): 23.81 %
Adjusted R Square:  0.904554
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

--- BaggingRegressor ---
Mean Absolute Error (MAE): 1012.074
Mean Squared Error (MSE): 6657099.35
Root Mean Squared Error (RMSE): 2580.136
R2 Score: 0.98703

In [None]:
models_4 = {}
models_4["XGBRegressor"]={"model":XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)}

metric(models_4, xtrain, ytrain, xtest, ytest)