# <span style="color:Blue"><center>Predict The Flight Ticket Price Hackathon</center></span>

## <span style="color:Blue">Introduction</span>

### <span style="color:green">Features</span>

## <span style="color:Blue">Objective</span>

## <span style="color:Blue">Libraries</span>

In [None]:
import numpy as np # Array and numerical computing.
# pandas: labelled / relational data structures and operations
# for manipulating numerical tables and time series.
import pandas as pd # Used to load and analyse the data
# seaborn: statistical plotting built on top of Matplotlib,
# with nicer default styles and colour palettes.
import seaborn as sns
import plotly.express as px # High-level Plotly API: each function creates an entire figure at once.
import matplotlib.pyplot as plt # 2D plotting library.
import warnings # Standard-library control over warning messages.
warnings.filterwarnings('ignore') # Suppress every warning for the rest of the session (this also hides deprecation notices).
print('Libraries imported') 
sns.set()

## <span style="color:Blue">Data Preparation</span>

In [None]:
# Load the three hackathon workbooks (Excel files, hence pd.read_excel).
train_df = pd.read_excel('Data_Train.xlsx')        # training data
test_df = pd.read_excel('Test_set.xlsx')           # test data
sub_df = pd.read_excel('Sample_submission.xlsx')   # required submission format
# Work on copies so the frames as loaded stay untouched.
df, df1, df2 = train_df.copy(), test_df.copy(), sub_df.copy()

In [None]:
df.head() # Show the first 5 rows of the training dataset

In [None]:
df.sample(10) # 10 random sample data 

In [None]:
df1.head() # Show the first 5 rows of the test dataset

In [None]:
df2.head() # To see the first 5 rows of submission data

In [None]:
df.info() # To get the information of dataset, type of features, memory uses

In [None]:
df.shape # To check the shape of data frame

In [None]:
df.describe() # To describe the data frame (only price is numerical data, others are object)

#### Missing and Duplicate Values

In [None]:
# Missing-value count per column, largest first.
null_counts = df.isna().sum().sort_values(ascending=False)
print(null_counts)
# Number of rows that duplicate an earlier row across all columns.
n_duplicates = df.duplicated().sum()
print('Number of Duplicate Values in df : ', n_duplicates)

In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats.
df = df.drop_duplicates(keep='first')
df.shape # The row count shrinks by the number of duplicates removed.

In [None]:
# Visual missing-value check: one missingno bar chart on a linear y-axis
# and one on a logarithmic y-axis, side by side.
# !pip install missingno # install first if the package is not available
import missingno as msno

fig = plt.figure(figsize=(15,7)) # Size of the complete figure
# (subplot position in the 1 x 2 grid, log-scaled y-axis?, bar colour)
panel_specs = [(1, False, "tomato"), (2, True, "tab:green")]
for position, use_log, bar_color in panel_specs:
    panel = fig.add_subplot(1, 2, position)
    msno.bar(df, log=use_log, color=bar_color, fontsize=12, ax=panel);

plt.tight_layout() # Tidy the spacing between the two panels

In [None]:
# Show every row where Route or Total_Stops is missing, to see whether
# the two gaps come from the same row(s).
route_or_stops_missing = df['Route'].isna() | df['Total_Stops'].isna()
df[route_or_stops_missing]

In [None]:
# Remove every row that contains at least one missing value (in place).
df.dropna(axis=0, how='any', inplace=True)
# Per-column null counts after the drop.
df.isna().sum()

In [None]:
# Print number of rows of each attributes for which the value is NULL.
print(df.isna().sum().sort_values(ascending = False))

In [None]:
# Shape of dataframe
df.shape

## <span style="color:Blue">EDA & Feature Engineering</span>

### <span style="color:green">Feature Engineering</span>

### Convert Duration

In [None]:
# Convert a Duration string into a total number of minutes
def convert_duration(duration):
    """Convert a duration string such as '2h 50m', '19h' or '5m' to minutes.

    Each whitespace-separated token is an integer followed by a unit
    suffix: 'h' for hours or 'm' for minutes. A single-token value is
    read by its suffix, so '5m' is 5 minutes rather than 5 hours.

    Parameters
    ----------
    duration : str
        Duration text, e.g. '2h 50m'.

    Returns
    -------
    int
        Total duration in minutes.

    Raises
    ------
    ValueError
        If the string is empty, a token has a non-integer amount, or a
        token has a unit other than 'h' or 'm'.
    """
    tokens = duration.split()
    if not tokens:
        raise ValueError('empty duration string')
    total_minutes = 0
    for token in tokens:
        amount = int(token[:-1])  # everything before the unit suffix
        unit = token[-1].lower()
        if unit == 'h':
            total_minutes += amount * 60
        elif unit == 'm':
            total_minutes += amount
        else:
            raise ValueError('unknown duration unit in {!r}'.format(token))
    return total_minutes

In [None]:
# Derive the numeric duration (in minutes) from the raw text column ...
duration_minutes = df['Duration'].apply(convert_duration)
df['Duration_in_Minutes'] = duration_minutes
# ... then discard the text column it replaces.
df.drop(columns=['Duration'], inplace=True)
df.head()

### Departure Time & Arrival Time

In [None]:
# Parse the departure and arrival time columns into datetime values.
for time_col in ('Dep_Time', 'Arrival_Time'):
    df[time_col] = pd.to_datetime(df[time_col])
# Column dtypes after the conversion
df.dtypes

In [None]:
# Split each time column into separate hour and minute feature columns.
for time_col in ('Dep_Time', 'Arrival_Time'):
    clock = df[time_col].dt
    df[time_col + '_in_hours'] = clock.hour
    df[time_col + '_in_minutes'] = clock.minute

In [None]:
df.head()

In [None]:
# The hour/minute columns replace the original datetime columns, so drop them.
df.drop(columns=['Dep_Time', 'Arrival_Time'], inplace=True)
df.head()

### Date of Journey

In [None]:
# Convert Date_of_Journey to datetime.
# NOTE(review): the raw values are presumably day-first (dd/mm/yyyy) - confirm
# against the workbook. Without dayfirst=True, an ambiguous date such as
# 01/03/2019 is read month-first, which swaps Day and Month for days <= 12.
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], dayfirst=True)
df.head()

In [None]:
# Check whether all journeys fall in a single year (list the distinct years).
df['Date_of_Journey'].dt.year.unique()

In [None]:
# Pull the day-of-month and month out of Date_of_Journey ...
journey = df['Date_of_Journey'].dt
df['Day'] = journey.day
df['Month'] = journey.month
# ... and drop the source column now that its parts are separate features.
df.drop(columns=['Date_of_Journey'], inplace=True)
df.head()

### Additional Info

In [None]:
# Check the category in Additional_Info feature
df['Additional_Info'].value_counts()

In [None]:
# Remove the Additional_Info column from the feature set.
df.drop(columns=['Additional_Info'], inplace=True)
df.head()

In [None]:
# Check the number of features which datatype = Object
df.select_dtypes(['object']).columns

### <span style="color:green">Univariate Analysis with Categorical Features</span>

In [None]:
# Univariate analysis of the categorical features
# ('Airline', 'Source', 'Destination', 'Total_Stops', 'Route'):
# one count plot per feature, with categories in order of first appearance
# after sorting the rows by Price.
for feature in ['Airline', 'Source', 'Destination', 'Total_Stops', 'Route']:
    plt.figure(figsize=(25,12))
    # Draw the count plot once; a second countplot call on the same axes
    # would only paint another set of bars over the first.
    axis = sns.countplot(x=feature, data=df.sort_values('Price', ascending=True))
    axis.set_xticklabels(axis.get_xticklabels(), rotation=40, ha='right')
    plt.tight_layout()
    plt.show()
    print('\n')

#### Insights

### <span style="color:green">Bivariate Analysis with Categorical Features</span>

In [None]:
# Bivariate analysis: bar plot of Price against each categorical feature.
for feature in ['Airline', 'Source', 'Destination', 'Total_Stops', 'Route']:
    price_sorted = df.sort_values('Price', ascending=True)
    fig, axis = plt.subplots(figsize=(25,12))
    sns.barplot(x=feature, y='Price', data=price_sorted, ax=axis)
    # Slant the category labels so long names stay readable.
    axis.set_xticklabels(axis.get_xticklabels(), rotation=40, ha='right')
    plt.tight_layout()
    plt.show()
    print('\n')

#### Insights

In [None]:
# Box plot of Price for each category of every categorical feature.
for feature in ['Airline', 'Source', 'Destination', 'Total_Stops', 'Route']:
    price_sorted = df.sort_values('Price', ascending=True)
    fig, axis = plt.subplots(figsize=(20,9))
    sns.boxplot(x=feature, y='Price', data=price_sorted, ax=axis)
    # Slant the category labels so long names stay readable.
    axis.set_xticklabels(axis.get_xticklabels(), rotation=40, ha='right')
    plt.tight_layout()
    plt.show()
    print('\n')

#### Insights

### Total Stops

In [None]:
# Frequency of each category of Total_Stops
df['Total_Stops'].value_counts()

In [None]:
# Ordinal encoding of Total_Stops: replace each label with its number of stops.
# Labels missing from the mapping become NaN, so running this cell a second
# time on the already-encoded column would blank it out.
stops_to_count = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
df['Total_Stops'] = df['Total_Stops'].map(stops_to_count)

In [None]:
df.head()

#### Parallel Coordinates Plot

In [None]:
# Parallel-categories chart linking Source, Destination, Airline and
# Total Stops, with each ribbon coloured by its number of stops.
import plotly.express as px
import plotly.graph_objects as go

# One Dimension object per categorical axis of the chart.
source_dim = go.parcats.Dimension(
    values=df["Source"], categoryorder="category ascending", label="Source"
)
destination_dim = go.parcats.Dimension(values=df["Destination"], label="Destination")
airline_dim = go.parcats.Dimension(values=df["Airline"], label="Airline")
# Total_Stops is already numeric (0-4); ticktext supplies the readable labels.
total_stops_dim = go.parcats.Dimension(
    values=df["Total_Stops"],
    label="Total Stops",
    categoryarray=[0,1,2,3,4],
    ticktext=["non-stop", "1-stop", "2-stops", "3-stops", "4-stops"],
)

# Ribbon colouring: stop count mapped onto the 'Electric' colour scale.
color = df["Total_Stops"]
colorscale = 'Electric'

parcats_trace = go.Parcats(
    dimensions=[source_dim, destination_dim, airline_dim, total_stops_dim],
    line={"color": color, "colorscale": colorscale},
    hoveron="color",
    hoverinfo="count + probability",
    labelfont={"size": 18, "family": "Times"},
    tickfont={"size": 16, "family": "Times"},
    arrangement="freeform",
)
fig = go.Figure(data=[parcats_trace])
# Display the figure
fig.show()

#### Insights

### Airline

In [None]:
# Check the frequency of each category in Airline
df['Airline'].value_counts()

In [None]:
# Sort the airlines with average value of price
df.groupby('Airline').describe()['Price'].sort_values('mean', ascending =False)

In [None]:
# One-hot encode Airline: one indicator column per category, with the first
# category dropped so it acts as the baseline.
Airline = df['Airline'].pipe(pd.get_dummies, drop_first=True)
Airline.head()

In [None]:
# Append the Airline indicator columns to the main frame ...
df = pd.concat([df, Airline], axis='columns')
# ... and remove the original text column they replace.
df.drop(columns=['Airline'], inplace=True)
df.head()

### Source & Destination

In [None]:
# Frequency of every category in the Source and Destination features.
li = ['Source', 'Destination']
for column_name in li:
    category_counts = df[[column_name]].value_counts()
    print(category_counts, '\n')

In [None]:
# One-hot encode the columns listed in `li` (Source and Destination),
# dropping the first category of each as the baseline.
df = pd.get_dummies(df, columns=li, drop_first=True)
df.head()

### Route

In [None]:
# Work on the Route feature in its own frame.
# Take an explicit copy: new columns are assigned to `route` later, and
# assigning onto a selection of `df` is what triggers pandas'
# SettingWithCopyWarning.
route = df[['Route']].copy()
route.head()

In [None]:
# Total stops between source and destination
df['Total_Stops'].value_counts()

In [None]:
# Split Route on '→' and store its first five elements as Route_1..Route_5.
# Route_k holds element k-1 of the split, so no element is skipped.
# Rows with fewer elements get NaN in the trailing columns.
# NOTE(review): a route with more than five elements is truncated here.
route_parts = route['Route'].str.split('→')
for position in range(5):
    # Strip the spaces around each token so the same code always yields the
    # same label, whatever its position in the route string.
    route['Route_' + str(position + 1)] = route_parts.str[position].str.strip()
route.head()

In [None]:
# Routes shorter than five elements leave NaN in the trailing Route_k
# columns; replace those with the literal string 'None'.
route.fillna(value='None', inplace=True)
route.head()

In [None]:
# Label-encode each Route_k column: text labels become integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
route_columns = ['Route_' + str(k) for k in range(1, 6)]
for col in route_columns:
    # The encoder is re-fitted for every column, so codes are per-column.
    route[col] = le.fit_transform(route[col])
# The raw Route text is not needed in this frame once encoded.
route.drop(columns=['Route'], inplace=True)
route.head()

In [None]:
# Append the encoded Route_k columns to the main frame ...
df = pd.concat([df, route], axis='columns')
# ... and drop the original Route text column they replace.
df.drop(columns=['Route'], inplace=True)
df.head()

In [None]:
# Font dictionaries shared by the figures below.
# Axes titles: bold red serif, size 16.
axtitle_dict = {
    'family': 'serif',
    'color': 'red',
    'weight': 'bold',
    'size': 16,
}
# Axis labels: black serif, size 14.
axlab_dict = {
    'family': 'serif',
    'color': 'black',
    'size': 14,
}

#### Filtering Numerical and Categorical Columns

In [None]:
# Treat every column (other than the target Price) with at most 50 distinct
# values as categorical.
categ_columns = [
    col for col in df.columns
    if col != 'Price' and df[col].nunique() <= 50
]
print('categorical numericals columns are {}'.format(categ_columns))

In [None]:
# Every column not classed as categorical is treated as numerical.
Num_cols = []
for col in df.columns:
    if col not in categ_columns:
        Num_cols.append(col)
print('numericals columns are {}'.format(Num_cols))
# Price is the target, not an input feature.
Num_cols.remove('Price')
Num_cols

### <span style="color:forestgreen">Univariate Analysis After Feature Engineering</span>

#### Distplot of Numerical Features

In [None]:
# Distribution plot for every numerical feature, laid out on a grid that is
# sized from the number of features (at most 3 plots per row). A fixed 1 x 1
# grid would make add_subplot raise as soon as there is a second feature.
n_plots = len(Num_cols)
n_grid_cols = min(3, n_plots) or 1
n_grid_rows = -(-n_plots // n_grid_cols) or 1  # ceiling division
fig = plt.figure(figsize=[15,12])
fig.suptitle('DISTPLOT OF DATA', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.92);
fig.subplots_adjust(hspace=0.5, wspace=0.4);
for i, col in enumerate(Num_cols):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i+1)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) when upgrading.
    ax = sns.distplot(df[col],  color='dodgerblue')
    # Reference lines: quartiles, mean and median of the feature.
    ax.axvline(df[col].quantile(q=0.25),color='green',linestyle='--',label='25% Quartile')
    ax.axvline(df[col].mean(),color='red',linestyle='--',label='Mean')
    ax.axvline(df[col].median(),color='black',linestyle='--',label='Median')
    ax.axvline(df[col].quantile(q=0.75),color='blue',linestyle='--',label='75% Quartile')
    ax.set_xlabel(f'{col}', fontdict=axlab_dict)
    ax.set_title(f'{col.upper()}    skewness {round(df[col].skew(),3)}', fontdict=axtitle_dict)
    ax.legend(fontsize=10)

In [None]:
# Colour palette; the box plots below pick one colour per feature by index.
colours = [
    'forestgreen', 'dodgerblue', 'goldenrod', 'coral', 'silver',
    'gold', 'dodgerblue', 'green', 'red', 'blue',
];

#### Outliers Detection in Numerical Features

In [None]:
# IQR rule: for each numerical feature, report whether any value lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
df_num = df[Num_cols]

Q1, Q3 = df_num.quantile(0.25), df_num.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (df_num < lower_fence) | (df_num > upper_fence)
is_outlier.any()

#### Visualization of outliers using box plot

In [None]:
# Box plot for every numerical feature, laid out on a grid that is sized from
# the number of features (at most 3 plots per row). A fixed 1 x 1 grid would
# make add_subplot raise as soon as there is a second feature.
n_plots = len(Num_cols)
n_grid_cols = min(3, n_plots) or 1
n_grid_rows = -(-n_plots // n_grid_cols) or 1  # ceiling division
fig = plt.figure(figsize=[16,12])
fig.suptitle('BOXPLOT OF DATA', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.92);
fig.subplots_adjust(hspace=0.5, wspace=0.4);
for i, col in enumerate(Num_cols):
    ax1 = fig.add_subplot(n_grid_rows, n_grid_cols, i+1);
    # Cycle through the palette so more features than colours cannot
    # cause an IndexError.
    ax1 = sns.boxplot(data = df, x=col ,  color= colours[i % len(colours)]);

    ax1.set_title(f'{col}', fontdict=axtitle_dict)
    ax1.set_xlabel(f'{col}', fontdict=axlab_dict)

#### Outliers Detection

In [None]:
# Compute the IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for Duration_in_Minutes
# and store them under 'upper_limit_<col>' / 'lower_limit_<col>'.
# NOTE(review): the name `dict` shadows the Python built-in; it is kept
# because other cells read the limits through this name.
dict = {}
for col in ['Duration_in_Minutes']:
    percentile25 = df[col].quantile(0.25)
    percentile75 = df[col].quantile(0.75)
    IQR = percentile75 - percentile25
    upper_limit = percentile75 + 1.5 * IQR
    lower_limit = percentile25 - 1.5 * IQR
    dict.update({
        'upper_limit_' + col: upper_limit,
        'lower_limit_' + col: lower_limit,
    })

In [None]:
# Find upper limit and lower limit
dict

In [None]:
# Count the rows that fall outside the stored IQR limits.
for col in ['Duration_in_Minutes']:
    n_below = (df[col] < dict['lower_limit_' + col]).sum()
    n_above = (df[col] > dict['upper_limit_' + col]).sum()
    print('There are total {} data which {} are less than lower limit.'.format(n_below, col))
    print('There are total {} data which {} are more than upper limit.'.format(n_above, col))

#### Capping Duration_in_Minutes with upper limit and lower limit.

In [None]:
# Cap the outliers: values above the upper limit are set to the upper limit,
# values below the lower limit are set to the lower limit, and everything
# in between is left unchanged. No rows are removed.
for col in ['Duration_in_Minutes']:
    upper = dict['upper_limit_' + col]
    lower = dict['lower_limit_' + col]
    floored = np.where(df[col] < lower, lower, df[col])
    df[col] = np.where(df[col] > upper, upper, floored)

#### After Outliers treatment

In [None]:
# Box plot of Duration_in_Minutes after capping (a single panel).
capped_features = ['Duration_in_Minutes']
fig = plt.figure(figsize=[16,12])
fig.suptitle('BOXPLOT After Outliers Handling', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.92, hspace=0.5, wspace=0.4);
for i, col in enumerate(capped_features):
    ax1 = fig.add_subplot(1, 1, i+1);
    ax1 = sns.boxplot(data=df, x=col, color=colours[i]);
    ax1.set_title(f'{col}', fontdict=axtitle_dict)
    ax1.set_xlabel(f'{col}', fontdict=axlab_dict)

### <span style="color:forestgreen">Correlation Analysis</span>

In [None]:
# Correlation of every feature with Price, highest first. The first entry
# after sorting is skipped (Price's correlation with itself).
price_corr = df.corr()['Price'].sort_values(ascending=False)[1:]
ax = round(price_corr, 2).plot(kind='bar', color='dodgerblue', figsize=(15,10))
# Write each bar's value above it.
ax.bar_label(ax.containers[0])
plt.show()

In [None]:
# Name of columns
df.columns

In [None]:
# Drop the feature judged to be uncorrelated with the target Price.
df.drop(columns=['Vistara Premium economy'], inplace=True)

### <span style="color:forestgreen">Heatmap</span>

In [None]:
# Plot the correlation matrix of all columns as an annotated heat map.
fig, ax = plt.subplots(figsize=[25,10])
sns.heatmap(df.corr(), ax=ax,  annot=True, linewidths=0.05, fmt= '.2f',cmap='RdBu')
ax.tick_params(axis='both', which='major', labelsize=14)
ax.set_title('Dataset Correlation Matrix', fontdict=axtitle_dict)
# Use plt.show() like the other plotting cells; Figure.show() needs a GUI
# backend and only emits a warning under a notebook's inline backend.
plt.show()

In [None]:
# For feature pairs with a correlation coefficient above 0.75 or below -0.75,
# one feature of the pair is dropped.
collinear_features = ['Source_Chennai', 'Source_Delhi', 'Source_Mumbai']
df.drop(columns=collinear_features, inplace=True)

In [None]:
# Check final shape of dataframe
df.shape

### <span style="color:forestgreen">Feature Scaling</span>

In [None]:
# Separate the target from the inputs, both as NumPy arrays:
# y is the dependent (target) variable Price, x holds every other column.
y = df['Price'].values
x = df.drop(columns=['Price']).values

In [None]:
# Rescale every feature with min-max scaling.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling - consider fitting on the training
# split only.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

## <span style="color:Blue">Modelling</span>

In [None]:
# Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn import metrics
from math import sqrt
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import KFold

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Mean absolute percentage error (MAPE)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both inputs are converted to NumPy arrays; the result is the mean of
    |y_true - y_pred| / |y_true| multiplied by 100. A zero in `y_true`
    leads to a division by zero in that element.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100

### <span style="color:forestgreen">Linear Regression</span>

In [None]:
# Fit ordinary least squares on the training split and predict the test split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
print('In Training:')
print('R^2 Score:', lr.score(X_train, y_train))

In [None]:
# Test-set metrics for the linear regression predictions in y_pred.
print('In Testing:')
test_mse = metrics.mean_squared_error(y_test, y_pred)
print('R^2 Score:', r2_score(y_test, y_pred))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))
print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_test, y_pred)))

### <span style="color:forestgreen">Polynomial Regression</span>

In [None]:
# Degree-2 polynomial regression.
# Expand and fit on the TRAINING split only: fitting on the full x / y would
# train the model on the very rows it is later scored on, inflating the
# test metrics.
poly = PolynomialFeatures(degree = 2)
X_poly = poly.fit_transform(X_train)

lin2 = LinearRegression()
lin2.fit(X_poly, y_train)

In [None]:
print('In Training:')
# Expand both splits to degree-2 polynomial features, then predict on each.
X_test_poly = poly.fit_transform(X_test)
X_train_poly = poly.fit_transform(X_train)
y_pred = lin2.predict(X_test_poly)
y_pred_train = lin2.predict(X_train_poly)
print('R^2 Score:', r2_score(y_train,y_pred_train))

In [None]:
# Test-set metrics for the polynomial regression predictions in y_pred.
print('In Testing:')
test_mse = metrics.mean_squared_error(y_test, y_pred)
print('R^2 Score:', r2_score(y_test, y_pred))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))
print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_test, y_pred)))

### <span style="color:forestgreen">Random Forest</span>

In [None]:
# Baseline random forest with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible from one run to the next.
reg_rf = RandomForestRegressor(random_state=0)
reg_rf.fit(X_train, y_train)
y_pred = reg_rf.predict(X_test)

In [None]:
print('In Training:')
print('R^2 Score:', reg_rf.score(X_train, y_train))

In [None]:
# Predicted vs. actual price on the test split.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("y_test")
ax.set_ylabel("y_pred")
plt.show()

In [None]:
# Test-set metrics for the baseline random forest predictions in y_pred.
print('In Testing:')
test_mse = metrics.mean_squared_error(y_test, y_pred)
print('R^2 Score:', metrics.r2_score(y_test, y_pred))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))
print("Mean absolute % error: ", round(mean_absolute_percentage_error(y_test, y_pred)))

### <span style="color:forestgreen">Random Forest with Hyperparameter Tuning</span>

In [None]:
# Randomized Search CV: candidate values for each hyper-parameter

# Number of trees in the random forest: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split.
# 1.0 means "all features"; it replaces the string 'auto', which meant the
# same for regressors but has been removed from scikit-learn.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Collect the candidate lists into the parameter grid for the random search,
# keyed by RandomForestRegressor argument name.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

In [None]:
# Random search over the grid: n_iter=10 sampled parameter combinations,
# each scored with 5-fold cross-validation on negative mean squared error.
# random_state is fixed so the same combinations are sampled on every run.
rf_random = RandomizedSearchCV(
    estimator=reg_rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=0,
)
rf_random.fit(X_train,y_train)

# Best parameter combination found by the search
rf_random.best_params_

In [None]:
# Predict with the tuned random forest. The fitted search object is
# `rf_random`; no object named `gcv_rf` is defined in this notebook.
y_prediction = rf_random.predict(X_test)
y_train_pred = rf_random.predict(X_train)

In [None]:
print('In Training:')
print('R^2 Score:', r2_score(y_train, y_train_pred))

In [None]:
# Predicted vs. actual price on the test split for the tuned random forest.
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(y_test, y_prediction, alpha=0.5)
ax.set_xlabel("y_test")
ax.set_ylabel("y_pred")
plt.show()

In [None]:
# Test-set metrics for the tuned random forest predictions in y_prediction.
print('In Testing:')
test_mse = metrics.mean_squared_error(y_test, y_prediction)
print('R^2 Score:', metrics.r2_score(y_test, y_prediction))
print('MAE:', metrics.mean_absolute_error(y_test, y_prediction))
print('MSE:', test_mse)
print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_test, y_prediction)))
print('RMSE:', np.sqrt(test_mse))

### <span style="color:forestgreen">XGB Regressor with Hyperparameter Tuning</span>

In [None]:
# Exhaustive grid search for XGBRegressor: every combination in the grid is
# scored with 5-fold cross-validation on negative mean absolute error.
param_grid = {
    'alpha': [0.9,0.09,0.1,0.7,0.05,0.125],
    'learning_rate': [0.75,0.5,0.25,0.1,0.01],
    'max_depth': [2,3,4,5,6],
    'n_estimators': [125,110, 100, 90, 75],
}
xgb = XGBRegressor()
gcv_xgb = GridSearchCV(
    xgb,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=5,
    verbose=True,
)
res = gcv_xgb.fit(X_train,y_train)
# Best parameter combination found by the search
res.best_params_

In [None]:
# Predict both splits with the tuned XGB model.
y_train_pred, y_test_pred = gcv_xgb.predict(X_train), gcv_xgb.predict(X_test)

In [None]:
print('In Training:')
print('R^2 Score:',r2_score(y_train,y_train_pred))

In [None]:
# Test-set metrics for the tuned XGB predictions in y_test_pred.
print('In Testing:')
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
print("R^2 Score: ", r2_score(y_test, y_test_pred))
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))
print('MSE:', test_mse)
print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_test, y_test_pred)))
print('RMSE:', np.sqrt(test_mse))

### <span style="color:forestgreen">Weighted Average Ensemble</span>

In [None]:
from sklearn.ensemble import VotingRegressor
from numpy import argsort
# To get a list of the best models
def get_models():
    """Return the tuned base models as (name, estimator) pairs.

    The pairs hold each search's `best_estimator_` (the model refitted with
    the best parameters found), not the search object itself. Every later
    `fit` on these models then trains a single model instead of re-running
    the whole hyper-parameter search.

    Requires `rf_random` and `gcv_xgb` to have been fitted already.
    """
    models = list()
    models.append(('Random Forest with Hyperparameter Tuning', rf_random.best_estimator_))
    models.append(('XGB Regressor with Hyperparameter Tuning', gcv_xgb.best_estimator_))
    return models
# To evaluate each base model
def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and return its test R^2 score.

    `models` is a list of (name, estimator) pairs; each estimator is fitted
    in place. The returned list of scores is in the same order as `models`.
    """
    def _fit_and_score(model):
        # Train on the training split, then score the test-split predictions.
        model.fit(X_train, y_train)
        return r2_score(y_test, model.predict(X_test))

    return [_fit_and_score(model) for _, model in models]
# Score the base models, then weight each one by the rank of its R^2 score
# (1 for the lowest score, increasing with the score).
# NOTE(review): the weights are derived from test-set scores - consider a
# separate validation split.
models = get_models()
scores = evaluate_models(models, X_train, X_test, y_train, y_test)
ranking = argsort(argsort(scores)) + 1
# Create the rank-weighted ensemble and fit it on the training dataset
ensemble = VotingRegressor(estimators=models, weights=ranking)
ensemble.fit(X_train, y_train)

In [None]:
# Training-set R^2 of the weighted ensemble.
print('In Training:')
y_pred_train = ensemble.predict(X_train)
# The label and the score are separate print arguments; without the comma
# the string literal would be called like a function and raise TypeError.
print('Weighted R^2 Score: ', r2_score(y_train, y_pred_train))

In [None]:
# Make predictions on the test set
y_pred = ensemble.predict(X_test)
# Evaluate the predictions.
# The label and the score are separate print arguments; without the comma
# the string literal would be called like a function and raise TypeError.
print('Weighted R^2 Score: ', r2_score(y_test, y_pred))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_test, y_pred)))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

### <span style="color:forestgreen">Prediction</span>