In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

  from pandas.core import (


In [2]:
df = pd.read_csv(r"C:\Users\HARSHIT JAIN\Desktop\data science projects\visit_and_tour_packages_project\datasets\tour_package_post_feature_selection_2.csv")

In [3]:
df.head()

Unnamed: 0,category_of_package,count_of_cities,no_of_days,categories_by_price,accommodation_facility,price
0,holiday_package,1,3,budget,1 star,8000.0
1,holiday_package,2,4,budget,1 star,10000.0
2,holiday_package,3,5,budget,1 star,12499.0
3,holiday_package,6,7,standard,3 star,20000.0
4,offbeat_package,7,8,standard,3 star,23500.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2428 entries, 0 to 2427
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   category_of_package     2428 non-null   object 
 1   count_of_cities         2428 non-null   int64  
 2   no_of_days              2428 non-null   int64  
 3   categories_by_price     2428 non-null   object 
 4   accommodation_facility  2428 non-null   object 
 5   price                   2428 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 113.9+ KB


In [5]:
X = df.drop(columns=['price'])
y = df['price']

In [6]:
y_transformed = np.log1p(y)

# Ordinal Encoding

In [7]:
columns_to_encode = ['category_of_package','categories_by_price', 'accommodation_facility']

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['count_of_cities','no_of_days']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [9]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [10]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [11]:
scores.mean(),scores.std()

(0.8253716475068265, 0.013031634671808078)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [13]:
pipeline.fit(X_train,y_train)

In [14]:
y_pred = pipeline.predict(X_test)

In [15]:
y_pred = np.expm1(y_pred)

In [16]:
mean_absolute_error(np.expm1(y_test),y_pred)

3484.4421031864213

In [17]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [18]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [19]:
model_output = []

for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [20]:
model_output

[['linear_reg', 0.8253716475068265, 3484.4421031864213],
 ['svr', 0.8504838564845254, 3069.5049348867083],
 ['ridge', 0.8253730548554563, 3484.35948460985],
 ['LASSO', -0.009259192796517013, 9418.699895206115],
 ['decision tree', 0.8422653849918454, 2809.1435992422107],
 ['random forest', 0.8621638900884807, 2745.082847114479],
 ['extra trees', 0.8527472156156936, 2675.8543915215014],
 ['gradient boosting', 0.8688936638162575, 2962.4391324277726],
 ['adaboost', 0.8453529998734384, 3426.191717035718],
 ['mlp', 0.801726504149458, 3712.5780703848823],
 ['xgboost', 0.8537751160748079, 2733.9449086130394]]

In [21]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [22]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.852747,2675.854392
10,xgboost,0.853775,2733.944909
5,random forest,0.862164,2745.082847
4,decision tree,0.842265,2809.143599
7,gradient boosting,0.868894,2962.439132
1,svr,0.850484,3069.504935
8,adaboost,0.845353,3426.191717
2,ridge,0.825373,3484.359485
0,linear_reg,0.825372,3484.442103
9,mlp,0.801727,3712.57807


#  OneHotEncoding

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['count_of_cities','no_of_days']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['category_of_package','categories_by_price', 'accommodation_facility'])
    ], 
    remainder='passthrough'
)

In [24]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [25]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [26]:
scores.mean()

0.8435362465606501

In [27]:
scores.std()

0.008942314496823252

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [29]:
pipeline.fit(X_train,y_train)

In [30]:
y_pred = pipeline.predict(X_test)

In [31]:
y_pred = np.expm1(y_pred)

In [32]:
mean_absolute_error(np.expm1(y_test),y_pred)

3217.41378448557

In [33]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [34]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [35]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [36]:
model_df_2 = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [37]:
model_df_2.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.853396,2712.353869
5,random forest,0.864963,2735.458054
10,xgboost,0.855942,2749.727861
4,decision tree,0.842448,2843.232346
7,gradient boosting,0.87085,2888.959896
1,svr,0.851055,3023.404172
2,ridge,0.843653,3212.956448
0,linear_reg,0.843536,3217.413784
9,mlp,0.824683,3220.90333
8,adaboost,0.848897,3460.161331


# OneHotEncoding With PCA

In [38]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['count_of_cities','no_of_days']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['category_of_package','categories_by_price', 'accommodation_facility'])
    ], 
    remainder='passthrough'
)

In [39]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [40]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [41]:
scores.mean()

0.8189982668784943

In [42]:
scores.std()

0.012786342306564068

In [43]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [44]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [45]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [46]:
model_df_3 = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [47]:
model_df_3.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.851307,2800.701923
10,xgboost,0.848232,2802.601259
5,random forest,0.858131,2885.263702
1,svr,0.860682,3044.387921
4,decision tree,0.834756,3138.048539
7,gradient boosting,0.857584,3196.69403
2,ridge,0.819,3764.870191
0,linear_reg,0.818998,3765.157203
8,adaboost,0.806274,4731.367753
9,mlp,0.535498,7274.490027


# Target Encoder

In [48]:
import category_encoders as ce

columns_to_encode = ['category_of_package','categories_by_price', 'accommodation_facility']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['count_of_cities','no_of_days']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(),['categories_by_price']),
        ('target_enc', ce.TargetEncoder(), ['category_of_package'])
    ], 
    remainder='passthrough'
)

In [49]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [50]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [51]:
scores.mean(),scores.std()

(0.8409880916568365, 0.01101330032336298)

In [52]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [53]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [54]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [55]:
model_df_4 = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [56]:
model_df_4.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.852474,2707.151506
5,random forest,0.861665,2766.739029
10,xgboost,0.854567,2774.306958
4,decision tree,0.840934,2835.243269
7,gradient boosting,0.868362,2905.575304
1,svr,0.837899,3126.809041
2,ridge,0.840944,3271.708511
0,linear_reg,0.840988,3295.653513
9,mlp,0.839718,3331.983703
8,adaboost,0.84512,3382.24151


# Hyperparameter Tuning

In [57]:
from sklearn.model_selection import GridSearchCV

In [58]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}


In [59]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['count_of_cities','no_of_days']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [60]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [61]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [62]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [63]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


  warn(


In [64]:
final_pipe = search.best_estimator_

In [65]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'auto',
 'regressor__max_samples': 0.1,
 'regressor__n_estimators': 200}

In [66]:
search.best_score_

0.8737148016949113

In [67]:
final_pipe.fit(X,y_transformed)

  warn(


# Exporting the model

In [68]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['count_of_cities','no_of_days']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [69]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators= 50, max_depth = 10, max_features='sqrt', max_samples= 0.25))
])

In [70]:
pipeline.fit(X,y_transformed)

In [71]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [72]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [73]:
X

Unnamed: 0,category_of_package,count_of_cities,no_of_days,categories_by_price,accommodation_facility
0,holiday_package,1,3,budget,1 star
1,holiday_package,2,4,budget,1 star
2,holiday_package,3,5,budget,1 star
3,holiday_package,6,7,standard,3 star
4,offbeat_package,7,8,standard,3 star
...,...,...,...,...,...
2423,beach_holiday,3,5,budget,1 star
2424,beach_holiday,4,5,budget,1 star
2425,family_package,3,4,standard,3 star
2426,nature_package,7,8,premium,4 star


# Trying out the predictions

In [74]:
X.columns

Index(['category_of_package', 'count_of_cities', 'no_of_days',
       'categories_by_price', 'accommodation_facility'],
      dtype='object')

In [75]:
X.iloc[0].values

array(['holiday_package', 1, 3, 'budget', '1 star'], dtype=object)

In [76]:
data = [['family_package', 8, 12, 'budget','1 star']]
columns = ['category_of_package', 'count_of_cities', 'no_of_days','categories_by_price', 'accommodation_facility']

one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,category_of_package,count_of_cities,no_of_days,categories_by_price,accommodation_facility
0,family_package,8,12,budget,1 star


In [77]:
np.expm1(pipeline.predict(one_df))

array([12694.46340484])

In [78]:
X.dtypes

category_of_package       object
count_of_cities            int64
no_of_days                 int64
categories_by_price       object
accommodation_facility    object
dtype: object