In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import r2_score
import seaborn
import matplotlib.pyplot as plt

### Reading data

In [4]:
data = pd.read_excel('cleaned_Travelpac_removed_outliers.xlsx', 'Sheet1')
print(data.shape)
data.head()

(316678, 13)


Unnamed: 0,Age,Duration_of_Visit,Holiday_Package,Sex,Year,country,mode,purpose,quarter,where_contact_lives,visits,nights,Amount_Spent
0,0-15,4-13 nights,Independent,Male,2009,Austria,Air,Holiday,Jan-Mar,UK residents,1600.680374,11204.762616,1103402.0
1,0-15,4-13 nights,Independent,Female,2009,Austria,Air,Holiday,Jan-Mar,UK residents,1600.680374,11204.762616,1125278.0
2,16-24,4-13 nights,Independent,Male,2009,Austria,Air,Holiday,Jan-Mar,UK residents,3064.743058,20873.377956,1622982.0
3,16-24,4-13 nights,Independent,Female,2009,Austria,Air,Holiday,Jan-Mar,UK residents,2702.755561,12411.702616,1164191.0
4,16-24,14-27 nights,Independent,Female,2009,Austria,Air,Holiday,Jan-Mar,UK residents,525.351507,7354.921102,763335.7


# Model to predict Amount spent
### Splittin data into target and feature variables

In [5]:
#target variable
labels = data['Amount_Spent']
#Include all other variables as feature variables
features = data[['Age','Sex','country','Duration_of_Visit','Year','mode',
                     'purpose','quarter','where_contact_lives', 'visits','nights', 'Holiday_Package']]

print(f'label shape {labels.shape}')
print('First Five sample dataset for labels')
print(f'features shape {features.shape}')
print(labels.head())
features.head()

label shape (316678,)
First Five sample dataset for labels
features shape (316678, 12)
0    1.103402e+06
1    1.125278e+06
2    1.622982e+06
3    1.164191e+06
4    7.633357e+05
Name: Amount_Spent, dtype: float64


Unnamed: 0,Age,Sex,country,Duration_of_Visit,Year,mode,purpose,quarter,where_contact_lives,visits,nights,Holiday_Package
0,0-15,Male,Austria,4-13 nights,2009,Air,Holiday,Jan-Mar,UK residents,1600.680374,11204.762616,Independent
1,0-15,Female,Austria,4-13 nights,2009,Air,Holiday,Jan-Mar,UK residents,1600.680374,11204.762616,Independent
2,16-24,Male,Austria,4-13 nights,2009,Air,Holiday,Jan-Mar,UK residents,3064.743058,20873.377956,Independent
3,16-24,Female,Austria,4-13 nights,2009,Air,Holiday,Jan-Mar,UK residents,2702.755561,12411.702616,Independent
4,16-24,Female,Austria,14-27 nights,2009,Air,Holiday,Jan-Mar,UK residents,525.351507,7354.921102,Independent


In [6]:
# Split our data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(features,
                                                          labels,
                                                          test_size=0.3,
                                                          random_state=42)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)


(221674, 12)
(95004, 12)
(221674,)
(95004,)


##### Pre-processing categorical variables

In [7]:
#For training data set 
X_train_cat = pd.DataFrame(X_train[['Age','Sex','country','Duration_of_Visit','mode',
                     'purpose','quarter','where_contact_lives']])
X_train_cat_encoded = pd.get_dummies(X_train_cat)
print(f" The shape of X_train_cat_encoded is: {X_train_cat_encoded.shape}")

#For testing data set
X_test_cat = pd.DataFrame(X_test[['Age','Sex','country','Duration_of_Visit','mode',
                                  'purpose','quarter','where_contact_lives']])
X_test_cat_encoded = pd.get_dummies(X_test_cat)
print(f" The shape of X_test_cat_encoded is: {X_test_cat_encoded.shape}")

 The shape of X_train_cat_encoded is: (221674, 90)
 The shape of X_test_cat_encoded is: (95004, 90)


##### Normalize the continuos variables

In [8]:
#normalise the continuous variables (and Year) for training set 
X_train_cont = X_train[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_train_cont)
X_train_cont_new = pd.DataFrame(x_scaled, columns=X_train_cont.columns, index=X_train_cont.index)
X_train_cont_new.head()

Unnamed: 0,visits,nights,Year
96106,0.413912,0.265616,0.25
96625,0.02173,0.135142,0.25
134745,0.084941,0.026785,0.333333
82810,0.043506,0.02548,0.166667
87092,0.103436,0.126274,0.166667


In [9]:
#normalise the continuous variables (and Year) for testing set 
X_test_cont = X_test[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_test_cont)
X_test_cont_new = pd.DataFrame(x_scaled, columns=X_test_cont.columns, index=X_test_cont.index)
X_test_cont_new.head()

Unnamed: 0,visits,nights,Year
44145,0.086519,0.067653,0.083333
21792,0.212311,0.017943,0.0
120381,0.043165,0.015846,0.333333
310057,0.059404,0.113076,1.0
269541,0.233586,0.169924,0.75


##### Add DataFrame of continuous variables and categorical variables together for a complete training set

In [10]:
train_data = X_train_cat_encoded.join(X_train_cont)
#Do the same for test data
test_data = X_test_cat_encoded.join(X_test_cont)
print(train_data.shape)
print(test_data.shape)

(221674, 93)
(95004, 93)


##### Fitting linear regression model for Amount spent

In [11]:
model=linear_model.LinearRegression()

In [12]:
model.fit(train_data , Y_train )

LinearRegression()

In [13]:
Y_pred =model.predict(test_data)

In [14]:
lin_reg=model.score(test_data, Y_test)

In [15]:
lin_reg_r2=r2_score(Y_test, Y_pred)

In [16]:
# Mean absolute error MAE, and Mean Squared Error MSE
mae_lr = round(metrics.mean_absolute_error(Y_test, Y_pred), 4)
mse_lr = round(metrics.mean_squared_error(Y_test, Y_pred), 4)
  

##### Fitting Random Forest Regression to the dataset

In [17]:

# import the regressor
from sklearn.ensemble import RandomForestRegressor
 
 # create regressor object
regressor = RandomForestRegressor(n_estimators = 100, random_state = 42)
 
# fit the regressor with x and y data
regressor.fit(train_data , Y_train ) 

RandomForestRegressor(random_state=42)

In [18]:
Y_pred1=regressor.predict(test_data)

In [19]:
rfr =regressor.score(test_data , Y_test)

In [20]:
rfr_r2=r2_score(Y_test, Y_pred1)

In [21]:
mae_rfr = metrics.mean_absolute_error(Y_test, Y_pred1)
mse_rfr = metrics.mean_squared_error(Y_test, Y_pred1)

##### Comparing the two models

In [22]:
compare_models = pd.DataFrame(
    {  'Model' : ['Multiple Linear Regression', 'Random Forest Regression'],
        'r2_Score' : [lin_reg_r2, rfr_r2],
        'MAE'  : [mae_lr, mae_rfr],
        'MSE'  : [mse_lr, mse_rfr]
    })

In [23]:
print(compare_models)

                        Model  r2_Score            MAE           MSE
0  Multiple Linear Regression  0.504365  602585.128800  8.159056e+11
1    Random Forest Regression  0.564633  529740.591477  7.166924e+11


- We can now see the score and error of our models and compare them. Score of Random forest Regression is greater then Linear regression and error is also less.Thus, Random forest Regression will be the right choice for our model.

### Feature selection to make our Random forest Model better

##### Calculate the pearson coefficient between Amount spent and other  continuos variables

In [24]:
from scipy.stats import pearsonr

In [25]:
#rel between Amount spent and nights
Amount = data['Amount_Spent']
nights = data['nights']
corr = pearsonr(Amount , nights)
corr

(0.5123343084018617, 0.0)

In [26]:
#rel between Amount spent and visits
Amount = data['Amount_Spent']
visits = data['visits']
corr = pearsonr(Amount , visits)
corr

(0.5231494020402545, 0.0)

In [27]:
#rel between Amount spent and year
Amount = data['Amount_Spent']
year = data['Year']
corr = pearsonr(Amount , year)
corr

(0.08158186261723734, 0.0)

- Conclusion: A positive relationship exist between 'Amount spent and number of visits' and  'Amount spent and the number of  nights' and  'Year and the Amount spent'.

#### Calculate the relationship between the Amount spent and each of the categorical variable using covariance matrix

##### Correlation Matrix between Amount and Age

In [28]:
#Convert the categorical data to dummy
age=pd.get_dummies(data['Age'])
age.head()

Unnamed: 0,0-15,16-24,25-34,35-44,45-54,55-64,65 & over
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0


In [29]:
#Combine the continuos and categorical
amount_age=pd.concat([Amount,age] ,axis=1)

#Put in numpy array
x_age=amount_age.values
corr_matrix=np.corrcoef(x_age.T)

print(amount_age.columns)
print(corr_matrix)

Index(['Amount_Spent', '0-15', '16-24', '25-34', '35-44', '45-54', '55-64',
       '65 & over'],
      dtype='object')
[[ 1.         -0.07084424 -0.05281637  0.01376273  0.02912662  0.03652231
   0.01587742 -0.00645651]
 [-0.07084424  1.         -0.09806623 -0.12463462 -0.12228439 -0.11849331
  -0.10383129 -0.08244782]
 [-0.05281637 -0.09806623  1.         -0.19213764 -0.18851451 -0.18267016
  -0.16006707 -0.12710216]
 [ 0.01376273 -0.12463462 -0.19213764  1.         -0.23958741 -0.23215969
  -0.20343291 -0.16153704]
 [ 0.02912662 -0.12228439 -0.18851451 -0.23958741  1.         -0.22778187
  -0.19959679 -0.15849095]
 [ 0.03652231 -0.11849331 -0.18267016 -0.23215969 -0.22778187  1.
  -0.19340887 -0.15357739]
 [ 0.01587742 -0.10383129 -0.16006707 -0.20343291 -0.19959679 -0.19340887
   1.         -0.13457416]
 [-0.00645651 -0.08244782 -0.12710216 -0.16153704 -0.15849095 -0.15357739
  -0.13457416  1.        ]]


- Result: The amount spent has a weak negative correlation with ages 0-15 ,  16-24 and 65&over , While it has a weak positive correlation with the remaining ages, i.e ages 25-34 , 35-44 , 45-54 and 55-64. In other words , Amount spent is slightly increased if the individual is a working class , while it reduces with more non working class 

- Conclusion: Amount spent has a weak correlation with ages

##### Correlation Matrix between Amount and Sex

In [30]:
#Convert the categorical data to dummy
sex=pd.get_dummies(data['Sex'])
sex.head()

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,0,1
3,1,0
4,1,0


In [31]:
#Combine the continuos and categorical
amount_sex=pd.concat([Amount,sex] ,axis=1)

#Put in numpy array
x_sex=amount_sex.values
corr_matrix=np.corrcoef(x_sex.T)

print(amount_sex.columns)
print(corr_matrix)

Index(['Amount_Spent', 'Female', 'Male'], dtype='object')
[[ 1.         -0.04231468  0.04231468]
 [-0.04231468  1.         -1.        ]
 [ 0.04231468 -1.          1.        ]]


- Result: The Amount Spent has a weak negative relationship with the female sex, i.e for an increase in the female genger, 
    the amount spent decreases. Amount spent has a weak positive relationship eith the male gender
- Conclusion: Gender do not have much relation ship with the amount spent, as the weak negative relationship with the 
    female seems to be balanced by a weak positive relationship with the male gender
    

##### Correlation Matrix between Amount and Country

In [32]:
#Convert the categorical data to dummy
country=pd.get_dummies(data['country'])
print(country.head())
#Combine the continuos and categorical
amount_country=pd.concat([Amount,country] ,axis=1)

#Put in numpy array
x_country=amount_country.values
corr_matrix=np.corrcoef(x_country.T)

print(amount_country.columns)
print(corr_matrix)

   Australia  Austria  Barbados  Belgium  Bulgaria  Canada  China - Hong Kong  \
0          0        1         0        0         0       0                  0   
1          0        1         0        0         0       0                  0   
2          0        1         0        0         0       0                  0   
3          0        1         0        0         0       0                  0   
4          0        1         0        0         0       0                  0   

   China - Other  Croatia  Cyprus EU  ...  South Africa  Spain  Sri Lanka  \
0              0        0          0  ...             0      0          0   
1              0        0          0  ...             0      0          0   
2              0        0          0  ...             0      0          0   
3              0        0          0  ...             0      0          0   
4              0        0          0  ...             0      0          0   

   Sweden  Switzerland  Thailand  Tunisia  Turkey 

##### Correlation Matrix between Amount and Duration_of_Visit

In [33]:
#Convert the categorical data to dummy
Duration_of_Visit=pd.get_dummies(data['Duration_of_Visit'])
print(Duration_of_Visit.head())
#Combine the continuos and categorical
amount_duration=pd.concat([Amount,Duration_of_Visit] ,axis=1)

#Put in numpy array
x_duration=amount_duration.values
corr_matrix=np.corrcoef(x_duration.T)

print(amount_duration.columns)
print(corr_matrix)

   1-3 nights  14-27 nights  28-90 nights  3-6 months  4-13 nights  \
0           0             0             0           0            1   
1           0             0             0           0            1   
2           0             0             0           0            1   
3           0             0             0           0            1   
4           0             1             0           0            0   

   6 months-year  
0              0  
1              0  
2              0  
3              0  
4              0  
Index(['Amount_Spent', '1-3 nights', '14-27 nights', '28-90 nights',
       '3-6 months', '4-13 nights', '6 months-year'],
      dtype='object')
[[ 1.         -0.20565728  0.11439228  0.03919663  0.02894774  0.07308083
   0.01749842]
 [-0.20565728  1.         -0.31674222 -0.19508831 -0.0635727  -0.56101727
  -0.02017483]
 [ 0.11439228 -0.31674222  1.         -0.14155538 -0.04612813 -0.40707214
  -0.01463878]
 [ 0.03919663 -0.19508831 -0.14155538  1.         -0.

##### Correlation Matrix between Amount and Mode of transportation

In [34]:
#Convert the categorical data to dummy
mode=pd.get_dummies(data['mode'])
print(mode.head())
#Combine the continuos and categorical
amount_mode=pd.concat([Amount,mode] ,axis=1)

#Put in numpy array
x_mode=amount_mode.values
corr_matrix=np.corrcoef(x_duration.T)

print(amount_mode.columns)
print(corr_matrix)

   Air  Sea  Tunnel
0    1    0       0
1    1    0       0
2    1    0       0
3    1    0       0
4    1    0       0
Index(['Amount_Spent', 'Air', 'Sea', 'Tunnel'], dtype='object')
[[ 1.         -0.20565728  0.11439228  0.03919663  0.02894774  0.07308083
   0.01749842]
 [-0.20565728  1.         -0.31674222 -0.19508831 -0.0635727  -0.56101727
  -0.02017483]
 [ 0.11439228 -0.31674222  1.         -0.14155538 -0.04612813 -0.40707214
  -0.01463878]
 [ 0.03919663 -0.19508831 -0.14155538  1.         -0.0284113  -0.25072444
  -0.00901634]
 [ 0.02894774 -0.0635727  -0.04612813 -0.0284113   1.         -0.08170264
  -0.00293812]
 [ 0.07308083 -0.56101727 -0.40707214 -0.25072444 -0.08170264  1.
  -0.02592837]
 [ 0.01749842 -0.02017483 -0.01463878 -0.00901634 -0.00293812 -0.02592837
   1.        ]]


##### Correlation Matrix between Amount and purpose 

In [35]:
#Convert the categorical data to dummy
purpose=pd.get_dummies(data['purpose'])
print(purpose.head())
#Combine the continuos and categorical
amount_purpose=pd.concat([Amount,purpose] ,axis=1)

#Put in numpy array
x_purpose=amount_purpose.values
corr_matrix=np.corrcoef(x_purpose.T)

print(amount_purpose.columns)
print(corr_matrix)

   Business  Holiday  Miscellaneous  Study  VFR
0         0        1              0      0    0
1         0        1              0      0    0
2         0        1              0      0    0
3         0        1              0      0    0
4         0        1              0      0    0
Index(['Amount_Spent', 'Business', 'Holiday', 'Miscellaneous', 'Study', 'VFR'], dtype='object')
[[ 1.          0.06096714  0.13103451 -0.0853006   0.0377893  -0.14731598]
 [ 0.06096714  1.         -0.37400804 -0.13047674 -0.05768164 -0.30256514]
 [ 0.13103451 -0.37400804  1.         -0.25180328 -0.11131812 -0.58391169]
 [-0.0853006  -0.13047674 -0.25180328  1.         -0.03883453 -0.2037039 ]
 [ 0.0377893  -0.05768164 -0.11131812 -0.03883453  1.         -0.09005417]
 [-0.14731598 -0.30256514 -0.58391169 -0.2037039  -0.09005417  1.        ]]


##### Correlation Matrix between Amount and quarter

In [36]:
#Convert the categorical data to dummy
quarter=pd.get_dummies(data['quarter'])
print(quarter.head())
#Combine the continuos and categorical
amount_quarter=pd.concat([Amount,quarter] ,axis=1)

#Put in numpy array
x_quarter=amount_quarter.values
corr_matrix=np.corrcoef(x_quarter.T)

print(amount_quarter.columns)
print(corr_matrix)

   Apr-Jun  Jan-Mar  Jul-Sep  Oct-Dec
0        0        1        0        0
1        0        1        0        0
2        0        1        0        0
3        0        1        0        0
4        0        1        0        0
Index(['Amount_Spent', 'Apr-Jun', 'Jan-Mar', 'Jul-Sep', 'Oct-Dec'], dtype='object')
[[ 1.         -0.00823116 -0.03298149  0.03732658  0.00224328]
 [-0.00823116  1.         -0.32188122 -0.35383009 -0.32234614]
 [-0.03298149 -0.32188122  1.         -0.34368802 -0.31310651]
 [ 0.03732658 -0.35383009 -0.34368802  1.         -0.34418444]
 [ 0.00224328 -0.32234614 -0.31310651 -0.34418444  1.        ]]


##### Correlation Matrix between Amount and where contact lives

In [37]:
#Convert the categorical data to dummy
where_contact_lives=pd.get_dummies(data['where_contact_lives'])
print(where_contact_lives.head())
#Combine the continuos and categorical
amount_where_contact_lives=pd.concat([Amount,where_contact_lives] ,axis=1)

#Put in numpy array
x_where_contact_lives=amount_where_contact_lives.values
corr_matrix=np.corrcoef(x_where_contact_lives.T)

print(amount_where_contact_lives.columns)
print(corr_matrix)

   Overseas residents  UK residents
0                   0             1
1                   0             1
2                   0             1
3                   0             1
4                   0             1
Index(['Amount_Spent', 'Overseas residents', 'UK residents'], dtype='object')
[[ 1.         -0.09981649  0.09981649]
 [-0.09981649  1.         -1.        ]
 [ 0.09981649 -1.          1.        ]]


##### Correlation Matrix between Amount and Holiday Package

In [38]:
#Convert the categorical data to dummy
Holiday_Package=pd.get_dummies(data['Holiday_Package'])
print(Holiday_Package.head())
#Combine the continuos and categorical
amount_Holiday_Package=pd.concat([Amount,Holiday_Package] ,axis=1)

#Put in numpy array
x_Holiday_Package=amount_Holiday_Package.values
corr_matrix=np.corrcoef(x_Holiday_Package.T)

print(amount_Holiday_Package.columns)
print(corr_matrix)

   Independent  Non-Independent
0            1                0
1            1                0
2            1                0
3            1                0
4            1                0
Index(['Amount_Spent', 'Independent', 'Non-Independent'], dtype='object')
[[ 1.         -0.06117148  0.06117148]
 [-0.06117148  1.         -1.        ]
 [ 0.06117148 -1.          1.        ]]


### feature selection - Removing one of the continuos variable, Year

In [39]:
#target variable
labels1 = data['Amount_Spent']
#Include all other variables as feature variables
features1 = data[['Age','Sex','country','Duration_of_Visit','mode',
                     'purpose','quarter','where_contact_lives', 'visits','nights', 'Holiday_Package']]

print(f'label shape {labels1.shape}')
print('First Five sample dataset for labels')
print(f'features shape {features1.shape}')
print(labels1.head())
features1.head()

# Split our data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(features1,
                                                          labels1,
                                                          test_size=0.3,
                                                          random_state=42)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

#For training data set 
X_train_cat = pd.DataFrame(X_train[['Age','Sex','country','Duration_of_Visit','mode',
                     'purpose','quarter','where_contact_lives']])
X_train_cat_encoded = pd.get_dummies(X_train_cat)
print(f" The shape of X_train_cat_encoded is: {X_train_cat_encoded.shape}")

#For testing data set
X_test_cat = pd.DataFrame(X_test[['Age','Sex','country','Duration_of_Visit','mode',
                                  'purpose','quarter','where_contact_lives']])
X_test_cat_encoded = pd.get_dummies(X_test_cat)
print(f" The shape of X_test_cat_encoded is: {X_test_cat_encoded.shape}")


##### Pre-processing categorical variables
#normalise the continuous variables (and Year) for training set 
X_train_cont = X_train[['visits','nights']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_train_cont)
X_train_cont_new = pd.DataFrame(x_scaled, columns=X_train_cont.columns, index=X_train_cont.index)
X_train_cont_new.head()

#normalise the continuous variables (and Year) for testing set 
X_test_cont = X_test[['visits','nights']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_test_cont)
X_test_cont_new = pd.DataFrame(x_scaled, columns=X_test_cont.columns, index=X_test_cont.index)
X_test_cont_new.head()


##### Add DataFrame of continuous variables and categorical variables together for a complete training set
train_data = X_train_cat_encoded.join(X_train_cont)
#Do the same for test data
test_data = X_test_cat_encoded.join(X_test_cont)
print(train_data.shape)
print(test_data.shape)


##### Fitting Random Forest Regression to the dataset
 
 # create regressor object
regressor1 = RandomForestRegressor(n_estimators = 100, random_state = 42)
 
# fit the regressor with x and y data
regressor1.fit(train_data , Y_train ) 

### Predicting with the model fit using test data
Y_pred1=regressor1.predict(test_data)
rfr =regressor1.score(test_data , Y_test)
rfr_r2=r2_score(Y_test, Y_pred1)
#### Mean square error
mae_rfr = metrics.mean_absolute_error(Y_test, Y_pred1)
mse_rfr = metrics.mean_squared_error(Y_test, Y_pred1)

label shape (316678,)
First Five sample dataset for labels
features shape (316678, 11)
0    1.103402e+06
1    1.125278e+06
2    1.622982e+06
3    1.164191e+06
4    7.633357e+05
Name: Amount_Spent, dtype: float64
(221674, 11)
(95004, 11)
(221674,)
(95004,)
 The shape of X_train_cat_encoded is: (221674, 90)
 The shape of X_test_cat_encoded is: (95004, 90)
(221674, 92)
(95004, 92)


In [40]:
print(f'Model R-Squared score , Mean Absolute Error  and Mean Square Error after removing Year')
print('\n')
model_result = pd.DataFrame(
    {  'Model' : ['Random Forest Regression'],
        'r2_Score' : [ rfr_r2],
        'MAE'  : [mae_rfr],
        'MSE'  : [mse_rfr]
    })
print(model_result)

Model R-Squared score , Mean Absolute Error  and Mean Square Error after removing Year


                      Model  r2_Score            MAE           MSE
0  Random Forest Regression  0.557524  534342.744479  7.283966e+11


### feature selection - Removing one of categorical variable,  Sex

In [None]:
#target variable
labels1 = data['Amount_Spent']
#Include all other variables as feature variables
features1 = data[['Age','country','Duration_of_Visit','Year','mode',
                     'purpose','quarter','where_contact_lives', 'visits','nights', 'Holiday_Package']]

print(f'label shape {labels1.shape}')
print('First Five sample dataset for labels')
print(f'features shape {features1.shape}')
print(labels1.head())
features1.head()

# Split our data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(features1,
                                                          labels1,
                                                          test_size=0.3,
                                                          random_state=42)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

#For training data set 
X_train_cat = pd.DataFrame(X_train[['Age','country','Duration_of_Visit','mode',
                     'purpose','quarter','where_contact_lives']])
X_train_cat_encoded = pd.get_dummies(X_train_cat)
print(f" The shape of X_train_cat_encoded is: {X_train_cat_encoded.shape}")

#For testing data set
X_test_cat = pd.DataFrame(X_test[['Age','country','Duration_of_Visit','mode',
                                  'purpose','quarter','where_contact_lives']])
X_test_cat_encoded = pd.get_dummies(X_test_cat)
print(f" The shape of X_test_cat_encoded is: {X_test_cat_encoded.shape}")


##### Pre-processing categorical variables
#normalise the continuous variables (and Year) for training set 
X_train_cont = X_train[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_train_cont)
X_train_cont_new = pd.DataFrame(x_scaled, columns=X_train_cont.columns, index=X_train_cont.index)
X_train_cont_new.head()

#normalise the continuous variables (and Year) for testing set 
X_test_cont = X_test[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_test_cont)
X_test_cont_new = pd.DataFrame(x_scaled, columns=X_test_cont.columns, index=X_test_cont.index)
X_test_cont_new.head()


##### Add DataFrame of continuous variables and categorical variables together for a complete training set
train_data = X_train_cat_encoded.join(X_train_cont)
#Do the same for test data
test_data = X_test_cat_encoded.join(X_test_cont)
print(train_data.shape)
print(test_data.shape)


##### Fitting Random Forest Regression to the dataset
 
 # create regressor object
regressor2 = RandomForestRegressor(n_estimators = 100, random_state = 42)
 
# fit the regressor with x and y data
regressor2.fit(train_data , Y_train ) 

### Predicting with the model fit using test data
Y_pred1=regressor2.predict(test_data)
rfr =regressor2.score(test_data , Y_test)
rfr_r2=r2_score(Y_test, Y_pred1)
#### Mean square error
mae_rfr = metrics.mean_absolute_error(Y_test, Y_pred1)
mse_rfr = metrics.mean_squared_error(Y_test, Y_pred1)

In [None]:
print(f'Model r Squared score , Mean Absolute Error  and Mean Square Error after removing Sex')
model_result = pd.DataFrame(
    {  'Model' : ['Random Forest Regression'],
        'r2_Score' : [ rfr_r2],
        'MAE'  : [mae_rfr],
        'MSE'  : [mse_rfr]
    })
print(model_result)

### feature selection - Removing one of categorical variable,  purpose

In [48]:
#target variable
labels1 = data['Amount_Spent']
#Include all other variables as feature variables
features1 = data[['Age','country','Duration_of_Visit','Year','mode',
                     'Sex','quarter','where_contact_lives', 'visits','nights', 'Holiday_Package']]

print(f'label shape {labels1.shape}')
print('First Five sample dataset for labels')
print(f'features shape {features1.shape}')
print(labels1.head())
features1.head()

# Split our data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(features1,
                                                          labels1,
                                                          test_size=0.3,
                                                          random_state=42)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

#For training data set 
X_train_cat = pd.DataFrame(X_train[['Age','country','Duration_of_Visit','mode',
                     'Sex','quarter','where_contact_lives']])
X_train_cat_encoded = pd.get_dummies(X_train_cat)
print(f" The shape of X_train_cat_encoded is: {X_train_cat_encoded.shape}")

#For testing data set
X_test_cat = pd.DataFrame(X_test[['Age','country','Duration_of_Visit','mode',
                                  'Sex','quarter','where_contact_lives']])
X_test_cat_encoded = pd.get_dummies(X_test_cat)
print(f" The shape of X_test_cat_encoded is: {X_test_cat_encoded.shape}")


##### Pre-processing categorical variables
#normalise the continuous variables (and Year) for training set 
X_train_cont = X_train[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_train_cont)
X_train_cont_new = pd.DataFrame(x_scaled, columns=X_train_cont.columns, index=X_train_cont.index)
X_train_cont_new.head()

#normalise the continuous variables (and Year) for testing set 
X_test_cont = X_test[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_test_cont)
X_test_cont_new = pd.DataFrame(x_scaled, columns=X_test_cont.columns, index=X_test_cont.index)
X_test_cont_new.head()


##### Add DataFrame of continuous variables and categorical variables together for a complete training set
train_data = X_train_cat_encoded.join(X_train_cont)
#Do the same for test data
test_data = X_test_cat_encoded.join(X_test_cont)
print(train_data.shape)
print(test_data.shape)


##### Fitting Random Forest Regression to the dataset
 
 # create regressor object
regressor3 = RandomForestRegressor(n_estimators = 100, random_state = 42)
 
# fit the regressor with x and y data
regressor3.fit(train_data , Y_train ) 

### Predicting with the model fit using test data
Y_pred1=regressor3.predict(test_data)
rfr =regressor3.score(test_data , Y_test)
rfr_r2=r2_score(Y_test, Y_pred1)
#### Mean square error
mae_rfr = metrics.mean_absolute_error(Y_test, Y_pred1)
mse_rfr = metrics.mean_squared_error(Y_test, Y_pred1)

label shape (206222,)
First Five sample dataset for labels
features shape (206222, 11)
0    1.103402e+06
1    1.125278e+06
2    1.622982e+06
3    1.164191e+06
4    7.633357e+05
Name: Amount_Spent, dtype: float64
(144355, 11)
(61867, 11)
(144355,)
(61867,)
 The shape of X_train_cat_encoded is: (144355, 59)
 The shape of X_test_cat_encoded is: (61867, 59)
(144355, 62)
(61867, 62)


In [49]:
print(f'Model r squared score , Mean Absolute Error  and Mean Square Error after removing Purpose')
model_result = pd.DataFrame(
    {  'Model' : ['Random Forest Regression'],
        'r2_Score' : [ rfr_r2],
        'MAE'  : [mae_rfr],
        'MSE'  : [mse_rfr]
    })
print(model_result)

Model r squared score , Mean Absolute Error  and Mean Square Error after removing Purpose
                      Model  r2_Score            MAE           MSE
0  Random Forest Regression  0.510698  479732.107285  5.491521e+11


### feature selection - Removing one of categorical variable, Where_contact_lives

In [51]:
#target variable
labels1 = data['Amount_Spent']
#Include all other variables as feature variables
features1 = data[['Age','country','Duration_of_Visit','Year','mode',
                     'purpose','quarter','Sex', 'visits','nights', 'Holiday_Package']]

print(f'label shape {labels1.shape}')
print('First Five sample dataset for labels')
print(f'features shape {features1.shape}')
print(labels1.head())
features1.head()

# Split our data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(features1,
                                                          labels1,
                                                          test_size=0.3,
                                                          random_state=42)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

#For training data set 
X_train_cat = pd.DataFrame(X_train[['Age','country','Duration_of_Visit','mode',
                     'purpose','quarter','Sex']])
X_train_cat_encoded = pd.get_dummies(X_train_cat)
print(f" The shape of X_train_cat_encoded is: {X_train_cat_encoded.shape}")

#For testing data set
X_test_cat = pd.DataFrame(X_test[['Age','country','Duration_of_Visit','mode',
                                  'purpose','quarter','Sex']])
X_test_cat_encoded = pd.get_dummies(X_test_cat)
print(f" The shape of X_test_cat_encoded is: {X_test_cat_encoded.shape}")


##### Pre-processing categorical variables
#normalise the continuous variables (and Year) for training set 
X_train_cont = X_train[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_train_cont)
X_train_cont_new = pd.DataFrame(x_scaled, columns=X_train_cont.columns, index=X_train_cont.index)
X_train_cont_new.head()

#normalise the continuous variables (and Year) for testing set 
X_test_cont = X_test[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_test_cont)
X_test_cont_new = pd.DataFrame(x_scaled, columns=X_test_cont.columns, index=X_test_cont.index)
X_test_cont_new.head()


##### Add DataFrame of continuous variables and categorical variables together for a complete training set
train_data = X_train_cat_encoded.join(X_train_cont)
#Do the same for test data
test_data = X_test_cat_encoded.join(X_test_cont)
print(train_data.shape)
print(test_data.shape)


##### Fitting Random Forest Regression to the dataset
 
 # create regressor object
regressor4 = RandomForestRegressor(n_estimators = 100, random_state = 42)
 
# fit the regressor with x and y data
regressor4.fit(train_data , Y_train ) 

### Predicting with the model fit using test data
Y_pred1=regressor4.predict(test_data)
rfr =regressor4.score(test_data , Y_test)
rfr_r2=r2_score(Y_test, Y_pred1)
#### Mean square error
mae_rfr = metrics.mean_absolute_error(Y_test, Y_pred1)
mse_rfr = metrics.mean_squared_error(Y_test, Y_pred1)

label shape (206222,)
First Five sample dataset for labels
features shape (206222, 11)
0    1.103402e+06
1    1.125278e+06
2    1.622982e+06
3    1.164191e+06
4    7.633357e+05
Name: Amount_Spent, dtype: float64
(144355, 11)
(61867, 11)
(144355,)
(61867,)
 The shape of X_train_cat_encoded is: (144355, 62)
 The shape of X_test_cat_encoded is: (61867, 62)
(144355, 65)
(61867, 65)


In [1]:
print(f'Model r squared score , Mean Absolute Error  and Mean Square Error after removing the column "Where contact lives"')
model_result = pd.DataFrame(
    {  'Model' : ['Random Forest Regression'],
        'r2_Score' : [ rfr_r2],
        'MAE'  : [mae_rfr],
        'MSE'  : [mse_rfr]
    })
print(model_result)

Model r squared score , Mean Absolute Error  and Mean Square Error after removing the column "Where contact lives"


NameError: name 'pd' is not defined

### feature selection - Removing one of categorical variable, country

In [None]:
#target variable
labels1 = data['Amount_Spent']
#Include all other variables as feature variables
features1 = data[['Age','where_contact_lives','Duration_of_Visit','Year','mode',
                     'purpose','quarter','Sex', 'visits','nights', 'Holiday_Package']]

print(f'label shape {labels1.shape}')
print('First Five sample dataset for labels')
print(f'features shape {features1.shape}')
print(labels1.head())
features1.head()

# Split our data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(features1,
                                                          labels1,
                                                          test_size=0.3,
                                                          random_state=42)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

#For training data set 
X_train_cat = pd.DataFrame(X_train[['Age','where_contact_lives','Duration_of_Visit','mode',
                     'purpose','quarter','Sex']])
X_train_cat_encoded = pd.get_dummies(X_train_cat)
print(f" The shape of X_train_cat_encoded is: {X_train_cat_encoded.shape}")

#For testing data set
X_test_cat = pd.DataFrame(X_test[['Age','where_contact_lives','Duration_of_Visit','mode',
                                  'purpose','quarter','Sex']])
X_test_cat_encoded = pd.get_dummies(X_test_cat)
print(f" The shape of X_test_cat_encoded is: {X_test_cat_encoded.shape}")


##### Pre-processing categorical variables
#normalise the continuous variables (and Year) for training set 
X_train_cont = X_train[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_train_cont)
X_train_cont_new = pd.DataFrame(x_scaled, columns=X_train_cont.columns, index=X_train_cont.index)
X_train_cont_new.head()

#normalise the continuous variables (and Year) for testing set 
X_test_cont = X_test[['visits','nights', 'Year']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_test_cont)
X_test_cont_new = pd.DataFrame(x_scaled, columns=X_test_cont.columns, index=X_test_cont.index)
X_test_cont_new.head()


##### Add DataFrame of continuous variables and categorical variables together for a complete training set
train_data = X_train_cat_encoded.join(X_train_cont)
#Do the same for test data
test_data = X_test_cat_encoded.join(X_test_cont)
print(train_data.shape)
print(test_data.shape)


##### Fitting Random Forest Regression to the dataset
 
 # create regressor object
regressor5 = RandomForestRegressor(n_estimators = 200, random_state = 42)
 
# fit the regressor with x and y data
regressor5.fit(train_data , Y_train ) 

### Predicting with the model fit using test data
Y_pred1=regressor5.predict(test_data)
rfr =regressor5.score(test_data , Y_test)
rfr_r2=r2_score(Y_test, Y_pred1)
#### Mean square error
mae_rfr = metrics.mean_absolute_error(Y_test, Y_pred1)
mse_rfr = metrics.mean_squared_error(Y_test, Y_pred1)

label shape (206222,)
First Five sample dataset for labels
features shape (206222, 11)
0    1.103402e+06
1    1.125278e+06
2    1.622982e+06
3    1.164191e+06
4    7.633357e+05
Name: Amount_Spent, dtype: float64
(144355, 11)
(61867, 11)
(144355,)
(61867,)
 The shape of X_train_cat_encoded is: (144355, 29)
 The shape of X_test_cat_encoded is: (61867, 29)
(144355, 32)
(61867, 32)


In [None]:
print(f'Model r squared score , Mean Absolute Error  and Mean Square Error after removing the column "country"')
model_result = pd.DataFrame(
    {  'Model' : ['Random Forest Regression'],
        'r2_Score' : [ rfr_r2],
        'MAE'  : [mae_rfr],
        'MSE'  : [mse_rfr]
    })
print(model_result)