In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

### Reading data

In [42]:
data = pd.read_excel('Travelpac_UK_Resident.xlsx', 'Sheet1')
data.shape
data.head()

Unnamed: 0,Year,quarter,where_contact_lives,mode,country,purpose,Holiday_Package,Age,Sex,Duration_of_Visit,visits,nights,Amount_Spent
0,2009,Jan-Mar,UK residents,Air,Austria,Holiday,Independent,0-15,Male,4-13 nights,1600.680374,11204.762616,1103402.0
1,2009,Jan-Mar,UK residents,Air,Austria,Holiday,Independent,0-15,Female,4-13 nights,1600.680374,11204.762616,1125278.0
2,2009,Jan-Mar,UK residents,Air,Austria,Holiday,Independent,16-24,Male,4-13 nights,3064.743058,20873.377956,1622982.0
3,2009,Jan-Mar,UK residents,Air,Austria,Holiday,Independent,16-24,Female,4-13 nights,2702.755561,12411.702616,1164191.0
4,2009,Jan-Mar,UK residents,Air,Austria,Holiday,Independent,16-24,Female,14-27 nights,525.351507,7354.921102,763335.7


### Splittin data into target and feature variables

In [43]:
#target variable
label = data['Holiday_Package']
#Include all other variables as feature variables
features = data[['Age','Sex','country','Duration_of_Visit','Year','mode',
                     'purpose','quarter','where_contact_lives', 'visits','nights', 'Amount_Spent']]

print(label.shape)
print(features.shape)
label.head()
features.head()

(139268,)
(139268, 12)


Unnamed: 0,Age,Sex,country,Duration_of_Visit,Year,mode,purpose,quarter,where_contact_lives,visits,nights,Amount_Spent
0,0-15,Male,Austria,4-13 nights,2009,Air,Holiday,Jan-Mar,UK residents,1600.680374,11204.762616,1103402.0
1,0-15,Female,Austria,4-13 nights,2009,Air,Holiday,Jan-Mar,UK residents,1600.680374,11204.762616,1125278.0
2,16-24,Male,Austria,4-13 nights,2009,Air,Holiday,Jan-Mar,UK residents,3064.743058,20873.377956,1622982.0
3,16-24,Female,Austria,4-13 nights,2009,Air,Holiday,Jan-Mar,UK residents,2702.755561,12411.702616,1164191.0
4,16-24,Female,Austria,14-27 nights,2009,Air,Holiday,Jan-Mar,UK residents,525.351507,7354.921102,763335.7


In [10]:
# Split our data into test and train
train_data, test_data, train_label_data, test_label_data = train_test_split(features,
                                                          label,
                                                          test_size=0.2,
                                                          random_state=42)

print(train_data.shape)
print(test_data.shape)
print(train_label_data.shape)
print(test_label_data.shape)

(111414, 12)
(27854, 12)
(111414,)
(27854,)


In [16]:
#Pre-processing categorical variables

#For training data set 
sample_data_train = pd.DataFrame(train_data[['Age','Sex','country','Duration_of_Visit','mode','purpose','quarter']])
sample_data_train_encoded = pd.get_dummies(sample_data_train)

print(sample_data_train_encoded.shape)

#For testing data set
sample_data_test = pd.DataFrame(test_data[['Age','Sex','country','Duration_of_Visit','mode','purpose','quarter']])

sample_data_test_encoded = pd.get_dummies(sample_data_test)
sample_data_test_encoded = sample_data_test_encoded.reindex(columns = sample_data_train_encoded.columns).fillna(0)

print(sample_data_test_encoded.shape)
sample_data_train_encoded.head()

(111414, 63)
(27854, 63)


Unnamed: 0,Age_0-15,Age_16-24,Age_25-34,Age_35-44,Age_45-54,Age_55-64,Age_65 & over,Sex_Female,Sex_Male,country_Austria,...,mode_Tunnel,purpose_Business,purpose_Holiday,purpose_Miscellaneous,purpose_Study,purpose_VFR,quarter_Apr-Jun,quarter_Jan-Mar,quarter_Jul-Sep,quarter_Oct-Dec
4343,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
136595,0,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
25941,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
105010,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
67614,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,1,1,0,0,0


In [20]:
#normalise the continuous variables (and Year) for training set 
train_continuous_data = train_data[['visits','nights','Amount_Spent', 'Year']]


min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(train_continuous_data)

train_cont_new = pd.DataFrame(x_scaled, columns=train_continuous_data.columns, index=train_continuous_data.index)

train_cont_new.head()

Unnamed: 0,visits,nights,Amount_Spent,Year
4343,0.015123,0.019447,0.005061,0.0
136595,0.001944,0.024709,0.007495,1.0
25941,0.002418,0.000556,0.00016,0.166667
105010,0.012149,0.006465,0.002671,0.666667
67614,0.005129,0.000919,0.000513,0.416667


In [22]:
#normalise the continuous variables (and Year) for testing set 
test_continuous = test_data[['visits','nights','Amount_Spent', 'Year']]

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(test_continuous)

test_cont_new = pd.DataFrame(x_scaled, columns=test_continuous.columns, index=test_continuous.index)

test_cont_new.head()

Unnamed: 0,visits,nights,Amount_Spent,Year
137735,0.011002,0.012554,0.016173,1.0
106864,0.013865,0.004101,0.00376,0.666667
7467,0.002506,0.001191,0.000437,0.0
122545,0.003642,0.001687,0.003479,0.833333
68533,0.024496,0.009171,0.007533,0.416667


In [23]:
#Add DataFrame of continuous variables and categorical variables together for a complete training set
train_new = sample_data_train_encoded.join(train_cont_new)
#Do the same for test data
test_new = sample_data_test_encoded.join(test_cont_new)

print(train_new.shape)
print(test_new.shape)

(111414, 67)
(27854, 67)


### Initial Accuracy Test for Potential Models using K fold Evluation

In [1]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(solver='lbfgs', max_iter=250000)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt

train_new_1 = train_new.values
train_labels_1 = train_label_data.values

seed = 7

models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))


#Evaluate each model
results = []
names = []
scoring = 'accuracy'

for name, model in models:
    kfold = model_selection.KFold(n_splits = 10, random_state = seed , shuffle=True)
    cv_results = model_selection.cross_val_score(model, train_new_1, train_label_data.ravel(), cv=kfold, scoring=scoring)
    
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    
    print(msg)
    
#boxplot for comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

NameError: name 'train_new' is not defined

Having applied KFold Validation to a number of possible classification algorithms, it seems that we have a strong accuracy rating from the following models:
###### Logistic Regression , Random Forests , K Nearest Neighbour , Decision Tree Classifier

since we will select Random Forest because it has the highest accuracy


Unnamed: 0,Age_0-15,Age_16-24,Age_25-34,Age_35-44,Age_45-54,Age_55-64,Age_65 & over,Sex_Female,Sex_Male,country_Austria,...,purpose_Study,purpose_VFR,quarter_Apr-Jun,quarter_Jan-Mar,quarter_Jul-Sep,quarter_Oct-Dec,visits,nights,Amount_Spent,Year
110268,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0.007035,0.002509,0.000518,0.666667
119879,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0.022288,0.015854,0.00742,0.833333
103694,0,0,0,1,0,0,0,1,0,0,...,0,1,1,0,0,0,0.004877,0.005247,0.000452,0.666667
131932,0,0,0,0,0,0,1,0,1,0,...,0,1,1,0,0,0,0.000411,0.000661,0.000401,1.0
121958,0,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0.065178,0.015918,0.006394,0.833333
