In [7]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

data = pd.read_csv('https://raw.githubusercontent.com/kavishkadinajara/TRAVELGENIUS/master/ML_model/travel_data.csv')
data

Unnamed: 0,Number_of_Travelers,Budget,Area_of_Interest,Preferred_Climate,Transportation_Mode,Recommendation
0,6,35000,Beach,Warm,Bikes,"Marble Beach, Trincomalee"
1,4,80000,Forest,Tropical,Public Transport,"Sinharaja Forest Reserve, Deniyaya"
2,2,70000,City,Moderate,Van,"Kurunegala City, Kurunegala"
3,6,17000,Beach,Warm,Bikes,"Jungle Beach, Unawatuna"
4,25,55000,Forest,Tropical,Public Transport,"Hiyare Reservoir, Galle"
...,...,...,...,...,...,...
5744,12,25000,Forest,Tropical,Bikes,"Hiyare Reservoir, Galle"
5745,4,48000,Desert,Hot,Rosa Bus,"Delft Island, Jaffna"
5746,1,15000,National Park,Warm,Rosa Bus,"Kaudulla National Park, Polonnaruwa"
5747,20,90000,National Park,Warm,Bikes,"Wilpattu National Park, Wilpattu"


In [10]:
data.shape

(5749, 6)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5749 entries, 0 to 5748
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Number_of_Travelers  5749 non-null   int64 
 1   Budget               5749 non-null   int64 
 2   Area_of_Interest     5749 non-null   object
 3   Preferred_Climate    5749 non-null   object
 4   Transportation_Mode  5749 non-null   object
 5   Recommendation       5749 non-null   object
dtypes: int64(2), object(4)
memory usage: 269.6+ KB


In [12]:
data["Recommendation"].value_counts()

Recommendation
Delft Island, Jaffna                         755
Kaudulla National Park, Polonnaruwa          101
Kurunegala City, Kurunegala                   99
Galway�s Land National Park, Nuwara Eliya     98
Hikkaduwa National Park, Hikkaduwa            96
                                            ... 
Nine Arches Bridge, Ella                      29
Uppuveli Beach, Trincomalee                   28
Unawatuna Beach, Galle                        25
Trincomalee Beach, Trincomalee                21
Nilaveli Beach, Trincomalee                   19
Name: count, Length: 89, dtype: int64

Encode Categorical Variables

In [13]:
data['Area_of_Interest']

0               Beach
1              Forest
2                City
3               Beach
4              Forest
            ...      
5744           Forest
5745           Desert
5746    National Park
5747    National Park
5748           Forest
Name: Area_of_Interest, Length: 5749, dtype: object

In [14]:
from sklearn.preprocessing import LabelEncoder

# Dictionary to hold label binarizers for each specified column
label_encoders = {}

# List of columns to be label binarized
columns_to_binarize = ['Area_of_Interest', 'Preferred_Climate', 'Transportation_Mode', 'Recommendation']

# Apply LabelBinarizer to each specified column
for column in columns_to_binarize:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [15]:
# Handle the result_category column separately
result_category = data['Recommendation']
result_binarizer = LabelEncoder()
result = result_binarizer.fit_transform(result_category)

In [16]:
result_category

0       52
1       74
2       45
3       34
4       29
        ..
5744    29
5745    14
5746    39
5747    87
5748    29
Name: Recommendation, Length: 5749, dtype: int64

In [17]:
label_encoders

{'Area_of_Interest': LabelEncoder(),
 'Preferred_Climate': LabelEncoder(),
 'Transportation_Mode': LabelEncoder(),
 'Recommendation': LabelEncoder()}

In [18]:
# Output the classes for the result_category
print(result_binarizer.classes_)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88]


## Split the **Dataset**

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x = data.drop(columns=['Recommendation'])
y = data['Recommendation']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [21]:
x_train.head()

Unnamed: 0,Number_of_Travelers,Budget,Area_of_Interest,Preferred_Climate,Transportation_Mode
3614,19,130000,3,3,0
4153,9,30000,6,0,0
3866,3,45000,1,2,2
2277,5,170000,4,2,4
677,10,13000,3,3,4


In [22]:
x_test.shape

(1150, 5)

## Normalization the Features


In [23]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4
0,0.750000,0.648064,0.428571,0.75,0.00
1,0.333333,0.145299,0.857143,0.00,0.00
2,0.083333,0.220714,0.142857,0.50,0.50
3,0.166667,0.849170,0.571429,0.50,1.00
4,0.375000,0.059829,0.428571,0.75,1.00
...,...,...,...,...,...
4594,0.791667,0.547511,0.571429,0.50,0.50
4595,0.083333,0.597788,1.000000,1.00,0.25
4596,0.791667,0.185520,0.714286,0.50,0.50
4597,0.375000,0.346405,0.285714,0.25,0.75


## Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=50)
model_rf.fit(x_train, y_train)

In [25]:
predict_rf = model_rf.predict(x_test)
predict_rf

array([ 0, 70, 14, ..., 11, 19, 45])

In [26]:
y_test

4034     6
5241    70
2201    14
4211    14
2671    31
        ..
2366    11
2323    14
1831    54
5655    74
2795    45
Name: Recommendation, Length: 1150, dtype: int64

In [27]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_rf = accuracy_score(y_test, predict_rf)
report_rf = classification_report(y_test, predict_rf)

print(f'Accuracy: {accuracy_rf}')
print(f'Classification Report:\n{report_rf}')


Accuracy: 0.2843478260869565
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.00      0.00      0.00        12
           2       0.23      0.23      0.23        13
           3       0.00      0.00      0.00         8
           4       0.25      0.31      0.28        13
           5       0.06      0.06      0.06        18
           6       0.25      0.09      0.13        11
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00         7
           9       0.00      0.00      0.00        10
          10       0.12      0.17      0.14        12
          11       0.22      0.28      0.24        18
          12       0.20      0.33      0.25         9
          13       0.00      0.00      0.00        10
          14       1.00      1.00      1.00       151
          15       0.00      0.00      0.00         5
          16       0.00      

In [37]:
param_grid = {
    "n_neighbors": [5, 10, 15, 20, 25, 30],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 20, 30, 40, 50],
    "p": [1, 2],
    "metric": ["euclidean", "manhattan", "minkowski"],
    "n_jobs": [None, -1]
}




In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [40]:
model_knn = KNeighborsClassifier()

In [41]:
grid_search = GridSearchCV(estimator=model_knn, param_grid=param_grid)
grid_search.fit(x_train, y_train)


## Decision_Tree

In [9]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X, y = iris.data, iris.target


x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_knn = KNeighborsClassifier()
model_knn.fit(x_train, y_train)


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_dt = DecisionTreeClassifier()
model_dt.fit(x_train, y_train)

predict_dt = model_dt.predict(x_test)
print(predict_dt)


[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [12]:
y_test

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [13]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_dt = accuracy_score(y_test, predict_dt)
report_dt = classification_report(y_test, predict_dt)

print(f'Accuracy: {accuracy_dt}')
print(f'Classification Report:\n{report_dt}')

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

