# Travel Destination Recommendation System
By Kartik Joshi

- Version 1: Recommendations based on Previously Visited Destination
- Version 2: Recommendations based on Free Text-based Query from the User

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Earlier dataset file, kept for reference:
#df = pd.read_csv('final_dataset_wo_duplicates.csv')
# Load the destinations table; no index column is given, so rows get the
# default 0..n-1 index that the lookup dictionaries below rely on.
df = pd.read_csv('travel_destinations.csv')
df

# Create index_destination_dict

In [None]:
# Row position -> city name, used to turn ranked row indices back into names.
index_destination_dict = {
    position: city for position, city in enumerate(df['City'])
}
index_destination_dict

# Create destination_index_dict

In [None]:
# City name -> row position, used to find the similarity row of a destination.
# If a city name occurs more than once, the last occurrence wins.
destination_index_dict = {
    city: position for position, city in enumerate(df['City'])
}
destination_index_dict

In [None]:
df.columns

In [None]:
# Drop the identifier, metadata and tag columns in place. 'description' is not
# in the list and is therefore kept; it is the column the preprocessing loop
# below reads. The City lookups were already built above, so losing 'City'
# here does not affect them.
df.drop(['City', 'Tags', 'State', 'Old_age', 'Young_age', 'link 1',
       'Avg Expense Per Day', 'historical & heritage', 'city', 'pilgrimage',
       'hill station', 'beach', 'lake & backwater', 'adventure / trekking',
       'wildlife', 'waterfall', 'nature & scenic', 'keys'], axis = 1, inplace = True)
df

# Text Preprocessing

In [12]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

In [None]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension re-reads the word list and scans it linearly for every token.
english_stopwords = set(stopwords.words('english'))

corpus = []
for i, description in enumerate(df['description']):
    # Remove hyperlinks
    text = re.sub(r"http\S+", ' ', description)

    # Replace punctuation marks and special symbols with spaces
    text = re.sub('[^a-zA-Z0-9]', ' ', text)

    # Lowercase, then tokenize on whitespace
    word_list = text.lower().split()

    # Remove stop words and lemmatize the remaining tokens as verbs
    word_list = [
        wl.lemmatize(word, pos='v')
        for word in word_list
        if word not in english_stopwords
    ]
    corpus.append(' '.join(word_list))

    # Progress indicator: index of the row just processed
    print(i, end=' ')

In [None]:
corpus[0]

In [None]:
# Store the cleaned description next to the raw one; corpus[i] was built from
# row i, so the list lines up with the DataFrame rows.
df['Processed Text'] = corpus
df

# Text to Vector Conversion

In [None]:
# - Count Vectorizer
# - TFIDF Vectorizer
# - Hashing Vectorizer
# - Glove
# - FastText
# - Word2Vec
# - BERT
# ...
# ...
# ...

# Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words term counts with default settings.
cv = CountVectorizer()
# Learn the vocabulary from the cleaned descriptions and vectorize them.
vectors_cv = cv.fit_transform(corpus)
# Densify and split into one array per row so each destination carries its
# own vector in the DataFrame.
df['vectors_cv'] = list(vectors_cv.toarray())
df

# TFIDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighted term vectors with default settings.
tv = TfidfVectorizer()
# Learn vocabulary and IDF weights from the cleaned descriptions.
vectors_tv = tv.fit_transform(corpus)
# Densify and split into one array per row for storage in the DataFrame.
df['vectors_tv'] = list(vectors_tv.toarray())
df

# Hashing Vectorizer

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
# HashingVectorizer defaults to n_features=2**20. Densifying that with
# .toarray() allocates len(corpus) x 1,048,576 float64 values (about 8 MB per
# row), so the hash space is bounded explicitly before storing dense rows.
hv = HashingVectorizer(n_features=2**12)
# The vectorizer is stateless; fit_transform only hashes the tokens.
vectors_hv = hv.fit_transform(corpus)
# One dense array per row, stored alongside the other vector columns.
df['vectors_hv'] = list(vectors_hv.toarray())
df

# Version 1: Recommendations based on Recently Visited Destination

In [None]:
# The name is used as a key into destination_index_dict in the next cell, so
# it must match a 'City' value exactly (including case).
destination = input('Enter the recently visited destination: ')
# int() raises ValueError if the reply is not a whole number.
number_of_recommendations = int(input('How many recommendations do you want? '))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all count vectors.
similarity_matrix = cosine_similarity(vectors_cv.toarray())

# Similarity of every destination to the one the user entered.
idx = destination_index_dict[destination]
scores_for_destination = similarity_matrix[idx]

# Rank (score, row position) pairs from most to least similar.
ranked = sorted(
    ((score, position) for position, score in enumerate(scores_for_destination)),
    reverse=True,
)

# Translate positions back to names and leave out the queried destination.
recommendations = [index_destination_dict[position] for _, position in ranked]
recommendations.remove(destination)
recommendations[:number_of_recommendations]

In [None]:
# Reload the raw dataset before rebuilding the lookup. The text-similarity
# section above dropped 'City' and the tag columns from df in place, so reading
# df['City'] here would raise KeyError when the notebook runs top to bottom.
# The tag-based section that follows needs the original columns.
df = pd.read_csv('travel_destinations.csv')

# Row position -> city name.
index_destination_dict = dict(enumerate(df['City']))
index_destination_dict

In [None]:
# City name -> row position; for a repeated city name the last row wins.
destination_index_dict = {
    city: position for position, city in enumerate(df['City'])
}
destination_index_dict

In [None]:
df

In [None]:
df.columns

In [None]:
# Drop the identifier, metadata and free-text columns in place. Whatever
# remains is converted to a matrix in the next cell (presumably the per-tag
# indicator columns such as 'beach' or 'hill station'; confirm against the CSV).
df.drop(['City', 'Tags', 'State', 'Old_age', 'Young_age', 'link 1',
       'Avg Expense Per Day','keys', 'description'], axis = 1, inplace = True)
df

In [None]:
# Remaining columns as a 2-D array: one row per destination, one column per
# surviving DataFrame column.
destination_tags = df.to_numpy()
destination_tags

In [None]:
destination_tags.shape

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# With the second argument omitted the kernel compares the rows of
# destination_tags against themselves, giving a square destination-by-
# destination similarity matrix.
correlationMatrix = sigmoid_kernel(destination_tags)
print(correlationMatrix)

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# correlationMatrix = cosine_similarity(destination_tags, destination_tags)
# print(correlationMatrix)

In [None]:
correlationMatrix.shape

# Recommendation based on Recently Visited Destination

In [14]:
def getRecommendations(destination_name):
    """Rank every destination by similarity to ``destination_name``.

    Looks up the row of the module-level ``correlationMatrix`` that belongs to
    ``destination_name`` (via ``destination_index_dict``) and returns all
    ``(similarity, row_position)`` pairs sorted from highest to lowest. The
    queried destination itself is part of the result.

    Raises:
        KeyError: if ``destination_name`` is not in ``destination_index_dict``.
    """
    row = correlationMatrix[destination_index_dict[destination_name]]
    scored = [(score, position) for position, score in enumerate(row)]
    scored.sort(reverse=True)
    return scored

In [None]:
# Ten highest-scoring rows for 'Shimla', printed by name.
recommendations_list = getRecommendations('Shimla')
for _, position in recommendations_list[:10]:
    print(index_destination_dict[position])

# Recommendation based on free text-based query

In [None]:
# Start the free-text section from a fresh copy of the raw dataset; earlier
# cells removed columns from df in place.
df = pd.read_csv('travel_destinations.csv')
df

In [None]:
df['description']

# Create corpus of processed text

In [20]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

In [None]:
# Sanity check: how many English stop words NLTK provides.
# (stopwords was already imported in the previous cell.)
from nltk.corpus import stopwords
len(stopwords.words('english'))

In [None]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension re-reads the word list and scans it linearly for every token.
english_stopwords = set(stopwords.words('english'))

corpus = []
for i, description in enumerate(df['description']):
    # Remove hyperlinks
    text = re.sub(r"http\S+", ' ', description)

    # Replace punctuation marks and special symbols with spaces
    text = re.sub('[^a-zA-Z0-9]', ' ', text)

    # Lowercase, then split on whitespace into tokens
    tokens = text.lower().split()

    # Lemmatization is preferred over stemming here; stop words are skipped
    tokens = [
        wl.lemmatize(token, pos='v')
        for token in tokens
        if token not in english_stopwords
    ]
    corpus.append(' '.join(tokens))

    # Progress indicator: index of the row just processed
    print(i, end=' ')

In [None]:
df.columns

In [None]:
# Drop metadata and tag columns in place. 'City' and 'description' are not in
# the list and are therefore kept.
df.drop(['Tags', 'State', 'Old_age', 'Young_age', 'link 1',
       'Avg Expense Per Day', 'historical & heritage', 'city', 'pilgrimage',
       'hill station', 'beach', 'lake & backwater', 'adventure / trekking',
       'wildlife', 'waterfall', 'nature & scenic', 'keys'], axis = 1, inplace =True)
df

In [24]:
# Attach the cleaned descriptions (corpus[i] was built from row i) ...
df['processed_text'] = corpus
# ... and persist the result without writing the row index as a column.
df.to_csv('destinations_with_processed_text.csv', index = False)

In [None]:
df

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words term counts with default settings.
cv = CountVectorizer()
# Learn the vocabulary from the cleaned descriptions; X is a sparse matrix
# with one row per description.
X = cv.fit_transform(corpus)

In [None]:
X

In [None]:
# Dense copy of the count matrix; used below as input to sigmoid_kernel.
vectors = X.toarray()
vectors

In [None]:
# list() splits the 2-D array into one 1-D array per row so each destination
# stores its own count vector.
df['vectors_cv'] = list(vectors)
df

In [32]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tv = TfidfVectorizer()
# X_tv = tv.fit_transform(corpus)

In [33]:
# X_tv.toarray()

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# With the second argument omitted the kernel compares the description vectors
# against themselves, giving a square destination-by-destination matrix.
correlationMatrix = sigmoid_kernel(vectors)
print(correlationMatrix)

In [35]:
def getRecommendations(destination_name):
    """Return all ``(similarity, row_position)`` pairs for a destination, best first.

    The similarity row is read from the module-level ``correlationMatrix``
    using the position stored in ``destination_index_dict``. Ties on the score
    are ordered by descending row position. The queried destination itself is
    included in the result.

    Raises:
        KeyError: if ``destination_name`` is not in ``destination_index_dict``.
    """
    scores = correlationMatrix[destination_index_dict[destination_name]]
    return sorted(zip(scores, range(len(scores))), reverse=True)

In [None]:
# Ten highest-scoring rows for 'Dachigam National Park', printed by name.
recommendations_list = getRecommendations('Dachigam National Park')
for _, position in recommendations_list[:10]:
    print(index_destination_dict[position])

In [None]:
# Ten highest-scoring rows for 'Araku Valley', printed by name.
top_matches = getRecommendations('Araku Valley')[:10]
recommendations_list = getRecommendations('Araku Valley')
for _, position in top_matches:
    print(index_destination_dict[position])

# Input a free text based query from the user

In [None]:
# input() already returns a str, so no conversion is needed.
query = input('Enter a free text to get relevant destination recommendations: ')

In [39]:
# Clean the query with the same steps used for the destination descriptions.

# Build the stop-word set once instead of reloading the list for every token.
english_stopwords = set(stopwords.words('english'))

# Remove hyperlinks
processed_query = re.sub(r"http\S+", ' ', query)

# Replace punctuation marks and special symbols with spaces
processed_query = re.sub('[^a-zA-Z0-9]', ' ', processed_query)

# Lowercase, then split on whitespace into tokens
query_tokens = processed_query.lower().split()

# Lemmatization is preferred over stemming; stop words are skipped
processed_query = ' '.join(
    wl.lemmatize(token, pos='v')
    for token in query_tokens
    if token not in english_stopwords
)

In [None]:
processed_query

In [41]:
# corpus

In [None]:
# Copy of the description corpus with the cleaned query appended as the last
# document; the original corpus list is left untouched.
new_corpus = [*corpus, processed_query]
new_corpus[0]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

# Refit on descriptions + query so both share one vocabulary.
cv = CountVectorizer()
new_X = cv.fit_transform(new_corpus)
new_vectors = new_X.toarray()

# Square similarity matrix over all documents; the last row belongs to the
# query because it was appended last.
new_correlationMatrix = sigmoid_kernel(new_vectors)
print(new_correlationMatrix)

In [None]:
# Last row = similarities of the query to every document; only the first
# len(df) entries (the destinations) are ranked.
query_row = new_correlationMatrix[-1]
list_of_tuples = [(query_row[position], position) for position in range(len(df))]
sorted_list_of_tuples = sorted(list_of_tuples, reverse=True)

# Print the ten best-matching destination names.
for _, position in sorted_list_of_tuples[:10]:
    print(index_destination_dict[position])

In [45]:
def getRecommendations_based_on_free_text_based_query(query, num):
    """Return up to ``num`` destination names ranked by sigmoid-kernel similarity to ``query``.

    The query is cleaned with the same steps as the destination descriptions
    (hyperlinks removed, non-alphanumerics replaced by spaces, lowercased,
    stop words removed, tokens lemmatized as verbs). It is then appended to a
    copy of the module-level ``corpus`` and count-vectorized together with it
    so that query and descriptions share one vocabulary.

    Args:
        query: Free-text description of what the user is looking for.
        num: Maximum number of destination names to return.

    Returns:
        Destination names (looked up in ``index_destination_dict``), most
        similar first. Ties on the score are ordered by descending row
        position.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import sigmoid_kernel

    # Build the stop-word set once instead of reloading the list per token.
    english_stopwords = set(stopwords.words('english'))

    # Remove hyperlinks, then replace punctuation / special symbols by spaces.
    cleaned = re.sub(r"http\S+", ' ', query)
    cleaned = re.sub('[^a-zA-Z0-9]', ' ', cleaned)

    # Lowercase, tokenize, drop stop words, lemmatize.
    processed_query = ' '.join(
        wl.lemmatize(token, pos='v')
        for token in cleaned.lower().split()
        if token not in english_stopwords
    )

    # Query goes last so it is always row -1 of the vector matrix.
    new_corpus = [*corpus, processed_query]
    new_vectors = CountVectorizer().fit_transform(new_corpus).toarray()

    # Only the query's similarities are needed, so compare the query row with
    # the destination rows instead of building the full square matrix. The
    # default gamma depends only on the number of features, which is
    # unchanged, so the scores equal the last row of the full matrix.
    query_scores = sigmoid_kernel(new_vectors[-1:], new_vectors[:-1])[0]

    ranked = sorted(
        ((score, position) for position, score in enumerate(query_scores)),
        reverse=True,
    )
    return [index_destination_dict[position] for _, position in ranked[:num]]

In [None]:
# input() already returns a str, so no conversion is needed.
query = input('Enter a free text to get relevant destination recommendations: ')
# Ask for the ten best matches and print one name per line.
final_recommendation_list = getRecommendations_based_on_free_text_based_query(query, 10)
for destination in final_recommendation_list:
    print(destination)

In [47]:
def getRecommendations_based_on_free_text_based_query(query, num):
    """Return up to ``num`` destination names ranked by cosine similarity to ``query``.

    The query is cleaned with the same steps as the destination descriptions
    (hyperlinks removed, non-alphanumerics replaced by spaces, lowercased,
    stop words removed, tokens lemmatized as verbs). It is then appended to a
    copy of the module-level ``corpus`` and count-vectorized together with it
    so that query and descriptions share one vocabulary.

    A vector with zero norm (for example a query consisting only of stop
    words, or an empty description) has no defined cosine similarity. Such
    pairs are scored 0.0 instead of NaN so the ranking stays well defined.

    Args:
        query: Free-text description of what the user is looking for.
        num: Maximum number of destination names to return.

    Returns:
        Destination names (looked up in ``index_destination_dict``), most
        similar first. Ties on the score are ordered by descending row
        position.
    """
    from numpy.linalg import norm
    from sklearn.feature_extraction.text import CountVectorizer

    # Build the stop-word set once instead of reloading the list per token.
    english_stopwords = set(stopwords.words('english'))

    # Remove hyperlinks, then replace punctuation / special symbols by spaces.
    cleaned = re.sub(r"http\S+", ' ', query)
    cleaned = re.sub('[^a-zA-Z0-9]', ' ', cleaned)

    # Lowercase, tokenize, drop stop words, lemmatize.
    processed_query = ' '.join(
        wl.lemmatize(token, pos='v')
        for token in cleaned.lower().split()
        if token not in english_stopwords
    )

    # Query goes last so it is always row -1 of the vector matrix.
    new_corpus = [*corpus, processed_query]
    new_vectors = CountVectorizer().fit_transform(new_corpus).toarray()

    destination_vectors = new_vectors[:-1]
    query_vector = new_vectors[-1]

    # cos(a, b) = a.b / (|a| |b|), computed for all destinations at once.
    dot_products = destination_vectors @ query_vector
    denominators = norm(destination_vectors, axis=1) * norm(query_vector)

    # Divide only where the denominator is non-zero; other entries keep the
    # 0.0 they were initialised with, avoiding 0/0 = NaN.
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros(len(denominators)),
        where=denominators != 0,
    )

    ranked = sorted(
        ((score, position) for position, score in enumerate(similarities)),
        reverse=True,
    )
    return [index_destination_dict[position] for _, position in ranked[:num]]

In [None]:
# input() already returns a str, so no conversion is needed.
query = input('Enter a free text to get relevant destination recommendations: ')
# Ask for the ten best matches and print one name per line.
final_recommendation_list = getRecommendations_based_on_free_text_based_query(query, 10)
for destination in final_recommendation_list:
    print(destination)

# Travel Destination Recommendation System
By Kartik Joshi

- Version 1: Recommendations based on Previously Visited Destination
- Version 2: Recommendations based on Free Text-based Query from the User