In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer


In [3]:
# Load the data
city = pd.read_csv('cities.csv')

# Inspect the first few rows
print(city.head())

# Handle missing values: drop the 'Best_time_to_visit' column, then every row
# that still contains a NaN. The index is reset afterwards so that each row's
# label equals its position; arrays derived from this frame (the count vectors
# and the similarity matrix) are addressed by position, and a gap left behind
# by dropna() would make labels and positions disagree.
city = (
    city
    .drop(columns=['Best_time_to_visit'])
    .dropna()
    .reset_index(drop=True)
)

# Display information about the dataset.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
city.info()


          City  Ratings Ideal_duration  Best_time_to_visit  \
0       Manali      4.5            2-4        October-June   
1   Leh Ladakh      4.6            5-7         JulyOctober   
2        Coorg      4.2            2-3      September-June   
3      Andaman      4.5            4-6       October-March   
4  Lakshadweep      4.0            4-6  September-February   

                                           City_desc  
0  [' One of the most popular hill stations in Hi...  
1  [" Ladakh is a union territory in the Kashmir ...  
2  [' Located amidst imposing mountains in Karnat...  
3  [' Replete with turquoise blue water beaches a...  
4  [" Formerly known as Laccadive Islands, Laksha...  
<class 'pandas.core.frame.DataFrame'>
Index: 99 entries, 0 to 99
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   City            99 non-null     object 
 1   Ratings         99 non-null     float64
 2   Ideal_duration 

In [4]:
# Preprocessing: convert 'City_desc' to lowercase before stemming.
# The vectorized .str accessor replaces a per-element Python lambda.
city['City_desc'] = city['City_desc'].str.lower()

# Initialize the Porter stemmer used by stem() below
ps = PorterStemmer()

def stem(text):
    """
    Return `text` with each whitespace-separated token replaced by its stem.

    Tokens come from `str.split()` (any run of whitespace is a separator),
    each one is passed through the module-level stemmer `ps`, and the results
    are re-joined with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

# Overwrite 'City_desc' with the stemmed version of every description
city['City_desc'] = city['City_desc'].map(stem)


In [5]:
# Bag-of-words model: keep at most 5000 terms and ignore English stop words
cv = CountVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary from the city descriptions and count term occurrences,
# then densify the sparse result into a regular NumPy array
vectors = cv.fit_transform(city['City_desc'])
vectors = vectors.toarray()

# Show (number of cities, vocabulary size)
print(vectors.shape)


(99, 3499)


In [6]:
# Pairwise cosine similarity of every city vector against every other one
similarity = cosine_similarity(X=vectors)

# Show the dimensions of the resulting square matrix
print(similarity.shape)


(99, 99)


In [7]:
# Function to recommend cities based on similarity
def recommend(city_name, top_n=5):
    """
    Print the cities whose descriptions are most similar to `city_name`.

    Parameters
    ----------
    city_name : str
        Name of a city in the 'City' column of `city`; matched
        case-insensitively. If several rows match, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Returns
    -------
    None
        Recommendations are printed one per line, most similar first. If the
        city is not present, a "not found" message is printed instead.
    """
    # Convert the input city name to lowercase for case-insensitive matching
    city_name = city_name.lower()

    # `similarity` is a plain array addressed by row *position*, whereas
    # `city.index` holds *labels* that may have gaps (e.g. after dropna()).
    # Look the city up by position so the two can never disagree.
    is_match = (city['City'].str.lower() == city_name).to_numpy()
    matches = np.flatnonzero(is_match)
    if matches.size == 0:
        # Handle case if the city is not found
        print(f"The city '{city_name}' is not found in the dataset.")
        return
    city_pos = int(matches[0])

    # Similarity scores of the input city against every city
    distances = similarity[city_pos]

    # Rank all cities by descending similarity, then drop the input city
    # explicitly instead of assuming it always sorts first (ties can break
    # that assumption).
    ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])
    city_list = [pos for pos, _ in ranked if pos != city_pos][:top_n]

    # Display the recommended cities
    for pos in city_list:
        print(f"{city['City'].iloc[pos]} ")


In [16]:

# Example: print the cities whose descriptions are most similar to Goa
# (the lookup is case-insensitive, so lowercase 'goa' is fine)
recommend('goa')


Alibaug 
Kovalam 
Chennai 
Visakhapatnam 
Digha 
