In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load the dataset
data = pd.read_csv('dataset_NYC.txt', sep='\t', header=None, encoding='ISO-8859-1')
data.columns = ['User_ID', 'Venue_ID', 'Venue_Category_ID', 'Venue_Category_Name', 'Latitude', 'Longitude', 'Timezone_Offset', 'UTC_Time']
print(data.head())

   User_ID                  Venue_ID         Venue_Category_ID  \
0      470  49bbd6c0f964a520f4531fe3  4bf58dd8d48988d127951735   
1      979  4a43c0aef964a520c6a61fe3  4bf58dd8d48988d1df941735   
2       69  4c5cc7b485a1e21e00d35711  4bf58dd8d48988d103941735   
3      395  4bc7086715a7ef3bef9878da  4bf58dd8d48988d104941735   
4       87  4cf2c5321d18a143951b5cec  4bf58dd8d48988d1cb941735   

   Venue_Category_Name   Latitude  Longitude  Timezone_Offset  \
0  Arts & Crafts Store  40.719810 -74.002581             -240   
1               Bridge  40.606800 -74.044170             -240   
2       Home (private)  40.716162 -73.883070             -240   
3       Medical Center  40.745164 -73.982519             -240   
4           Food Truck  40.740104 -73.989658             -240   

                         UTC_Time  
0  Tue Apr 03 18:00:09 +0000 2012  
1  Tue Apr 03 18:00:25 +0000 2012  
2  Tue Apr 03 18:02:24 +0000 2012  
3  Tue Apr 03 18:02:41 +0000 2012  
4  Tue Apr 03 18:03:00 +0000 201

In [38]:
# Step 2: Preprocess the dataset
# Apply one-hot encoding for venue categories
venue_categories = pd.get_dummies(data['Venue_Category_Name'])
data = pd.concat([data, venue_categories], axis=1)

# Step 3: Function to recommend venues
def recommend_venues(user_id, category_name, top_n=5):
    # Filter data for the given user
    user_data = data[data['User_ID'] == user_id]

    # Summarize user preferences for each category
    user_profile = user_data[venue_categories.columns].sum()

    # Filter venues of the specified category
    category_data = data[data['Venue_Category_Name'] == category_name]

    # Exclude venues already visited by the user
    visited_venues = user_data['Venue_ID'].unique()
    unvisited_venues = category_data[~category_data['Venue_ID'].isin(visited_venues)]

    # Compute similarity between user profile and unvisited venues
    category_vectors = unvisited_venues[venue_categories.columns].values
    user_vector = user_profile.values.reshape(1, -1)
    similarities = cosine_similarity(user_vector, category_vectors)[0]

    # Add similarity scores to unvisited venues
    unvisited_venues = unvisited_venues.copy()
    unvisited_venues['Similarity'] = similarities


    # Sort venues by similarity and return top N
    recommendations = unvisited_venues.sort_values(by='Similarity', ascending=False).head(top_n)
    return recommendations[['Venue_ID', 'Venue_Category_Name', 'Latitude', 'Longitude', 'Similarity']]

In [29]:
# Step 4: Example usage
user_id = '293'  # Replace with a valid User_ID from the dataset
category_name = 'Bar'  # Replace with a valid category
recommendations = recommend_venues(user_id, category_name)
print(recommendations)

                        Venue_ID Venue_Category_Name   Latitude  Longitude  \
149     4f318c84e5e8657f88d830ac                 Bar  40.725730 -73.792527   
159676  3fd66200f964a520aae81ee3                 Bar  40.760753 -73.990799   
159624  43a46529f964a520492c1fe3                 Bar  40.732030 -73.985497   
159625  3fd66200f964a52060e81ee3                 Bar  40.798841 -73.963061   
159629  406dfe80f964a52048f21ee3                 Bar  40.739977 -73.993325   

        Similarity  
149            0.0  
159676         0.0  
159624         0.0  
159625         0.0  
159629         0.0  


In [30]:
user_data = data[data['User_ID'] == '293']  # Replace with your User_ID
print(user_data)
print("Number of venues visited by user:", len(user_data))
user_categories = user_data['Venue_Category_Name'].unique()
print("Categories visited by user:", user_categories)

Empty DataFrame
Columns: [User_ID, Venue_ID, Venue_Category_ID, Venue_Category_Name, Latitude, Longitude, Timezone_Offset, UTC_Time, Afghan Restaurant, African Restaurant, Airport, American Restaurant, Animal Shelter, Antique Shop, Aquarium, Arcade, Arepa Restaurant, Argentinian Restaurant, Art Gallery, Art Museum, Arts & Crafts Store, Arts & Entertainment, Asian Restaurant, Athletic & Sport, Australian Restaurant, Automotive Shop, BBQ Joint, Bagel Shop, Bakery, Bank, Bar, Beach, Beer Garden, Bike Rental / Bike Share, Bike Shop, Board Shop, Bookstore, Bowling Alley, Brazilian Restaurant, Breakfast Spot, Brewery, Bridal Shop, Bridge, Building, Burger Joint, Burrito Place, Bus Station, Café, Cajun / Creole Restaurant, Camera Store, Campground, Candy Store, Car Dealership, Car Wash, Caribbean Restaurant, Casino, Castle, Cemetery, Chinese Restaurant, Church, City, Clothing Store, Coffee Shop, College & University, College Academic Building, College Stadium, College Theater, Comedy Club, Co

In [31]:
user_activity = data['User_ID'].value_counts()
print("Total users in dataset:", len(user_activity))
print("Top active users:\n", user_activity.head(10))

Total users in dataset: 1083
Top active users:
 293    2697
185    2079
354    2061
315    1682
84     1376
349    1369
384    1116
974    1107
768    1096
445     952
Name: User_ID, dtype: int64


In [32]:
# Unique categories in the dataset
print("Unique categories:", data['Venue_Category_Name'].unique())
print("Count of 'Airport' category:", (data['Venue_Category_Name'] == 'Airport').sum())

Unique categories: ['Arts & Crafts Store' 'Bridge' 'Home (private)' 'Medical Center'
 'Food Truck' 'Food & Drink Shop' 'Coffee Shop' 'Bus Station' 'Bank'
 'Gastropub' 'Electronics Store' 'Mobile Phone Shop' 'Café'
 'Automotive Shop' 'Restaurant' 'American Restaurant'
 'Government Building' 'Airport' 'Ferry' 'Office' 'Other Great Outdoors'
 'Building' 'Mexican Restaurant' 'Music Venue' 'Subway' 'Student Center'
 'Park' 'Road' 'Burger Joint' 'Sporting Goods Shop' 'Pizza Place'
 'Jewelry Store' 'Sandwich Place' 'Clothing Store' 'Neighborhood'
 'Ice Cream Shop' 'Soup Place' 'College Academic Building'
 'Department Store' 'Playground' 'Tattoo Parlor' 'Mall' 'Deli / Bodega'
 'University' 'Diner' 'Music Store' 'Light Rail' 'Salon / Barbershop'
 'General College & University' 'Animal Shelter' 'Laundry Service'
 'Residential Building (Apartment / Condo)' 'Drugstore / Pharmacy'
 'Cuban Restaurant' 'BBQ Joint' 'Other Nightlife' 'Gym / Fitness Center'
 'Italian Restaurant' 'Stadium' 'Church' 'Trai

In [33]:
# Most active users and popular categories
top_user = user_activity.index[0]  # Pick the most active user
top_category = data['Venue_Category_Name'].value_counts().index[0]  # Pick the most common category
print("Selected User_ID:", top_user)
print("Selected Category:", top_category)

Selected User_ID: 293
Selected Category: Bar


In [34]:
recommendations = recommend_venues(top_user, top_category)
print(recommendations)

                        Venue_ID Venue_Category_Name   Latitude  Longitude  \
149     4f318c84e5e8657f88d830ac                 Bar  40.725730 -73.792527   
159679  41c0d000f964a520891e1fe3                 Bar  40.726963 -73.983750   
159636  4b99d9d0f964a520709435e3                 Bar  40.759096 -73.986461   
159639  4c9bb662e9a7ef3bd0ed2e16                 Bar  40.764213 -73.987017   
159642  4b39872ef964a5207d5d25e3                 Bar  40.691084 -73.997174   

        Similarity  
149       0.005525  
159679    0.005525  
159636    0.005525  
159639    0.005525  
159642    0.005525  


In [49]:
# Filter data for the given user
user_data = data[data['User_ID'] == top_user]

# Summarize user preferences for each category
user_profile = user_data[venue_categories.columns].sum()

# Filter venues of the specified category
category_data = data[data['Venue_Category_Name'] == top_category]

    # Exclude venues already visited by the user
visited_venues = user_data['Venue_ID'].unique()
unvisited_venues = category_data[~category_data['Venue_ID'].isin(visited_venues)]

    # Compute similarity between user profile and unvisited venues
category_vectors = unvisited_venues[venue_categories.columns].values
user_vector = user_profile.values.reshape(1, -1)
similarities = cosine_similarity(user_vector, category_vectors)[0]

    # Add similarity scores to unvisited venues
unvisited_venues = unvisited_venues.copy()
unvisited_venues['Similarity'] = similarities

In [50]:
from sklearn.preprocessing import normalize

# Normalize one-hot encoded data
normalized_categories = normalize(venue_categories.values, axis=1)
venue_categories = pd.DataFrame(normalized_categories, columns=venue_categories.columns)

In [51]:
user_profile = user_data[venue_categories.columns].sum()
category_vectors = unvisited_venues[venue_categories.columns].values

In [52]:
print("User profile variance:", user_profile.var())
print("Venue vectors variance:", category_vectors.var(axis=0).sum())

User profile variance: 1975.2194731711402
Venue vectors variance: 0.0
