In [88]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import random
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic
import folium
import numpy as np

# 1. Data Preparation

## 1.1 Input Data

In [89]:
# Load dataset directly from the zip file
data = pd.read_csv(
    '../data/dataset_NYC.zip',  # Path to the zip file
    sep='\t',
    header=None,
    encoding='ISO-8859-1',
    names=['User_ID', 'Venue_ID', 'Venue_Category_ID', 'Category_Name',
           'Latitude', 'Longitude', 'Timezone_Offset', 'UTC_Time'],
    dtype={
        'User_ID': 'str',
        'Venue_ID': 'str',
        'Venue_Category_ID': 'str',
        'Category_Name': 'category',
        'Latitude': 'float32',
        'Longitude': 'float32',
        'Timezone_Offset': 'int16',
        'UTC_Time': 'str'
    },
    compression='zip'  # This tells pandas to handle the zip automatically
)

# Print the first few rows of the dataset
print(data.head())

  User_ID                  Venue_ID         Venue_Category_ID  \
0     470  49bbd6c0f964a520f4531fe3  4bf58dd8d48988d127951735   
1     979  4a43c0aef964a520c6a61fe3  4bf58dd8d48988d1df941735   
2      69  4c5cc7b485a1e21e00d35711  4bf58dd8d48988d103941735   
3     395  4bc7086715a7ef3bef9878da  4bf58dd8d48988d104941735   
4      87  4cf2c5321d18a143951b5cec  4bf58dd8d48988d1cb941735   

         Category_Name   Latitude  Longitude  Timezone_Offset  \
0  Arts & Crafts Store  40.719810 -74.002579             -240   
1               Bridge  40.606800 -74.044167             -240   
2       Home (private)  40.716160 -73.883072             -240   
3       Medical Center  40.745163 -73.982521             -240   
4           Food Truck  40.740105 -73.989655             -240   

                         UTC_Time  
0  Tue Apr 03 18:00:09 +0000 2012  
1  Tue Apr 03 18:00:25 +0000 2012  
2  Tue Apr 03 18:02:24 +0000 2012  
3  Tue Apr 03 18:02:41 +0000 2012  
4  Tue Apr 03 18:03:00 +0000 2012  


## 1.2 Data Cleaning

In [90]:
# Remove duplicates
data = data.drop_duplicates()
data.reset_index(drop=True, inplace=True)

# Handle missing values
data = data.dropna()

### Convert to Local Time

In [91]:
# Convert UTC time
data['UTC_Time'] = pd.to_datetime(data['UTC_Time'], format="%a %b %d %H:%M:%S %z %Y", errors='coerce')
data = data.dropna(subset=['UTC_Time'])

# Add timezone and local time
data['Timezone_Offset'] = pd.to_timedelta(data['Timezone_Offset'], unit='m')
data['Local_Time'] = data['UTC_Time'] + data['Timezone_Offset']

# Remove UTC label after applying the offset
data['Local_Time'] = data['Local_Time'].dt.tz_convert(None)

# Drop unnecessary columns
data = data.drop(columns=['UTC_Time', 'Timezone_Offset'])
 
data.head()

Unnamed: 0,User_ID,Venue_ID,Venue_Category_ID,Category_Name,Latitude,Longitude,Local_Time
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002579,2012-04-03 14:00:09
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.044167,2012-04-03 14:00:25
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.71616,-73.883072,2012-04-03 14:02:24
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745163,-73.982521,2012-04-03 14:02:41
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740105,-73.989655,2012-04-03 14:03:00


## 1.3 Add Broader Categories

In [92]:
# Load category mapping
category_table = pd.read_csv('../data/categories.zip')

# Add missing categories dynamically if needed
missing_category = pd.DataFrame({
    'Category ID': ['4e51a0c0bd41d3446defbb2e'],
    'Category Name': ['Ferry'],
    'Category Label': ['Travel and Transportation > Ferry']
})
category_table = pd.concat([category_table, missing_category], ignore_index=True)

# Rename and merge
category_table.rename(columns={'Category ID': 'Venue_Category_ID'}, inplace=True)
data = data.merge(category_table, on='Venue_Category_ID', how='left')

# Extract broader categories
data['Broader_Category'] = data['Category Label'].str.split(' > ').str[0]
print(data[['Category_Name', 'Broader_Category']].head())

         Category_Name          Broader_Category
0  Arts & Crafts Store                    Retail
1               Bridge    Landmarks and Outdoors
2       Home (private)  Community and Government
3       Medical Center       Health and Medicine
4           Food Truck       Dining and Drinking


In [93]:
# Keep only the relevant columns
data = data.drop(columns=['Category Label','Category Name'])

# Check the cleaned data
print(data.head())

  User_ID                  Venue_ID         Venue_Category_ID  \
0     470  49bbd6c0f964a520f4531fe3  4bf58dd8d48988d127951735   
1     979  4a43c0aef964a520c6a61fe3  4bf58dd8d48988d1df941735   
2      69  4c5cc7b485a1e21e00d35711  4bf58dd8d48988d103941735   
3     395  4bc7086715a7ef3bef9878da  4bf58dd8d48988d104941735   
4      87  4cf2c5321d18a143951b5cec  4bf58dd8d48988d1cb941735   

         Category_Name   Latitude  Longitude          Local_Time  \
0  Arts & Crafts Store  40.719810 -74.002579 2012-04-03 14:00:09   
1               Bridge  40.606800 -74.044167 2012-04-03 14:00:25   
2       Home (private)  40.716160 -73.883072 2012-04-03 14:02:24   
3       Medical Center  40.745163 -73.982521 2012-04-03 14:02:41   
4           Food Truck  40.740105 -73.989655 2012-04-03 14:03:00   

           Broader_Category  
0                    Retail  
1    Landmarks and Outdoors  
2  Community and Government  
3       Health and Medicine  
4       Dining and Drinking  


In [94]:
#Handle missing values: Check and impute or drop missing data if necessary.
print(data.isnull().sum())  # Check for missing values

# Inspect rows with missing Broader_Category
missing_broader_category = data[data['Broader_Category'].isnull()]
print(missing_broader_category[['Venue_Category_ID', 'Category_Name']])

User_ID              0
Venue_ID             0
Venue_Category_ID    0
Category_Name        0
Latitude             0
Longitude            0
Local_Time           0
Broader_Category     0
dtype: int64
Empty DataFrame
Columns: [Venue_Category_ID, Category_Name]
Index: []


In [95]:
# Group by broader category
broader_analysis = data.groupby('Broader_Category').agg(
    Venue_Count=('Venue_ID', 'nunique'),
    User_Count=('User_ID', 'nunique'),
    Average_Latitude=('Latitude', 'mean'),
    Average_Longitude=('Longitude', 'mean')
).reset_index()

# Display the analysis
print(broader_analysis)

                     Broader_Category  Venue_Count  User_Count  \
0              Arts and Entertainment         2000        1017   
1  Business and Professional Services         3527        1022   
2            Community and Government         4347         979   
3                 Dining and Drinking        13909        1080   
4                 Health and Medicine          843         494   
5              Landmarks and Outdoors         3111        1024   
6                      Nightlife Spot           68          95   
7                              Retail         5745        1064   
8               Sports and Recreation          997         729   
9           Travel and Transportation         3854        1045   

   Average_Latitude  Average_Longitude  
0         40.760483         -73.983269  
1         40.752331         -73.984848  
2         40.765167         -73.974915  
3         40.749542         -73.977554  
4         40.777157         -73.962959  
5         40.752041        

## 1.4 Feature Engineering

### Derive Temporal Features

In [96]:
# Extract day of the week
data['Day_of_Week'] = data['Local_Time'].dt.day_name()

# Identify if the visit was on a weekend
data['Is_Weekend'] = data['Day_of_Week'].isin(['Saturday', 'Sunday']).astype(int)

# Extract hour to categorize the time of visit
data['Hour'] = data['Local_Time'].dt.hour

# Create time buckets (e.g., Morning, Afternoon, Evening, Night)
def time_bucket(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

data['Time_Bucket'] = data['Hour'].apply(time_bucket)

print(data[['Local_Time', 'Day_of_Week', 'Is_Weekend', 'Hour', 'Time_Bucket']].head())


           Local_Time Day_of_Week  Is_Weekend  Hour Time_Bucket
0 2012-04-03 14:00:09     Tuesday           0    14   Afternoon
1 2012-04-03 14:00:25     Tuesday           0    14   Afternoon
2 2012-04-03 14:02:24     Tuesday           0    14   Afternoon
3 2012-04-03 14:02:41     Tuesday           0    14   Afternoon
4 2012-04-03 14:03:00     Tuesday           0    14   Afternoon


### Create User Profiles

- Summarized user preferences:
  - Most frequently visited category (Category_Name_Preferred).
  - Most frequent time of visit (Time_Bucket_Preferred).

In [97]:
# Most visited category for each user
user_top_category = (
    data.groupby(['User_ID', 'Category_Name'])
    .size()
    .reset_index(name='Visit_Count')
    .sort_values(['User_ID', 'Visit_Count'], ascending=[True, False])
    .drop_duplicates('User_ID')
)
data = data.merge(user_top_category[['User_ID', 'Category_Name']], on='User_ID', how='left', suffixes=('', '_Preferred'))

# Most frequent time bucket for each user
user_top_time = (
    data.groupby(['User_ID', 'Time_Bucket'])
    .size()
    .reset_index(name='Visit_Count')
    .sort_values(['User_ID', 'Visit_Count'], ascending=[True, False])
    .drop_duplicates('User_ID')
)
data = data.merge(user_top_time[['User_ID', 'Time_Bucket']], on='User_ID', how='left', suffixes=('', '_Preferred'))


  data.groupby(['User_ID', 'Category_Name'])


In [98]:
data.head()

Unnamed: 0,User_ID,Venue_ID,Venue_Category_ID,Category_Name,Latitude,Longitude,Local_Time,Broader_Category,Day_of_Week,Is_Weekend,Hour,Time_Bucket,Category_Name_Preferred,Time_Bucket_Preferred
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002579,2012-04-03 14:00:09,Retail,Tuesday,0,14,Afternoon,Gym / Fitness Center,Morning
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.044167,2012-04-03 14:00:25,Landmarks and Outdoors,Tuesday,0,14,Afternoon,Beach,Afternoon
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.71616,-73.883072,2012-04-03 14:02:24,Community and Government,Tuesday,0,14,Afternoon,Home (private),Afternoon
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745163,-73.982521,2012-04-03 14:02:41,Health and Medicine,Tuesday,0,14,Afternoon,Office,Morning
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740105,-73.989655,2012-04-03 14:03:00,Dining and Drinking,Tuesday,0,14,Afternoon,Office,Afternoon


### Venue Popularity
- Compute venue popularity based on visit frequency.
- Normalize popularity to range between 0 and 1 for comparison.

In [99]:
# Compute venue popularity
venue_popularity = data.groupby('Venue_ID')['User_ID'].count().reset_index(name='totalVisits')

# Normalize popularity
venue_popularity['Popularity_Score'] = venue_popularity['totalVisits'] / venue_popularity['totalVisits'].max()

# Merge popularity back into the main dataset
data = data.merge(venue_popularity[['Venue_ID', 'Popularity_Score']], on='Venue_ID', how='left')
data = data.merge(venue_popularity[['Venue_ID', 'totalVisits']], on='Venue_ID', how='left')

print(data[['Venue_ID', 'totalVisits' , 'Popularity_Score']].head())

                   Venue_ID  totalVisits  Popularity_Score
0  49bbd6c0f964a520f4531fe3           33          0.028821
1  4a43c0aef964a520c6a61fe3          134          0.117031
2  4c5cc7b485a1e21e00d35711          104          0.090830
3  4bc7086715a7ef3bef9878da            2          0.001747
4  4cf2c5321d18a143951b5cec           10          0.008734


In [100]:
# Count visits per venue and time bucket
venue_time_bucket_visits = data.groupby(['Venue_ID', 'Time_Bucket']).size().reset_index(name='Visit_Count')
print(venue_time_bucket_visits.head())

# Identify the most popular time bucket for each venue
venue_top_time_bucket = venue_time_bucket_visits.sort_values(['Venue_ID', 'Visit_Count'], ascending=[True, False])\
    .drop_duplicates('Venue_ID')\
    .rename(columns={'Time_Bucket': 'Busy_TimeBucket', 'Visit_Count': 'Max_Visit_Count'})

print(venue_top_time_bucket.head())

                   Venue_ID Time_Bucket  Visit_Count
0  3fd66200f964a52000e71ee3   Afternoon            1
1  3fd66200f964a52000e71ee3     Evening            8
2  3fd66200f964a52000e71ee3     Morning            1
3  3fd66200f964a52000e71ee3       Night            6
4  3fd66200f964a52000e81ee3   Afternoon            1
                    Venue_ID Busy_TimeBucket  Max_Visit_Count
1   3fd66200f964a52000e71ee3         Evening                8
4   3fd66200f964a52000e81ee3       Afternoon                1
6   3fd66200f964a52000f11ee3           Night                1
7   3fd66200f964a52001e51ee3           Night                1
10  3fd66200f964a52001e81ee3           Night                9


In [101]:
# Merge the most popular time bucket back into the main dataset
data = data.merge(venue_top_time_bucket[['Venue_ID', 'Busy_TimeBucket']], on='Venue_ID', how='left')
print(data[['Venue_ID', 'Category_Name', 'Busy_TimeBucket']].head(10))

                   Venue_ID        Category_Name Busy_TimeBucket
0  49bbd6c0f964a520f4531fe3  Arts & Crafts Store       Afternoon
1  4a43c0aef964a520c6a61fe3               Bridge         Evening
2  4c5cc7b485a1e21e00d35711       Home (private)           Night
3  4bc7086715a7ef3bef9878da       Medical Center       Afternoon
4  4cf2c5321d18a143951b5cec           Food Truck       Afternoon
5  4b5b981bf964a520900929e3    Food & Drink Shop       Afternoon
6  4ab966c3f964a5203c7f20e3          Coffee Shop       Afternoon
7  4d0cc47f903d37041864bf55          Bus Station         Morning
8  4ce1863bc4f6a35d8bd2db6c       Home (private)         Evening
9  4be319b321d5a59352311811                 Bank       Afternoon


In [102]:
data.head()

Unnamed: 0,User_ID,Venue_ID,Venue_Category_ID,Category_Name,Latitude,Longitude,Local_Time,Broader_Category,Day_of_Week,Is_Weekend,Hour,Time_Bucket,Category_Name_Preferred,Time_Bucket_Preferred,Popularity_Score,totalVisits,Busy_TimeBucket
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002579,2012-04-03 14:00:09,Retail,Tuesday,0,14,Afternoon,Gym / Fitness Center,Morning,0.028821,33,Afternoon
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.044167,2012-04-03 14:00:25,Landmarks and Outdoors,Tuesday,0,14,Afternoon,Beach,Afternoon,0.117031,134,Evening
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.71616,-73.883072,2012-04-03 14:02:24,Community and Government,Tuesday,0,14,Afternoon,Home (private),Afternoon,0.09083,104,Night
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745163,-73.982521,2012-04-03 14:02:41,Health and Medicine,Tuesday,0,14,Afternoon,Office,Morning,0.001747,2,Afternoon
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740105,-73.989655,2012-04-03 14:03:00,Dining and Drinking,Tuesday,0,14,Afternoon,Office,Afternoon,0.008734,10,Afternoon


### Geographic Features
- Calculate the average latitude and longitude for each user to estimate their preferred region.
- Compute the distance of venues from the user’s central location.

In [103]:
# Compute user's average latitude and longitude
user_location_center = data.groupby('User_ID')[['Latitude', 'Longitude']].mean().reset_index()
user_location_center.rename(columns={'Latitude': 'Avg_Latitude', 'Longitude': 'Avg_Longitude'}, inplace=True)

# Merge user's central location into the main dataset
data = data.merge(user_location_center, on='User_ID', how='left')

# Compute distance from the user's central location
from haversine import haversine, Unit

# Vectorized computation of distances
data['Distance_From_Center'] = data.apply(
    lambda row: haversine(
        (row['Avg_Latitude'], row['Avg_Longitude']),
        (row['Latitude'], row['Longitude']),
        unit=Unit.KILOMETERS
    ), axis=1
)

In [104]:
print(data[['User_ID', 'Venue_ID', 'Distance_From_Center']].head())
print(data.nsmallest(10,'Distance_From_Center'))

  User_ID                  Venue_ID  Distance_From_Center
0     470  49bbd6c0f964a520f4531fe3              3.416134
1     979  4a43c0aef964a520c6a61fe3              2.544965
2      69  4c5cc7b485a1e21e00d35711              0.440875
3     395  4bc7086715a7ef3bef9878da              0.965516
4      87  4cf2c5321d18a143951b5cec              0.892979
       User_ID                  Venue_ID         Venue_Category_ID  \
102490      59  4c2b99aab34ad13a0f0beace  4bf58dd8d48988d118951735   
103596      14  4c83a1e451ada1cd8c332710  4bf58dd8d48988d1e0931735   
108749      14  4c83a1e451ada1cd8c332710  4bf58dd8d48988d1e0931735   
113227      14  4c83a1e451ada1cd8c332710  4bf58dd8d48988d1e0931735   
114953      14  4c83a1e451ada1cd8c332710  4bf58dd8d48988d1e0931735   
117802      14  4c83a1e451ada1cd8c332710  4bf58dd8d48988d1e0931735   
122588      14  4c83a1e451ada1cd8c332710  4bf58dd8d48988d1e0931735   
123173      14  4c83a1e451ada1cd8c332710  4bf58dd8d48988d1e0931735   
125194      14  4c83a1

In [105]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227178 entries, 0 to 227177
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   User_ID                  227178 non-null  object        
 1   Venue_ID                 227178 non-null  object        
 2   Venue_Category_ID        227178 non-null  object        
 3   Category_Name            227178 non-null  category      
 4   Latitude                 227178 non-null  float32       
 5   Longitude                227178 non-null  float32       
 6   Local_Time               227178 non-null  datetime64[ns]
 7   Broader_Category         227178 non-null  object        
 8   Day_of_Week              227178 non-null  object        
 9   Is_Weekend               227178 non-null  int64         
 10  Hour                     227178 non-null  int32         
 11  Time_Bucket              227178 non-null  object        
 12  Category_Name_Pr

### Create Profiles

In [106]:
# Precompute user profiles
user_profiles = data.groupby('User_ID').agg(
    Preferred_Category=('Category_Name_Preferred', 'first'),
    Preferred_Time_Bucket=('Time_Bucket_Preferred', 'first'),
    Avg_Latitude=('Avg_Latitude', 'mean'),
    Avg_Longitude=('Avg_Longitude', 'mean')
    
).reset_index()

# Precompute venue profiles
venue_profiles = data.groupby('Venue_ID').agg(
    Popularity_Score=('Popularity_Score', 'mean'),
    Latitude=('Latitude', 'mean'),
    Longitude=('Longitude', 'mean')
).reset_index()


### Save Engineered Features

In [107]:
# Save the engineered dataset
#data.to_csv('../data/engineered_dataset.csv', index=False)

# Step 6: Recommendation System

## 1. Recommend 10 unvisited Locations

#### 1.1 Recommendations based on Userprofile

**Algorithm:**
- Retrieve Broader Category: For the input category_name, find its corresponding Broader_Category.
- Filter Venues: Include only venues that share the same broader category.
- Exclude Visited Venues: Remove venues the user has already visited.
- Score and Rank: Rank venues based on the calculated score.

In [108]:
def recommend_similar_category_locations(user_id, category_name, data, top_k=10):
    """
    Recommend unique venues of a similar category for a user.

    Args:
        user_id (str): User ID.
        category_name (str): The specific venue category to find similar categories.
        data (pd.DataFrame): Dataset with user and venue information.
        top_k (int): Number of recommendations to return.

    Returns:
        pd.DataFrame: Top recommended venues with scores.
    """
    # Normalize input category name
    category_name = category_name.lower()
    
    # Get the broader category for the input category
    try:
        broader_category = data.loc[
            data['Category_Name'].str.lower() == category_name, 'Broader_Category'
        ].values[0]
    except IndexError:
        raise ValueError(f"Category name '{category_name}' not found in the dataset.")
    
    # Filter data for venues in the broader category
    similar_venues = data[data['Broader_Category'] == broader_category]

    # Drop duplicate venues
    similar_venues = similar_venues.drop_duplicates(subset='Venue_ID')

    # Exclude venues already visited by the user
    visited = set(data[data['User_ID'] == user_id]['Venue_ID'])
    unvisited = similar_venues[~similar_venues['Venue_ID'].isin(visited)].copy()
    
    if unvisited.empty:
        return pd.DataFrame(columns=['Venue_ID', 'Category_Name', 'Score'])
    
    # Calculate scores based on popularity and proximity
    unvisited['Score'] = unvisited['Popularity_Score'] / (1 + unvisited['Distance_From_Center'])
    
    # Return the top-k unique venues
    return unvisited.nlargest(top_k, 'Score')[['Venue_ID', 'Category_Name', 'Score', 'Latitude', 'Longitude']]

##### Example Usage

In [109]:
# Example usage of the recommendation function
user_id = '20'
category_name = 'Subway'
# Generate recommendations
recommendations = recommend_similar_category_locations(
    user_id=user_id,
    category_name=category_name,
    data=data,
    top_k=10
)
print(recommendations)

#Problem : 
# same recommendations each time with the same VenueID 
# User is supposed to input a Venue_Category_ID (maybe first check if there are different Category_IDs with the same Category_Name)

                       Venue_ID         Category_Name     Score   Latitude  \
167    42829c80f964a5206a221fe3         Train Station  0.710441  40.752808   
552    42911d00f964a520f5231fe3         Train Station  0.320038  40.750793   
389    4840fe6bf964a52030501fe3           Bus Station  0.233641  40.757221   
445    4a4821f5f964a52095aa1fe3                  Road  0.218146  40.761169   
137    4a737bf8f964a52091dc1fe3         Train Station  0.095176  40.707150   
2832   4b837318f964a520a30631e3  Gas Station / Garage  0.084362  40.672874   
33847  4afefbe2f964a5207b3222e3                 Ferry  0.057921  40.759449   
1079   4d3e1b47557d6dcb4c484b44                  Road  0.057631  40.965351   
103    43a52546f964a520532c1fe3               Airport  0.053173  40.645088   
1177   4a0c2e7bf964a52014751fe3                 Hotel  0.052181  40.762245   

       Longitude  
167   -73.977173  
552   -73.993576  
389   -73.991547  
445   -74.011292  
137   -73.990570  
2832  -73.870697  
33847 -7

#### 1.2 Recommendations based on similar users

**Collaborative Filtering**

#### 1.2.1. Construct the User-Category Matrix
user-item interaction matrix where:
- Rows represent users.
- Columns represent venues or categories.
- Values represent interaction strength (e.g., visit frequency or normalized score).

In [110]:
# Aggregate features at the User-Category level
user_category_features = data.groupby(['User_ID', 'Category_Name']).agg(
    total_visits=('totalVisits', 'sum'),
    avg_popularity=('Popularity_Score', 'mean'),
    weekend_visits=('Is_Weekend', 'sum'),
    avg_distance=('Distance_From_Center', 'mean')
).reset_index()

# Normalize numerical features to ensure fair weighting
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
user_category_features[['total_visits', 'avg_popularity', 'weekend_visits', 'avg_distance']] = scaler.fit_transform(
    user_category_features[['total_visits', 'avg_popularity', 'weekend_visits', 'avg_distance']]
)

# Create a pivot table
user_category_matrix = user_category_features.pivot_table(
    index='User_ID',
    columns='Category_Name',
    values=['total_visits', 'avg_popularity', 'weekend_visits', 'avg_distance'],
    fill_value=0
)

# Flatten multi-level columns for readability
user_category_matrix.columns = ['_'.join(col).strip() for col in user_category_matrix.columns]
print(user_category_matrix.head())

  user_category_features = data.groupby(['User_ID', 'Category_Name']).agg(
  user_category_matrix = user_category_features.pivot_table(


         avg_distance_Afghan Restaurant  avg_distance_African Restaurant  \
User_ID                                                                    
1                                   0.0                              0.0   
10                                  0.0                              0.0   
100                                 0.0                              0.0   
1000                                0.0                              0.0   
1001                                0.0                              0.0   

         avg_distance_Airport  avg_distance_American Restaurant  \
User_ID                                                           
1                    0.323900                          0.100439   
10                   0.467233                          0.095215   
100                  0.231046                          0.000000   
1000                 0.000000                          0.000000   
1001                 0.329415                          0.053867  

#### 1.2.2. Compute User Similarity

In [111]:
# Compute cosine similarity between users
user_similarity = cosine_similarity(user_category_matrix)

# Convert similarity matrix to a DataFrame
user_similarity_df = pd.DataFrame(
    user_similarity, 
    index=user_category_matrix.index, 
    columns=user_category_matrix.index
)

print(user_similarity_df.head())


User_ID         1        10       100      1000      1001      1002      1003  \
User_ID                                                                         
1        1.000000  0.418525  0.443328  0.047425  0.397440  0.159501  0.114458   
10       0.418525  1.000000  0.371499  0.143007  0.412085  0.299409  0.102807   
100      0.443328  0.371499  1.000000  0.038430  0.262777  0.187770  0.141071   
1000     0.047425  0.143007  0.038430  1.000000  0.328168  0.199540  0.022472   
1001     0.397440  0.412085  0.262777  0.328168  1.000000  0.286014  0.093820   

User_ID      1004      1005      1006  ...       990       991       992  \
User_ID                                ...                                 
1        0.559435  0.330126  0.045446  ...  0.209991  0.108572  0.344313   
10       0.387974  0.433227  0.137103  ...  0.225525  0.244318  0.424955   
100      0.392256  0.329646  0.127847  ...  0.179412  0.144347  0.365112   
1000     0.108207  0.322047  0.053607  ...  0.000000

#### 1.2.3. Recommendations based on similar users

In [112]:
def recommend_collaborative(user_id, category_name, data, user_similarity_df, user_category_matrix, top_k=10):
    """
    Recommend unvisited locations for a user based on collaborative filtering.

    Args:
        user_id (str): The target user ID.
        category_name (str): The specific category to find similar venues.
        data (pd.DataFrame): Dataset with user and venue information.
        user_similarity_df (pd.DataFrame): User similarity matrix.
        user_category_matrix (pd.DataFrame): User-category interaction matrix.
        top_k (int): Number of recommendations to return.

    Returns:
        pd.DataFrame: Top recommended venues.
    """
    # Find similar users
    similar_users = user_similarity_df.loc[user_id].sort_values(ascending=False).iloc[1:11].index
    

    # Filter data for the target category and similar users
    similar_user_venues = data[
        (data['User_ID'].isin(similar_users)) & 
        (data['Category_Name'] == category_name)
    ]

    # Exclude venues already visited by the target user
    visited_venues = set(data[data['User_ID'] == user_id]['Venue_ID'])
    unvisited = similar_user_venues[~similar_user_venues['Venue_ID'].isin(visited_venues)]

    unique_unvisited = unvisited.drop_duplicates(subset='Venue_ID')

    # Aggregate scores by venue
    recommendations = unique_unvisited.groupby('Venue_ID').agg(
        Popularity=('Popularity_Score', 'mean'),
        Latitude=('Latitude', 'first'),
        Longitude=('Longitude', 'first'),
        Category_Name=('Category_Name', 'first')
    ).reset_index()

    # Calculate scores (you can adjust weights for better personalization)
    recommendations['Score'] = recommendations['Popularity'] 
    
    return recommendations.nlargest(top_k, 'Score')

In [113]:
recommendations = recommend_collaborative(
    user_id='20',
    category_name='Subway',
    data=data,
    user_similarity_df=user_similarity_df,
    user_category_matrix=user_category_matrix,
    top_k=10
)

print(recommendations)

                    Venue_ID  Popularity   Latitude  Longitude Category_Name  \
14  4b0737c3f964a520e4f922e3    0.149345  40.756508 -73.988029        Subway   
0   4a4d10fbf964a52082ad1fe3    0.133624  40.678375 -73.903397        Subway   
8   4ad68321f964a5208e0721e3    0.103930  40.739773 -74.002579        Subway   
19  4b244b63f964a520766524e3    0.068122  40.833046 -73.860817        Subway   
7   4acfc371f964a520d2d520e3    0.067249  40.684307 -73.977600        Subway   
28  4b5b08c9f964a520f9df28e3    0.066376  40.827862 -73.925652        Subway   
10  4ae6258af964a52013a521e3    0.062009  40.815540 -73.958420        Subway   
35  4bba1bab7421a5936165c340    0.060262  40.751633 -73.975883        Subway   
18  4b21aee7f964a5201b4024e3    0.056769  40.779499 -73.955849        Subway   
24  4b4f3d64f964a5206cfe26e3    0.054148  40.803894 -73.937622        Subway   

       Score  
14  0.149345  
0   0.133624  
8   0.103930  
19  0.068122  
7   0.067249  
28  0.066376  
10  0.062009  

In [114]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def plot_user_clusters(user_similarity_df):
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(user_similarity_df)
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1])
    plt.title("User Clusters")
    plt.show()

## 2. Find top 10 most similar users
The function will return a list of the top 10 most similar users based on:

- Shared category preferences.
- Similar time preferences.
- Proximity of average visited locations.

#### Step 1: Feature Engineering

- Category-Level Preferences:
    - Category_Name_Preferred: Indicates the user's most preferred category.
    - One-hot encode this feature for similarity computation.
- Time Preferences:
    - Time_Bucket_Preferred: Reflects the user's preferred time of day.
    - One-hot encode this feature as well.
- Geographical Behavior:
    - Avg_Latitude and Avg_Longitude: Represents the user's average location for venue visits.
    - Normalize these features for better integration into the similarity matrix.

#### Step 2: Encode and Normalize Features

In [115]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd

# Extract relevant user features
user_features = data[['User_ID', 'Category_Name_Preferred', 'Time_Bucket_Preferred', 'Avg_Latitude', 'Avg_Longitude']].drop_duplicates()

# One-hot encode categorical features
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(user_features[['Category_Name_Preferred', 'Time_Bucket_Preferred']])

# Convert encoded features to DataFrame
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names_out())

# Normalize numerical features
scaler = MinMaxScaler()
normalized_coords = scaler.fit_transform(user_features[['Avg_Latitude', 'Avg_Longitude']])
normalized_coords_df = pd.DataFrame(normalized_coords, columns=['Avg_Latitude', 'Avg_Longitude'])

# Combine all features
user_features_combined = pd.concat([user_features[['User_ID']].reset_index(drop=True), encoded_df, pd.DataFrame(normalized_coords, columns=['Avg_Latitude', 'Avg_Longitude'])], axis=1)

print(user_features_combined.head())

  User_ID  Category_Name_Preferred_Airport  \
0     470                              0.0   
1     979                              0.0   
2      69                              0.0   
3     395                              0.0   
4      87                              0.0   

   Category_Name_Preferred_American Restaurant  \
0                                          0.0   
1                                          0.0   
2                                          0.0   
3                                          0.0   
4                                          0.0   

   Category_Name_Preferred_Arcade  Category_Name_Preferred_Art Gallery  \
0                             0.0                                  0.0   
1                             0.0                                  0.0   
2                             0.0                                  0.0   
3                             0.0                                  0.0   
4                             0.0                   

#### Step 3: Compute User Similarity

In [116]:
from sklearn.metrics.pairwise import cosine_similarity

# Set User_ID as the index
user_features_matrix = user_features_combined.set_index('User_ID')

# Compute cosine similarity
user_similarity = cosine_similarity(user_features_matrix)

# Convert similarity matrix to DataFrame
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_features_matrix.index,
    columns=user_features_matrix.index
)

print(user_similarity_df.head())

User_ID       470       979        69       395        87       484       642  \
User_ID                                                                         
470      1.000000  0.094851  0.182301  0.583422  0.161888  0.160333  0.152636   
979      0.094851  1.000000  0.545451  0.093091  0.536864  0.096746  0.541381   
69       0.182301  0.545451  1.000000  0.180690  0.586163  0.180198  0.584977   
395      0.583422  0.093091  0.180690  1.000000  0.580550  0.159634  0.151629   
87       0.161888  0.536864  0.586163  0.580550  1.000000  0.156180  0.573690   

User_ID       292       428       877  ...       687      1046       998  \
User_ID                                ...                                 
470      0.595434  0.545479  0.110908  ...  0.204506  0.545791  0.179316   
979      0.105093  0.075322  0.540242  ...  0.541560  0.043311  0.532644   
69       0.207686  0.130923  0.981715  ...  0.614646  0.513653  0.592252   
395      0.596041  0.543787  0.108614  ...  0.203375

#### Step 4: Find the Top 10 Most Similar Users

Extract the most similar users for a given *user_id*.

In [117]:
def find_top_similar_users(user_id, user_similarity_df, top_n=10):
    """
    Find the top N most similar users for a given user.

    Args:
        user_id (str): The user ID to find similar users for.
        user_similarity_df (pd.DataFrame): User similarity matrix.
        top_n (int): Number of similar users to return.

    Returns:
        pd.Series: Top N similar users and their similarity scores.
    """
    if user_id not in user_similarity_df.index:
        raise ValueError(f"User ID {user_id} not found in the dataset.")
    
    # Sort similar users by similarity score, excluding the user themselves
    similar_users = user_similarity_df.loc[user_id].sort_values(ascending=False).iloc[1:top_n + 1]
    return similar_users

In [118]:
# Example usage
user_id = '20'
top_similar_users = find_top_similar_users(user_id, user_similarity_df, top_n=10)
print(top_similar_users)

User_ID
992    0.999628
393    0.997763
826    0.989362
355    0.989355
553    0.988840
686    0.988047
900    0.987920
136    0.987869
137    0.986297
966    0.986005
Name: 20, dtype: float64


## 3. Recommend a place where 5 people can meet
Content-Based Filtering:
- Content-based filtering recommends venues by matching user preferences to venue features.

#### 1. Randomly Select One Check-in Per User

In [119]:
import random

def select_random_checkins(user_ids, data):
    """
    Randomly select one check-in per user from the dataset.

    Args:
        user_ids (list): List of user IDs.
        data (pd.DataFrame): Dataset with user and check-in information.

    Returns:
        pd.DataFrame: Selected check-ins for the given users.
    """
    # Filter data for the specified user IDs
    user_checkins = data[data['User_ID'].isin(user_ids)]
    
    # Randomly select one check-in per user
    random_checkins = user_checkins.groupby('User_ID').apply(lambda x: x.sample(1)).reset_index(drop=True)
    
    return random_checkins[['User_ID', 'Latitude', 'Longitude']]


#### 2. Find the Central Meeting Point

In [120]:
def get_central_meeting_point(selected_checkins):
    """
    Calculate the central meeting point for the selected check-ins.

    Args:
        selected_checkins (pd.DataFrame): Selected check-ins with Latitude and Longitude.

    Returns:
        tuple: Central latitude and longitude for the meeting point.
    """
    central_lat = selected_checkins['Latitude'].mean()
    central_lon = selected_checkins['Longitude'].mean()
    
    return central_lat, central_lon


#### 3. Use KNN to Find the Nearest Venues

Use the KNN-based nearest venue finder, ensuring that venues are unique.

In [121]:
from sklearn.neighbors import NearestNeighbors
from geopy.distance import geodesic

def find_nearest_venues(central_point, data, k=1):
    """
    Use KNN to find the nearest venues to the central meeting point, ensuring unique Venue_IDs.

    Args:
        central_point (tuple): Central latitude and longitude.
        data (pd.DataFrame): Dataset with venue information.
        k (int): Number of nearest venues to return.

    Returns:
        pd.DataFrame: The k nearest unique venues.
    """
    # Ensure unique Venue_IDs
    data = data.drop_duplicates(subset='Venue_ID')

    # Prepare venue location data
    venue_locations = data[['Latitude', 'Longitude']].to_numpy()

    # Initialize KNN model
    knn = NearestNeighbors(n_neighbors=k, metric='euclidean')
    knn.fit(venue_locations)

    # Find the nearest venues
    distances, indices = knn.kneighbors([central_point])

    # Extract the nearest venues
    nearest_venues = data.iloc[indices[0]].copy()
    nearest_venues['Distance_From_Central'] = distances[0]

    return nearest_venues


#### 4. Recommend meeting Place

In [122]:
def recommend_meeting_place_random_checkins(user_ids, data, k=1):
    """
    Recommend the nearest meeting place for a group of users by selecting random check-ins.

    Args:
        user_ids (list): List of user IDs.
        data (pd.DataFrame): Dataset with user and venue information.
        k (int): Number of nearest venues to return.

    Returns:
        tuple: The selected check-ins and the nearest venue(s).
    """
    # Step 1: Randomly select one check-in per user
    selected_checkins = select_random_checkins(user_ids, data)
    
    # Step 2: Calculate the central meeting point
    central_point = get_central_meeting_point(selected_checkins)
    
    # Step 3: Find the nearest venues
    nearest_venues = find_nearest_venues(central_point, data, k=k)
    
    return selected_checkins, nearest_venues


In [123]:
# Example list of user IDs
user_ids = ['470', '979', '69', '395', '87']

# Call the function to get recommendations
selected_checkins, nearest_venues = recommend_meeting_place_random_checkins(user_ids, data, k=5)

# Display the selected check-ins and the nearest venues
print("Selected Check-ins:")
print(selected_checkins)

print("\nNearest Venues:")
print(nearest_venues[['Venue_ID', 'Category_Name', 'Latitude', 'Longitude', 'Distance_From_Central']])


Selected Check-ins:
  User_ID   Latitude  Longitude
0     395  40.748005 -74.008629
1     470  40.762802 -73.973366
2      69  40.712631 -73.900215
3      87  40.737850 -73.991562
4     979  40.590755 -74.066513

Nearest Venues:
                       Venue_ID         Category_Name   Latitude  Longitude  \
25336  4f58c483e4b0bdfd0f0acfed  Other Great Outdoors  40.710072 -73.989159   
621    4e53c5c1b0fb6cfefec56577        Home (private)  40.711456 -73.988876   
5275   4e8684590aaf882b7042b9d7        Home (private)  40.711456 -73.988876   
31680  4dce4acd8877851243f3d0ef        Home (private)  40.711456 -73.988876   
3186   4f56af10e4b0861b32cf96a6        Home (private)  40.711456 -73.988876   

       Distance_From_Central  
25336               0.001150  
621                 0.001326  
5275                0.001326  
31680               0.001326  
3186                0.001326  


  random_checkins = user_checkins.groupby('User_ID').apply(lambda x: x.sample(1)).reset_index(drop=True)


In [124]:
import folium

def visualize_random_checkins_and_venues(selected_checkins, nearest_venues):
    """
    Visualize the selected user check-ins and recommended venues on a map.

    Args:
        selected_checkins (pd.DataFrame): DataFrame containing randomly selected user check-ins with:
            - User_ID: Unique identifier for the user.
            - Latitude: Latitude of the check-in.
            - Longitude: Longitude of the check-in.
        nearest_venues (pd.DataFrame): DataFrame containing recommended venues with:
            - Venue_ID: Unique identifier for the venue.
            - Category_Name: Name of the venue's category.
            - Latitude: Latitude of the venue.
            - Longitude: Longitude of the venue.
            - Distance_From_Central (optional): Distance from the central meeting point.

    Returns:
        folium.Map: A map showing both user check-ins and recommended venues.
    """
    # Use the first user's check-in as the map's center
    center_lat = selected_checkins['Latitude'].iloc[0]
    center_long = selected_checkins['Longitude'].iloc[0]

    # Initialize the map
    combined_map = folium.Map(location=[center_lat, center_long], zoom_start=13)

    # Add user check-in markers
    for _, row in selected_checkins.iterrows():
        user_id = row['User_ID']
        lat = row['Latitude']
        long = row['Longitude']

        popup = folium.Popup(f"User ID: {user_id}<br>Latitude: {lat:.6f}<br>Longitude: {long:.6f}", max_width=300)
        folium.Marker(
            location=[lat, long],
            popup=popup,
            icon=folium.Icon(color='blue', icon='user', prefix='fa')  # Blue marker for users
        ).add_to(combined_map)

    # Add venue markers
    for _, row in nearest_venues.iterrows():
        venue_id = row['Venue_ID']
        category = row['Category_Name']
        lat = row['Latitude']
        long = row['Longitude']
        distance = row.get('Distance_From_Central', None)

        # Create popup with venue details
        details = f"Venue ID: {venue_id}<br>Category: {category}"
        if distance is not None:
            details += f"<br>Distance from Central: {distance:.2f} km"

        popup = folium.Popup(details, max_width=300)
        folium.Marker(
            location=[lat, long],
            popup=popup,
            icon=folium.Icon(color='red', icon='cutlery', prefix='fa')  # Red marker for venues
        ).add_to(combined_map)

    return combined_map


In [125]:
# Generate recommendations
selected_checkins, nearest_venues = recommend_meeting_place_random_checkins(user_ids=['470', '979', '69', '395', '87'], data=data, k=5)
#print(selected_checkins)
# Visualize the results
combined_map = visualize_random_checkins_and_venues(selected_checkins, nearest_venues)

# Display the map in the notebook
combined_map

  random_checkins = user_checkins.groupby('User_ID').apply(lambda x: x.sample(1)).reset_index(drop=True)
