## 1. Load and Inspect the Data
Load the data and check its structure and basic statistics.

In [3]:
import pandas as pd

# Column labels for the tab-separated file, which is read without a header row
column_names = [
    'User_ID', 'Venue_ID', 'Venue_Category_ID', 'Venue_Category_Name',
    'Latitude', 'Longitude', 'Timezone_Offset', 'UTC_Time',
]

# Read the raw check-in file and attach the column labels
data = pd.read_csv(
    'dataset_NYC.txt',
    sep='\t',
    header=None,
    encoding='ISO-8859-1',
)
data.columns = column_names

# Preview the first rows
print(data.head())

# Number of rows and columns
print("Dataset shape:", data.shape)

# Descriptive statistics as produced by DataFrame.describe()
print(data.describe())

# Count of missing entries in each column
print("Missing values per column:\n", data.isna().sum())

FileNotFoundError: [Errno 2] No such file or directory: 'dataset_NYC.txt'

## 2. Handle Missing Values
Fill in missing venue categories and drop rows that lack coordinates.

### a) Handle Venue_Category_Name:

In [8]:
# Impute missing category names with the most common one.
# mode() may return several values on a tie; the entry labelled 0 is used.
most_frequent_category = data['Venue_Category_Name'].mode(dropna=True)[0]
data['Venue_Category_Name'] = data['Venue_Category_Name'].where(
    data['Venue_Category_Name'].notna(),
    most_frequent_category,
)

### b) Drop Rows with Missing Coordinates:

In [9]:
# Keep only the check-ins for which both coordinates are present
has_coordinates = data['Latitude'].notna() & data['Longitude'].notna()
data = data[has_coordinates]
print("Dataset shape:", data.shape)

Dataset shape: (226913, 8)


## 3.  Remove Outliers
### a) Check Coordinate Bounds:

Use approximate bounds for New York City:

In [7]:
# Restrict check-ins to an approximate NYC bounding box (both edges inclusive)
in_latitude_range = data['Latitude'].between(40.5, 41)
in_longitude_range = data['Longitude'].between(-74.25, -73.5)
data = data[in_latitude_range & in_longitude_range]

### b) Remove Rare or Extreme Categories:

If any categories have very few entries, decide whether to keep them:

In [13]:
# Minimum number of check-ins a category needs in order to be kept
MIN_CATEGORY_COUNT = 100

# Check frequency of categories (computed before the rare ones are removed)
category_counts = data['Venue_Category_Name'].value_counts()
print(category_counts)

# Optional: drop categories with fewer than MIN_CATEGORY_COUNT occurrences
rare_categories = category_counts[category_counts < MIN_CATEGORY_COUNT].index
data = data[~data['Venue_Category_Name'].isin(rare_categories)]

Venue_Category_Name
Bar                         15960
Home (private)              15231
Office                      12735
Subway                       9348
Gym / Fitness Center         9127
                            ...  
Portuguese Restaurant           9
Bike Rental / Bike Share        9
Distillery                      8
Internet Cafe                   6
Gluten-free Restaurant          5
Name: count, Length: 244, dtype: int64


In [14]:
# Recount check-ins per category (largest count first) and display the result
category_counts = data['Venue_Category_Name'].value_counts(sort=True, ascending=False)
print(category_counts)

Venue_Category_Name
Bar                     15960
Home (private)          15231
Office                  12735
Subway                   9348
Gym / Fitness Center     9127
                        ...  
Market                    113
Factory                   112
Record Shop               112
Zoo                       104
Pool Hall                 100
Name: count, Length: 167, dtype: int64


## 4. Transform Features

### a) Normalize Latitude and Longitude:

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize both coordinate columns with a StandardScaler fitted on this data
coordinate_columns = ['Latitude', 'Longitude']
scaler = StandardScaler()
scaler.fit(data[coordinate_columns])
data[coordinate_columns] = scaler.transform(data[coordinate_columns])

### b) Encode Categories with One-Hot Encoding:

In [16]:
# Build one indicator column per venue category, named "Category_<name>"
venue_categories = pd.get_dummies(
    data['Venue_Category_Name'],
    prefix='Category',
    prefix_sep='_',
)
# Append the indicator columns to the right of the existing columns
data = pd.concat([data, venue_categories], axis='columns')

## 5. Enhance Dataset

### a) Add Popularity as a Feature:

Calculate the number of check-ins per venue:

In [20]:
# Number of check-in rows recorded for each Venue_ID
venue_popularity = data['Venue_ID'].value_counts()
# Attach that count to every row; IDs without a count fall back to 0
popularity_per_row = data['Venue_ID'].map(venue_popularity)
data['Venue_Popularity'] = popularity_per_row.fillna(0)

### b) Add Proximity to a Central Location:

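Compute each check-in's distance from the origin of the standardized coordinates ([0, 0]), which corresponds to the mean check-in location and serves as an approximate NYC center: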

In [21]:
# Euclidean norm of the (Latitude, Longitude) pair, i.e. the distance from (0, 0)
data['Proximity'] = data['Latitude'].pow(2).add(data['Longitude'].pow(2)).pow(0.5)

In [18]:
# Show the first five rows of the DataFrame
print(data.head(n=5))

   User_ID                  Venue_ID         Venue_Category_ID  \
0      470  49bbd6c0f964a520f4531fe3  4bf58dd8d48988d127951735   
1      979  4a43c0aef964a520c6a61fe3  4bf58dd8d48988d1df941735   
2       69  4c5cc7b485a1e21e00d35711  4bf58dd8d48988d103941735   
3      395  4bc7086715a7ef3bef9878da  4bf58dd8d48988d104941735   
4       87  4cf2c5321d18a143951b5cec  4bf58dd8d48988d1cb941735   

   Venue_Category_Name  Latitude  Longitude  Timezone_Offset  \
0  Arts & Crafts Store -0.471145  -0.337254             -240   
1               Bridge -2.026610  -0.825377             -240   
2       Home (private) -0.521365   1.065434             -240   
3       Medical Center -0.122184  -0.101785             -240   
4           Food Truck -0.191829  -0.185582             -240   

                         UTC_Time  Category_Airport  \
0  Tue Apr 03 18:00:09 +0000 2012             False   
1  Tue Apr 03 18:00:25 +0000 2012             False   
2  Tue Apr 03 18:02:24 +0000 2012             False  

### c) Add Time Features
Extract useful time features from UTC_Time:

In [24]:
# Layout of the raw timestamps, e.g. "Tue Apr 03 18:00:09 +0000 2012"
UTC_TIME_FORMAT = '%a %b %d %H:%M:%S %z %Y'

# Convert UTC_Time to datetime using an explicit format, so parsing does not
# depend on format inference; values that do not match the format become NaT
data['UTC_Time'] = pd.to_datetime(data['UTC_Time'], format=UTC_TIME_FORMAT, errors='coerce')

# Extract time-based features
# NOTE: these are read from the UTC timestamp; Timezone_Offset is not applied here
data['Hour'] = data['UTC_Time'].dt.hour
data['Day'] = data['UTC_Time'].dt.day
data['Month'] = data['UTC_Time'].dt.month

## 6. Prepare Data for Recommendations

### a) Create User Profiles:

Aggregate user activity across all categories:

In [25]:
# Per-user totals of every one-hot category column
category_columns = list(venue_categories.columns)
user_profiles = data.groupby('User_ID')[category_columns].sum()
print(user_profiles.head())

         Category_Airport  Category_American Restaurant  Category_Arcade  \
User_ID                                                                    
1                       3                            10                0   
2                       0                             4                0   
3                       8                            20                0   
4                       0                             1                0   
5                       0                             0                0   

         Category_Art Gallery  Category_Art Museum  \
User_ID                                              
1                           0                    0   
2                           0                    0   
3                           0                    5   
4                           0                    1   
5                           0                    0   

         Category_Arts & Crafts Store  Category_Asian Restaurant  \
User_ID             

### b) Filter Unvisited Venues for Recommendation:

Identify venues not visited by a specific user:

In [26]:
# Example: select one user and collect that user's check-ins
user_id = 293  # Replace with your user ID
user_data = data.loc[data['User_ID'].eq(user_id)]

# Rows whose venue does not appear among this user's check-ins
visited_venues = user_data['Venue_ID'].unique()
unvisited_venues = data.loc[~data['Venue_ID'].isin(visited_venues)]

#### Check the variance of user profiles and venue vectors:

In [27]:
# Variance across the category columns of each user profile (first five users)
print("User profile variance:", user_profiles.var(axis='columns').head())
# Sum of the per-column variances of the one-hot category indicators
print("Venue vectors variance:", venue_categories.var(axis='index').sum())

User profile variance: User_ID
1     4.477310
2     6.055840
3     5.063992
4    29.154318
5     6.995311
dtype: float64
Venue vectors variance: 0.9748250510218635


In [28]:
# Step 6: Filter unvisited venues for a specific user
def get_unvisited_venues(user_id, category_name, df=None):
    """Return check-in rows of a given category at venues the user has not visited.

    Parameters
    ----------
    user_id
        Value matched against the 'User_ID' column.
    category_name
        Value matched against the 'Venue_Category_Name' column.
    df : pandas.DataFrame, optional
        Check-in table with 'User_ID', 'Venue_ID' and 'Venue_Category_Name'
        columns. When omitted, the notebook-level `data` frame is used.

    Returns
    -------
    pandas.DataFrame
        The rows of the table whose venue is not among the user's check-ins
        and whose category equals `category_name`. One row per check-in, so
        a venue may appear more than once.
    """
    # Fall back to the notebook's global frame to stay compatible with
    # existing two-argument calls
    checkins = data if df is None else df

    # Venues the user has visited
    user_rows = checkins[checkins['User_ID'] == user_id]
    visited_venues = user_rows['Venue_ID'].unique()

    not_visited = ~checkins['Venue_ID'].isin(visited_venues)
    in_category = checkins['Venue_Category_Name'] == category_name
    return checkins[not_visited & in_category]

# Example: look up "Restaurant" check-ins at venues user 470 has not visited
user_id = 470
category_name = 'Restaurant'
unvisited_venues = get_unvisited_venues(user_id=user_id, category_name=category_name)

# Columns shown in the preview
preview_columns = ['Venue_ID', 'Venue_Category_Name', 'Latitude', 'Longitude', 'Venue_Popularity']
print("Unvisited venues for User_ID 470 in category 'Restaurant':")
print(unvisited_venues[preview_columns].head())

Unvisited venues for User_ID 470 in category 'Restaurant':
                      Venue_ID Venue_Category_Name  Latitude  Longitude  \
17    4a689777f964a520a0ca1fe3          Restaurant  0.016983  -0.058210   
685   4d76b2bf18cc5941fd7cf33f          Restaurant -0.171242  -0.349786   
773   4e45bca962845e1d3c88924e          Restaurant -1.090270  -0.028430   
1989  4bd57bb44e32d13a8dd9c080          Restaurant -1.715480   0.648282   
2880  4bb9c82e7421a59301ffc240          Restaurant -1.606022  -0.189709   

      Venue_Popularity  
17                  36  
685                  7  
773                 11  
1989                21  
2880                 1  


In [29]:
# Per-user variance across the category columns, shown for the first five users
print("User profile variance:", user_profiles.var(axis='columns').head())
# Total of the column-wise variances of the one-hot category indicators
print("Venue vectors variance:", venue_categories.var(axis='index').sum())

User profile variance: User_ID
1     4.477310
2     6.055840
3     5.063992
4    29.154318
5     6.995311
dtype: float64
Venue vectors variance: 0.9748250510218635


In [1]:
# Write the processed DataFrame to CSV without the index column
preprocessed_file = 'preprocessed_dataset.csv'
data.to_csv(path_or_buf=preprocessed_file, index=False)
print(f"Preprocessed dataset saved to {preprocessed_file}")

NameError: name 'data' is not defined