In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import NearestNeighbors


# Load dataset
bookingsdata_frame = pd.read_csv(r"C:\Users\lasya\OneDrive\Desktop\DS PROJECT\datascience project\bookings.csv")
customersdata_frame = pd.read_csv(r"C:\Users\lasya\OneDrive\Desktop\DS PROJECT\datascience project\customers.csv")
routesdata_frame = pd.read_csv(r"C:\Users\lasya\OneDrive\Desktop\DS PROJECT\datascience project\routes.csv")

# Merge datasets

# Merge bookings with customers on 'Customer_ID'
merged_df = pd.merge(bookingsdata_frame, customersdata_frame, on="Customer_ID", how="inner")

# Merge the result with routes on 'Route_ID'
merged_df = pd.merge(merged_df, routesdata_frame, on="Route_ID", how="inner")
merged_df.columns

# Drop irrelevant columns
merged_df = merged_df.drop(columns=['Booking_ID', 'Route_ID', 'Booking_Date', 'Travel_Date', 'Name'])

# Select Features and Target
X = merged_df.drop(columns=['Destination']) # features
y = 'Destination' # target
                           

# Encoding categorical features using Label Encoder
label_encoder = LabelEncoder()
categorical_cols = ['Age_Group', 'Gender', 'Location', 'Origin', 'Preferred_Bus_Type', 'Bus_Type']

for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])


# Aggregate features per customer (to avoid duplicate entries)
# Group by 'Customer_ID' and take the mean for numerical columns
customer_features = X.groupby('Customer_ID').agg({
    'Seats_Booked': 'sum',  # Total seats booked
    'Rating': 'mean',  # Average rating
    'Age_Group': 'mean',  # Encoded age group (mean is fine since it's label-encoded)
    'Gender': 'mean',  # Encoded gender (mean works for binary encoding)
    'Location': 'mean',  # Encoded location
    'Preferred_Bus_Type': 'mean',  # Encoded bus preference
    'Origin': 'mean',  # Encoded origin
    'Distance (km)': 'mean',  # Average distance
    'Duration (hours)': 'mean',  # Average duration
    'Bus_Type': 'mean',  # Encoded bus type
    'Ticket_Price (INR)': 'mean'  # Average ticket price
}).reset_index()


# Feature Scaling (Standardize Features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(customer_features.drop(columns=['Customer_ID']))


# Train KNN for Recommendation

from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(X_scaled)

# Evaluation
'''
for a KNN-based recommendation system, no need of traditional metrics like accuracy, precision, 
recall, or RMSE because:

* we are not predicting a single target value but retrieving similar customers.
* KNN is an unsupervised approach for recommendations, so there's no ground truth.
* The best way to evaluate is by checking if the recommendations make sense for users.
'''

# Recommend Top 5 Destinations for a Given Customer
'''
* Find the index of the given customer
* Find 5 most similar customers using knn
* Get the Customer_IDs of similar users
* Find the destinations of these similar customers
* Retrieve destinations already visited by the customer
* Exclude destinations already visited
* Get the top 5 most common destinations
'''

def recommend_destinations(customer_id, customer_features, merged_df, knn_model):
    """
    Recommend top 5 destinations for a given customer ID using KNN.

    Parameters:
        customer_id (int): The Customer_ID to recommend destinations for.
        customer_features (DataFrame): Processed customer features after encoding & grouping.
        merged_df (DataFrame): Original dataset with 'Customer_ID' and 'Destination'.
        knn_model (NearestNeighbors): Trained KNN model.

    Returns:
        List of top 5 recommended destinations.
    """
    
    # Check if customer exists
    if customer_id not in customer_features['Customer_ID'].values:
        print(f"\nCustomer ID {customer_id} not found in the database.")
        return " "
    else:
        print(f"\nRecommendations for Customer ID - {customer_id}:")

    # Get the index of the given customer
    customer_idx = customer_features[customer_features['Customer_ID'] == customer_id].index[0]

    # Find 5 nearest customers
    distances, indices = knn.kneighbors(X_scaled[[customer_idx]], n_neighbors=5)

    # Get similar customers' IDs
    similar_customers = customer_features.iloc[indices[0]]['Customer_ID'].values

    # Find their destinations
    recommended_destinations = merged_df[merged_df['Customer_ID'].isin(similar_customers)]['Destination'].unique()
    
    # Retrieve destinations already visited by the customer
    visited_destinations = set(merged_df[merged_df['Customer_ID'] == customer_id]['Destination'].unique())

    # Exclude destinations already visited
    recommended_destinations = [dest for dest in recommended_destinations if dest not in visited_destinations]
    
    # Get the top 5 most common destinations
    top_destinations = pd.Series(recommended_destinations).value_counts().head(5).index.tolist()
    '''
    value_counts() automatically sorts the values in descending order by default
    It sorts them in descending order (most frequent first).
    .head(5) selects the top 5 most common destinations.
    .index.tolist() converts them to a list of destination names.
    '''
    
    return top_destinations

# Example: Recommend destinations using KNN
recommended_destinations = recommend_destinations('C12345', customer_features, merged_df, knn)
print(recommended_destinations)

recommended_destinations = recommend_destinations('C080', customer_features, merged_df, knn)
print(recommended_destinations)


#customer_id='C080'



Customer ID C12345 not found in the database.
 

Recommendations for Customer ID - C080:
['Lake Jamesburgh', 'Jessicabury', 'Port James', 'New Marissashire', 'West Joelborough']
