In [None]:
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# The four CSV files below must be uploaded to the Colab session before running this cell.
DESTINATIONS_CSV = "Expanded_Destinations.csv"
REVIEWS_CSV = "Final_Updated_Expanded_Reviews.csv"
USER_HISTORY_CSV = "Final_Updated_Expanded_UserHistory.csv"
USERS_CSV = "Final_Updated_Expanded_Users.csv"

# Load each raw table into its own DataFrame (read order is unchanged).
destinations_df = pd.read_csv(DESTINATIONS_CSV)
reviews_df = pd.read_csv(REVIEWS_CSV)
userhistory_df = pd.read_csv(USER_HISTORY_CSV)
users_df = pd.read_csv(USERS_CSV)

In [None]:
# Uniform typing: cast both join keys to str in every frame that carries them,
# so the merges below compare values of the same type.
for frame in (users_df, userhistory_df, reviews_df):
    frame['UserID'] = frame['UserID'].astype(str)
for frame in (destinations_df, userhistory_df, reviews_df):
    frame['DestinationID'] = frame['DestinationID'].astype(str)

# Combine the datasets: start from the visit history and left-join the user
# attributes (on UserID) and then the destination attributes (on DestinationID).
merged_df = (
    userhistory_df
    .merge(users_df, on='UserID', how='left')
    .merge(destinations_df, on='DestinationID', how='left')
)

# `df` is an alias of merged_df (same object, not a copy).
df = merged_df

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   HistoryID         999 non-null    int64  
 1   UserID            999 non-null    object 
 2   DestinationID     999 non-null    object 
 3   VisitDate         999 non-null    object 
 4   ExperienceRating  999 non-null    int64  
 5   Name_x            999 non-null    object 
 6   Email             999 non-null    object 
 7   Preferences       999 non-null    object 
 8   Gender            999 non-null    object 
 9   NumberOfAdults    999 non-null    int64  
 10  NumberOfChildren  999 non-null    int64  
 11  Name_y            999 non-null    object 
 12  State             999 non-null    object 
 13  Type              999 non-null    object 
 14  Popularity        999 non-null    float64
 15  BestTimeToVisit   999 non-null    object 
dtypes: float64(1), int64(4), object(11)
memory u

In [None]:
# Parse the visit date into a datetime column.
df['VisitDate'] = pd.to_datetime(df['VisitDate'])

# Drop the columns that are not used downstream.
df_copia = df.drop(columns=['HistoryID', 'Name_x', 'Email'])


def _min_max_scale(values):
    """Rescale a numeric Series to the [0, 1] range.

    A constant Series (max == min) is mapped to zeros instead of dividing by
    zero, which would otherwise fill the column with NaN.
    """
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        return values * 0.0
    return (values - lowest) / span


# Normalise the categorical columns.
# Split on commas AND strip each label: a plain split(',') on "City, Historical"
# yields ["City", " Historical"], so the same preference would be encoded as two
# different classes depending on its position in the string.
df_copia['PreferencesList'] = df_copia['Preferences'].str.split(',').map(
    lambda prefs: [label.strip() for label in prefs],
    na_action='ignore',
)
df_copia['Type'] = df_copia['Type'].str.lower()
df_copia['BestTimeToVisit'] = df_copia['BestTimeToVisit'].str.lower()

# Min-max normalised numeric columns. Note that the lowest observed rating maps to 0.0.
df_copia['PopularityNormalizado'] = _min_max_scale(df_copia['Popularity'])
df_copia['ExperienceRatingNormalizado'] = _min_max_scale(df_copia['ExperienceRating'])
df_copia.head()

Unnamed: 0,UserID,DestinationID,VisitDate,ExperienceRating,Preferences,Gender,NumberOfAdults,NumberOfChildren,Name_y,State,Type,Popularity,BestTimeToVisit,PreferencesList,PopularityNormalizado,ExperienceRatingNormalizado
0,525,760,2024-01-01,3,"City, Historical",Female,2,2,Leh Ladakh,Jammu and Kashmir,adventure,8.35218,apr-jun,"[City, Historical]",0.424836,0.5
1,184,532,2024-02-15,5,"Beaches, Historical",Male,1,2,Goa Beaches,Goa,beach,8.988127,nov-mar,"[Beaches, Historical]",0.743557,1.0
2,897,786,2024-03-20,2,"City, Historical",Female,1,2,Taj Mahal,Uttar Pradesh,historical,8.389206,nov-feb,"[City, Historical]",0.443392,0.25
3,470,660,2024-01-01,1,"Nature, Adventure",Male,2,1,Leh Ladakh,Jammu and Kashmir,adventure,7.923388,apr-jun,"[Nature, Adventure]",0.209936,0.0
4,989,389,2024-02-15,4,"Nature, Adventure",Male,2,1,Kerala Backwaters,Kerala,nature,9.409146,sep-mar,"[Nature, Adventure]",0.954561,0.75


In [None]:
# User x destination matrix of normalised ratings; user/destination pairs
# without a recorded rating are filled with 0.
user_item_matrix = df_copia.pivot_table(
    index='UserID',
    columns='DestinationID',
    values='ExperienceRatingNormalizado',
    fill_value=0,
)

# Collaborative signal: embed every destination (a column of the matrix) with a
# truncated SVD and compare the embeddings with cosine similarity.
svd = TruncatedSVD(n_components=20, random_state=42)
item_embeddings = svd.fit_transform(user_item_matrix.T)
destination_similarity = cosine_similarity(item_embeddings)

# Destination indices: DestinationID -> row/column position in the similarity matrices.
destination_indices = {
    dest: idx for idx, dest in enumerate(user_item_matrix.columns)
}

# Content base: one row per destination, re-ordered to follow
# user_item_matrix.columns. Without the reindex the rows would stay in
# first-appearance order, and position `idx` of the content similarity matrix
# would describe a different destination than position `idx` of
# destination_similarity / destination_indices.
destination_content = (
    df_copia[[
        "DestinationID",
        "Name_y",
        "PopularityNormalizado",
        "PreferencesList",
        "Type",
        "BestTimeToVisit"
    ]]
    .drop_duplicates(subset=["DestinationID"])
    .set_index("DestinationID")
    .reindex(user_item_matrix.columns)
    .rename_axis("DestinationID")
    .reset_index()
)
assert destination_content["DestinationID"].tolist() == list(user_item_matrix.columns), \
    "destination_content must follow the column order of user_item_matrix"

# MultiLabelBinarizer for PreferencesList (one indicator column per label).
mlb = MultiLabelBinarizer()
prefs_encoded = pd.DataFrame(
    mlb.fit_transform(destination_content["PreferencesList"]),
    columns=[f"Pref_{c}" for c in mlb.classes_],
    index=destination_content.index,
)

# One-hot encode Type and BestTimeToVisit.
dummies = pd.get_dummies(destination_content[["Type", "BestTimeToVisit"]])

# Concatenate identifiers, popularity and the encoded features side by side.
destination_content_encoded = pd.concat([
    destination_content[["DestinationID", "Name_y", "PopularityNormalizado"]],
    dummies,
    prefs_encoded
], axis=1)

# Make sure every feature column is float. Casting all non-identifier columns
# also covers uint8 / platform-specific integer dtypes, which a
# `dtype in [bool, int]` test does not match.
for col in destination_content_encoded.columns.drop(["DestinationID", "Name_y"]):
    destination_content_encoded[col] = destination_content_encoded[col].astype(float)

# NOTE: despite its name, `user_similarity` is the destination-to-destination
# cosine similarity of the content features (rows are destinations, not users).
user_similarity = cosine_similarity(destination_content_encoded.drop(columns=["DestinationID", "Name_y"]))
destinations_names = df_copia[['DestinationID', 'Name_y']].drop_duplicates()

In [None]:
def recommend_destinations(user_id, top_n=5, alpha=0.5):
    """Recommend destinations for a known user by blending two similarity signals.

    For every destination the user has visited, the matching rows of
    `destination_similarity` (collaborative) and `user_similarity` (content)
    are averaged; the final score is
    ``alpha * collaborative + (1 - alpha) * content``.

    Parameters
    ----------
    user_id : value looked up in ``user_item_matrix.index``.
    top_n : int, number of destination names to return.
    alpha : float, weight of the collaborative score.

    Returns
    -------
    pandas.DataFrame with columns ``DestinationName`` and ``HybridScore``,
    sorted by score (descending) and limited to ``top_n`` rows. Scores are
    averaged over all unvisited DestinationIDs that share a name.
    If ``user_id`` is not in ``user_item_matrix.index`` a message is printed
    and the integer ``0`` is returned (kept as-is for existing callers).
    """
    if user_id not in user_item_matrix.index:
        print('Usuario no válido')
        return 0

    # Read the visits from the raw history instead of testing
    # `user_item_matrix.loc[user_id] > 0`: a minimum rating is min-max
    # normalised to 0.0, which is also the pivot's fill value, so those visits
    # would be treated as "not visited" and could be recommended again.
    user_history = df_copia.loc[df_copia["UserID"] == user_id, "DestinationID"]
    visited_destinations = user_item_matrix.columns[
        user_item_matrix.columns.isin(user_history)
    ]
    visited_positions = [destination_indices[d] for d in visited_destinations]

    if visited_positions:
        # Average similarity of every destination to the visited ones.
        collab_scores = destination_similarity[visited_positions].mean(axis=0)
        content_scores = user_similarity[visited_positions].mean(axis=0)
    else:
        n_destinations = len(user_item_matrix.columns)
        collab_scores = np.zeros(n_destinations)
        content_scores = np.zeros(n_destinations)

    hybrid_scores = alpha * collab_scores + (1 - alpha) * content_scores

    recommendations = pd.DataFrame({
        "DestinationID": user_item_matrix.columns,
        "HybridScore": hybrid_scores
    })

    # Drop what the user already visited, attach names, and collapse the
    # DestinationIDs that share a name into a single averaged score.
    recommendations = recommendations[~recommendations["DestinationID"].isin(visited_destinations)]
    recommendations = recommendations.merge(destinations_names, on="DestinationID", how="left")
    recommendations = recommendations.groupby("Name_y").agg({"HybridScore": "mean"}).reset_index()
    recommendations = recommendations.sort_values("HybridScore", ascending=False).head(top_n)
    recommendations = recommendations.rename(columns={"Name_y": "DestinationName"})
    return recommendations

In [None]:
def recommend_by_profile(user_profile, top_n=5):
    """Recommend destinations for a profile that has no visit history.

    A feature vector is built in the column space of
    `destination_content_encoded` and compared against every destination with
    cosine similarity.

    Parameters
    ----------
    user_profile : dict; only the ``"Preferences"`` entry (a comma-separated
        string of labels) is read.
    top_n : int, number of rows to return.

    Returns
    -------
    pandas.DataFrame with columns ``DestinationID``, ``DestinationName`` and
    ``Similarity``, sorted by similarity (descending).

    Raises
    ------
    KeyError if ``user_profile`` has no ``"Preferences"`` key.
    """
    # Build the feature matrix once; it drives both the column layout of the
    # profile vector and the similarity computation.
    destination_features = destination_content_encoded.drop(columns=["DestinationID", "Name_y"])

    # Start from zeros in the destination column order. The Type_* and
    # BestTimeToVisit_* entries stay at 0 because this function never sets them.
    user_features = pd.Series(0.0, index=destination_features.columns)

    # Multi-label preferences. Both sides are stripped before comparing, so
    # "A,B" and "A, B" behave the same and a column whose label carries stray
    # whitespace (e.g. "Pref_ Historical") is still matched.
    requested = {label.strip() for label in user_profile["Preferences"].split(",")}
    for col in destination_features.columns:
        if col.startswith("Pref_") and col[len("Pref_"):].strip() in requested:
            user_features[col] = 1.0

    # Mean popularity, so the profile is neutral on that feature.
    user_features["PopularityNormalizado"] = destination_content_encoded["PopularityNormalizado"].mean()

    # Single-row frame with the same columns (and order) as the destination features.
    user_vector = user_features.to_frame().T.astype(float)

    # Similarity of every destination to the profile vector.
    sim = cosine_similarity(destination_features, user_vector).flatten()

    recommendations = destination_content_encoded[["DestinationID", "Name_y"]].copy()
    recommendations["Similarity"] = sim
    recommendations = recommendations.sort_values("Similarity", ascending=False).head(top_n)
    recommendations = recommendations.rename(columns={"Name_y": "DestinationName"})
    return recommendations

In [None]:
# Hybrid, history-based recommendation for the user on the first row of df_copia.
sample_user = df_copia["UserID"].iat[0]
recs_hybrid = recommend_destinations(sample_user, top_n=3, alpha=0.6)
print("\n=== Recomendaciones híbridas ===", recs_hybrid, sep="\n")

# Profile-based recommendation for a hand-written profile.
user_profile = dict(
    Preferences="Nature, Adventure",
    Gender="Female",
    NumberOfAdults=2,
    NumberOfChildren=1,
)
recs_profile = recommend_by_profile(user_profile)
print("\n=== Recomendaciones basadas en perfil ===", recs_profile, sep="\n")



=== Recomendaciones híbridas ===
  DestinationName  HybridScore
1     Jaipur City     0.155404
0     Goa Beaches     0.155140
4       Taj Mahal     0.147836

=== Recomendaciones basadas en perfil ===
    DestinationID    DestinationName  Similarity
860           987        Goa Beaches    0.601030
517           468        Jaipur City    0.600173
31            114  Kerala Backwaters    0.599672
207           684  Kerala Backwaters    0.599624
84            130         Leh Ladakh    0.599397


In [None]:
import random

# -------------------------------
# Generador automático de perfiles
# -------------------------------
def generate_random_profiles(n_profiles=50, seed=None, possible_preferences=None):
    """Generate random user profiles for testing the profile-based recommender.

    Parameters
    ----------
    n_profiles : int, number of profiles to generate.
    seed : optional; when given, a private ``random.Random(seed)`` is used so
        the same seed always yields the same profiles. When ``None`` the
        module-level ``random`` functions are used, as before.
    possible_preferences : optional non-empty list of preference labels to
        draw from. Defaults to the built-in list below.
        NOTE(review): the data rows shown in this notebook use labels such as
        "Beaches" and "Historical", which the default list does not contain;
        consider passing the dataset's own vocabulary.

    Returns
    -------
    list of dict with keys ``Preferences`` (one or two labels joined by
    ``", "``), ``Gender``, ``NumberOfAdults`` and ``NumberOfChildren``.
    """
    if possible_preferences is None:
        possible_preferences = [
            "Nature", "Adventure", "Culture", "Relaxation", "Beach", "Gastronomy"
        ]
    genders = ["Female", "Male"]
    num_adults_options = [1, 2, 3]
    num_children_options = [0, 1, 2]

    # Private generator when seeded; otherwise the shared module-level one.
    rng = random if seed is None else random.Random(seed)
    # Never ask for more labels than the vocabulary holds.
    max_prefs = min(2, len(possible_preferences))

    profiles = []
    for _ in range(n_profiles):
        selected_prefs = rng.sample(possible_preferences, k=rng.randint(1, max_prefs))
        profiles.append({
            "Preferences": ", ".join(selected_prefs),
            "Gender": rng.choice(genders),
            "NumberOfAdults": rng.choice(num_adults_options),
            "NumberOfChildren": rng.choice(num_children_options)
        })

    return profiles

# -------------------------------
# Evaluación contra múltiples usuarios
# -------------------------------
def evaluate_profile_recommendations_against_multiple_users(user_profile, recommended_df, user_ids):
    """Score one recommendation list against the visit history of many users.

    For each user with at least one visited destination name in `df_copia`,
    precision is ``hits / number of recommended names`` and recall is
    ``hits / number of names the user visited``, where hits are the names
    present in both sets.

    Parameters
    ----------
    user_profile : not used by this function; kept so existing calls still work.
    recommended_df : DataFrame with a ``DestinationName`` column.
    user_ids : iterable of user ids to evaluate against.

    Returns
    -------
    (avg_precision, avg_recall): the means over the evaluated users, or
    ``(None, None)`` when no user had any history.
    """
    recommended_names = set(recommended_df["DestinationName"].values)

    # Group the history once instead of re-filtering the whole frame for every user.
    visited_by_user = {
        uid: set(names)
        for uid, names in df_copia.groupby("UserID")["Name_y"].unique().items()
    }

    precisions = []
    recalls = []

    for uid in user_ids:
        actual_destinations = visited_by_user.get(uid)
        if not actual_destinations:
            continue  # Skip users without history

        n_hits = len(recommended_names & actual_destinations)
        precisions.append(n_hits / len(recommended_names) if recommended_names else 0)
        # actual_destinations is non-empty here, so recall needs no guard.
        recalls.append(n_hits / len(actual_destinations))

    avg_precision = sum(precisions) / len(precisions) if precisions else None
    avg_recall = sum(recalls) / len(recalls) if recalls else None

    return avg_precision, avg_recall

# -------------------------------
# Generate the test profiles
# -------------------------------
# Seed the module-level RNG so the generated profiles, and therefore the
# metrics below, are the same on every Restart & Run All.
random.seed(42)
test_profiles = generate_random_profiles(n_profiles=50)

# -------------------------------
# Evaluate every profile
# -------------------------------
all_user_ids = df_copia["UserID"].unique()

all_precisions = []
all_recalls = []

for idx, profile in enumerate(test_profiles):
    # Generate recommendations for this profile
    recs_profile = recommend_by_profile(profile, top_n=5)

    # Evaluate against all real users
    avg_precision, avg_recall = evaluate_profile_recommendations_against_multiple_users(
        profile, recs_profile, all_user_ids
    )

    # The evaluator returns None when no user had any history; formatting None
    # with ':.3f' or summing it would raise, so skip such a profile.
    if avg_precision is None or avg_recall is None:
        print(f"Perfil {idx+1}: sin usuarios evaluables")
        continue

    all_precisions.append(avg_precision)
    all_recalls.append(avg_recall)

    print(f"Perfil {idx+1}: Precision={avg_precision:.3f}, Recall={avg_recall:.3f}")

# -------------------------------
# Global average metrics
# -------------------------------
# NaN (printed as "nan") when no profile could be evaluated, instead of a ZeroDivisionError.
overall_precision = sum(all_precisions) / len(all_precisions) if all_precisions else float("nan")
overall_recall = sum(all_recalls) / len(all_recalls) if all_recalls else float("nan")

print("\n=== Métricas de evaluación global (promedio sobre perfiles aleatorios) ===")
print(f"Precision promedio global: {overall_precision:.3f}")
print(f"Recall promedio global: {overall_recall:.3f}")


Perfil 1: Precision=0.282, Recall=0.802
Perfil 2: Precision=0.278, Recall=0.591
Perfil 3: Precision=0.278, Recall=0.591
Perfil 4: Precision=0.278, Recall=0.591
Perfil 5: Precision=0.282, Recall=0.802
Perfil 6: Precision=0.278, Recall=0.591
Perfil 7: Precision=0.282, Recall=0.802
Perfil 8: Precision=0.282, Recall=0.802
Perfil 9: Precision=0.282, Recall=0.802
Perfil 10: Precision=0.278, Recall=0.591
Perfil 11: Precision=0.278, Recall=0.591
Perfil 12: Precision=0.278, Recall=0.591
Perfil 13: Precision=0.278, Recall=0.591
Perfil 14: Precision=0.278, Recall=0.591
Perfil 15: Precision=0.278, Recall=0.591
Perfil 16: Precision=0.278, Recall=0.591
Perfil 17: Precision=0.278, Recall=0.591
Perfil 18: Precision=0.282, Recall=0.802
Perfil 19: Precision=0.278, Recall=0.591
Perfil 20: Precision=0.278, Recall=0.591
Perfil 21: Precision=0.278, Recall=0.591
Perfil 22: Precision=0.282, Recall=0.802
Perfil 23: Precision=0.278, Recall=0.591
Perfil 24: Precision=0.278, Recall=0.591
Perfil 25: Precision=0.27

In [None]:
# Shell escape: convert the notebook file 'Trabajo_3_punto_3.ipynb' to Markdown with nbconvert.
!jupyter nbconvert --to markdown 'Trabajo_3_punto_3.ipynb'

[NbConvertApp] Converting notebook Trabajo_3_punto_3.ipynb to markdown
[NbConvertApp] Writing 24237 bytes to Trabajo_3_punto_3.md
