# Hybrid music recommender system

In [None]:
import numpy as np
import pandas as pd

### Importing the data

In [2]:
# Load the two source tables from the Kaggle "million-song-dataset-spotify-lastfm" input.
# NOTE(review): these absolute /kaggle/input paths presumably only resolve inside a
# Kaggle kernel - confirm before running elsewhere.
# users_history: per-user listening rows (user_id, track_id and playcount are used below).
users_history = pd.read_csv(
    "/kaggle/input/million-song-dataset-spotify-lastfm/User Listening History.csv"
)

# music_info: per-track table supplying the tags and audio-feature columns used below.
music_info = pd.read_csv(
    "/kaggle/input/million-song-dataset-spotify-lastfm/Music Info.csv"
)

In [3]:
# (rows, columns) of the listening-history table.
users_history.shape

In [4]:
# (rows, columns) of the track-metadata table.
music_info.shape

In [5]:
# Column names, dtypes and non-null counts of the track-metadata table.
music_info.info()

In [6]:
# Preview the listening-history rows at positions 1 to 3.
users_history[1:4]

In [7]:
# Preview the track-metadata rows at positions 1 to 3.
music_info[1:4]

### Testing synthetic data generation with Spotify demographic-based info

In [8]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

### Step 1: First cluster the songs based on audio features

In [9]:
# Select audio features for clustering.
# These music_info columns are standardised together with the genre_* flags
# and fed to the song-level KMeans further down in this cell.
audio_features = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]


# Extract genre information
def extract_main_genres(tag_string):
    """Map a comma-separated tag string onto the notebook's coarse genre list.

    A genre is reported when its name occurs as a substring of at least one
    lower-cased tag, so "indie rock" yields both "rock" and "indie".

    Args:
        tag_string: Raw tag text, or a missing value (None / NaN).

    Returns:
        [] when ``tag_string`` is missing; otherwise the matched genres in
        catalogue order, or ["other"] when no genre matched.
    """
    if pd.isna(tag_string):
        return []

    catalogue = (
        "rock",
        "pop",
        "hip hop",
        "rap",
        "electronic",
        "dance",
        "jazz",
        "classical",
        "r&b",
        "country",
        "indie",
        "folk",
    )
    pieces = str(tag_string).lower().split(",")

    # Substring test against every tag; catalogue order fixes the output order.
    matched = [
        name for name in catalogue if any(name in piece for piece in pieces)
    ]
    if not matched:
        return ["other"]
    return matched


# Add genre features to music_info
music_info["main_genres"] = music_info["tags"].apply(extract_main_genres)

# One-hot encode main genres: one 0/1 column per label, with "other" as the
# fallback label for tracks that matched nothing.
genre_labels = (
    "rock",
    "pop",
    "hip hop",
    "rap",
    "electronic",
    "dance",
    "jazz",
    "classical",
    "r&b",
    "country",
    "indie",
    "folk",
    "other",
)
for label in genre_labels:
    # Bind the label as a default argument so each lambda tests its own genre.
    music_info[f"genre_{label}"] = music_info["main_genres"].apply(
        lambda found, label=label: int(label in found)
    )

# Combine audio features and genre information for clustering
genre_features = [name for name in music_info.columns if name.startswith("genre_")]
features_to_scale = audio_features + genre_features
music_features_df = music_info[["track_id", *features_to_scale]].copy()

# Normalize features for song clustering (track_id is carried along unscaled)
scaler = StandardScaler()
scaled_values = scaler.fit_transform(music_features_df[features_to_scale])
music_features_df_scaled = music_features_df.copy()
music_features_df_scaled[features_to_scale] = scaled_values

# Cluster songs (we'll create 20 song clusters)
n_song_clusters = 20
song_kmeans = KMeans(n_clusters=n_song_clusters, random_state=42, n_init=10)
song_kmeans.fit(music_features_df_scaled[features_to_scale])
music_features_df_scaled["song_cluster"] = song_kmeans.labels_

# Lookup table track_id -> song_cluster, merged into the listening history later
song_clusters = music_features_df_scaled[["track_id", "song_cluster"]]

# Analyze the song clusters to understand what each represents:
# size, three most frequent genres, and mean of the unscaled audio features.
cluster_profile = []
assigned_cluster = music_features_df_scaled["song_cluster"]
for cluster_id in range(n_song_clusters):
    cluster_tracks = music_features_df.loc[assigned_cluster == cluster_id]

    # Most common genres: the genre_* flags are 0/1, so their sums are track counts
    genre_counts = cluster_tracks[genre_features].sum()
    top_genres = [
        column.replace("genre_", "")
        for column in genre_counts.nlargest(3).index.tolist()
    ]

    # Identification fields first, then one entry per averaged audio feature
    profile_row = {
        "cluster_id": cluster_id,
        "size": len(cluster_tracks),
        "top_genres": top_genres,
    }
    profile_row.update(cluster_tracks[audio_features].mean().to_dict())
    cluster_profile.append(profile_row)

song_cluster_profile = pd.DataFrame(cluster_profile)
print("Song Cluster Profiles:")
summary_columns = ["cluster_id", "size", "top_genres", "danceability", "energy", "valence"]
print(song_cluster_profile[summary_columns].head())

### Step 2: Create user preference distributions across song clusters


In [10]:
# Attach to every listening row the cluster of the track that was played
user_song_clusters = users_history.merge(song_clusters, on="track_id")

# Create distribution of song clusters for each user (weighted by playcount)
plays_by_user_and_cluster = user_song_clusters.groupby(["user_id", "song_cluster"])
user_cluster_distributions = plays_by_user_and_cluster["playcount"].sum().reset_index()

# Per-user play total, used as the denominator of the percentages
total_plays = (
    user_cluster_distributions.groupby("user_id")["playcount"]
    .sum()
    .reset_index()
    .rename(columns={"playcount": "total_playcount"})
)
user_cluster_distributions = user_cluster_distributions.merge(total_plays, on="user_id")
play_share = (
    user_cluster_distributions["playcount"]
    / user_cluster_distributions["total_playcount"]
)
user_cluster_distributions["percentage"] = play_share * 100

# Create a pivot table to get user distribution vectors:
# one row per user, one column per song cluster, 0 where the user has no plays
user_vectors = user_cluster_distributions.pivot_table(
    index="user_id", columns="song_cluster", values="percentage", fill_value=0
).reset_index()

### Step 3: Calculate diversity metrics for each user


In [11]:
def calculate_diversity(row):
    """Shannon entropy (in bits) of one user's play distribution over song clusters.

    Args:
        row: One row of ``user_vectors``. Entries whose label is an integer
            (the song-cluster id) hold the percentage of the user's plays in
            that cluster. If the row has no integer labels, every entry after
            the first one is treated as a percentage instead.

    Returns:
        float: Entropy of the positive shares, never negative; 0.0 when no
        share is positive.
    """
    # Pick the cluster shares by label instead of by position, so columns added
    # later (e.g. "listening_diversity", "user_cluster") cannot leak into the
    # entropy when the cell is re-run.
    labelled = row.items() if hasattr(row, "items") else ()
    shares = [
        value
        for label, value in labelled
        if isinstance(label, (int, np.integer)) and not isinstance(label, bool)
    ]
    if not shares:
        # No integer labels: fall back to the positional layout (id first).
        tail = row.iloc[1:] if hasattr(row, "iloc") else row[1:]
        shares = list(tail)

    # Convert percentages to probabilities and keep only the non-zero ones.
    probs = np.asarray(shares, dtype=float) / 100.0
    probs = probs[probs > 0]
    if probs.size == 0:
        return 0.0

    # Zero shares are already filtered out, so log2 needs no epsilon; adding one
    # made the entropy of a single-cluster user slightly negative.
    entropy = -float(np.sum(probs * np.log2(probs)))
    return max(0.0, entropy)  # also turns -0.0 into 0.0


# Calculate listening diversity for each user: calculate_diversity is applied
# row by row (axis=1) and the result is stored as a new column.
listening_diversity = user_vectors.apply(calculate_diversity, axis=1)
user_vectors["listening_diversity"] = listening_diversity

### Step 4: Cluster users based on their listening distributions


In [12]:
# Integer-labelled columns are the per-song-cluster percentages from the pivot
user_feature_cols = [label for label in user_vectors.columns if isinstance(label, int)]

# We'll create 8 user clusters from the standardised cluster shares
n_user_clusters = 8
user_scaler = StandardScaler()
user_scaled_features = user_scaler.fit_transform(user_vectors[user_feature_cols])

user_kmeans = KMeans(n_clusters=n_user_clusters, random_state=42, n_init=10)
user_kmeans.fit(user_scaled_features)
user_vectors["user_cluster"] = user_kmeans.labels_

# Analyze user clusters to understand what each represents
user_cluster_profiles = []
for cluster_id in range(n_user_clusters):
    cluster_users = user_vectors[user_vectors["user_cluster"] == cluster_id]

    # Get average distribution across song clusters
    avg_distribution = cluster_users[user_feature_cols].mean().to_dict()

    # Find top song clusters for this user cluster (highest mean share first)
    top_3_song_clusters = sorted(
        avg_distribution.items(), key=lambda x: x[1], reverse=True
    )[:3]

    # Map those to the song cluster profiles
    top_genres = []
    for song_cluster, percentage in top_3_song_clusters:
        top_genres.extend(
            song_cluster_profile.loc[
                song_cluster_profile["cluster_id"] == song_cluster, "top_genres"
            ].iloc[0]
        )

    # Average diversity
    avg_diversity = cluster_users["listening_diversity"].mean()

    user_cluster_profiles.append(
        {
            "user_cluster_id": cluster_id,
            "size": len(cluster_users),
            "top_song_clusters": [sc[0] for sc in top_3_song_clusters],
            # dict.fromkeys de-duplicates while keeping first-seen order; a set
            # has hash-dependent order, which made this list (and the CSV it is
            # later written to) differ between runs.
            "top_genres": list(dict.fromkeys(top_genres)),
            "avg_diversity": avg_diversity,
        }
    )

user_cluster_profile_df = pd.DataFrame(user_cluster_profiles)
print("\nUser Cluster Profiles:")
print(user_cluster_profile_df.head())

### Step 5: Assign demographics based on user clusters and known Spotify demographics

In [None]:
# Define demographic distributions.
# Each entry pairs an inclusive (min_age, max_age) range with its base share.
# The shares add up to 0.9992 rather than 1; adjust_demographics_by_genres
# renormalises the age weights before they are used for sampling.
_age_table = [
    ((18, 24), 0.3151),  # 31.51%
    ((25, 34), 0.3141),  # 31.41%
    ((35, 44), 0.15),  # ~15%
    ((45, 54), 0.10),  # ~10%
    ((55, 64), 0.08),  # ~8%
    ((65, 100), 0.04),  # ~4%
]
age_groups = [bounds for bounds, _ in _age_table]

# Base probabilities from Spotify data
base_age_probabilities = [share for _, share in _age_table]
base_gender_probabilities = {
    "Female": 0.56,
    "Male": 0.44,
}
base_region_probabilities = {
    "Europe": 0.27,
    "North America": 0.28,
    "Latin America": 0.22,
    "Rest of World": 0.23,
}


# Adjust demographics based on user cluster characteristics
# This is where we encode the correlations between music taste and demographics
def adjust_demographics_by_genres(cluster_profile):
    """Tilt the base demographic probabilities according to a cluster's genres.

    The multipliers are heuristics; the notebook attributes them to research on
    music preferences by demographic, but no source is cited here.

    Args:
        cluster_profile: Mapping-like row providing "top_genres" (a collection
            of genre names) and "avg_diversity" (a number).

    Returns:
        tuple: ``(age_adj, gender_adj, region_adj)``. ``age_adj`` is a NumPy
        array aligned with ``age_groups``; the other two are dicts with the
        same keys as the base dictionaries. Each one sums to 1.
    """
    genres = cluster_profile["top_genres"]

    def likes_any(*names):
        # True when at least one of the given genres is among the cluster's top genres
        return any(name in genres for name in names)

    def rescale(weights):
        # Normalise a {label: weight} dict so that its values sum to 1
        values = np.array(list(weights.values()))
        values = values / values.sum()
        return dict(zip(weights.keys(), values))

    # Work on copies so the module-level base tables are never modified
    age_adj = list(base_age_probabilities)
    gender_adj = dict(base_gender_probabilities)
    region_adj = dict(base_region_probabilities)

    # Age correlations
    if likes_any("hip hop", "rap", "dance", "electronic"):
        age_adj[0] *= 1.3  # 18-24
        age_adj[1] *= 1.2  # 25-34
    # NOTE(review): extract_main_genres never emits "alternative" (or "latin",
    # checked below), so those names only match if top_genres comes from elsewhere.
    if likes_any("rock", "indie", "alternative"):
        age_adj[1] *= 1.15  # 25-34
        age_adj[2] *= 1.2  # 35-44
    if likes_any("jazz", "classical", "folk"):
        age_adj[3] *= 1.3  # 45-54
        age_adj[4] *= 1.3  # 55-64
        age_adj[5] *= 1.5  # 65+

    # Gender correlations
    if likes_any("hip hop", "rap", "rock"):
        gender_adj["Male"] *= 1.15
    if likes_any("pop", "dance"):
        gender_adj["Female"] *= 1.1

    # Region correlations
    if likes_any("hip hop", "rap", "r&b"):
        region_adj["North America"] *= 1.2
    if likes_any("electronic", "dance"):
        region_adj["Europe"] *= 1.2
    if likes_any("latin"):
        region_adj["Latin America"] *= 1.5

    # High-diversity clusters get a fixed, flatter age profile that replaces
    # every genre-based age adjustment made above.
    if cluster_profile["avg_diversity"] > 2.5:
        age_adj = [0.25, 0.25, 0.15, 0.15, 0.1, 0.1]

    # Normalize to ensure probabilities sum to 1
    age_adj = np.array(age_adj)
    age_adj = age_adj / age_adj.sum()

    return age_adj, rescale(gender_adj), rescale(region_adj)


# Create final user demographics
countries_by_region = {
    "Europe": [
        "UK",
        "Germany",
        "France",
        "Spain",
        "Italy",
        "Sweden",
        "Netherlands",
        "Poland",
    ],
    "North America": ["USA", "Canada"],
    "Latin America": ["Brazil", "Mexico", "Argentina", "Colombia", "Chile"],
    "Rest of World": [
        "Japan",
        "Australia",
        "India",
        "South Korea",
        "South Africa",
        "UAE",
    ],
}

# Seeded generator so the synthetic demographics are identical on every run
# (the same seed the KMeans models use).
demographics_rng = np.random.default_rng(42)

user_demographics = []

# The loop variable is named user_cluster_row so it does not overwrite the
# song-cluster list `cluster_profile` built earlier in the notebook.
for _, user_cluster_row in user_cluster_profile_df.iterrows():
    # Users of this cluster together with their precomputed diversity; fetching
    # both columns once avoids a full-table scan per user inside the loop.
    members = user_vectors.loc[
        user_vectors["user_cluster"] == user_cluster_row["user_cluster_id"],
        ["user_id", "listening_diversity"],
    ]

    # Adjust demographic probabilities based on cluster profile
    age_probs, gender_probs, region_probs = adjust_demographics_by_genres(
        user_cluster_row
    )
    gender_labels = list(gender_probs.keys())
    gender_weights = list(gender_probs.values())
    region_labels = list(region_probs.keys())
    region_weights = list(region_probs.values())

    # Assign demographics to each user in the cluster
    for user_id, user_diversity in zip(
        members["user_id"].values, members["listening_diversity"].values
    ):
        # Age: pick a bracket, then a uniform integer age inside it (inclusive)
        age_group_idx = demographics_rng.choice(len(age_groups), p=age_probs)
        min_age, max_age = age_groups[age_group_idx]
        age = int(demographics_rng.integers(min_age, max_age + 1))

        # Gender
        gender = str(demographics_rng.choice(gender_labels, p=gender_weights))

        # Region
        region = str(demographics_rng.choice(region_labels, p=region_weights))

        # Country within region (uniform)
        country = str(demographics_rng.choice(countries_by_region[region]))

        # Listening hours (correlated with age and gender)
        if age < 25:
            base_hours = 25 + demographics_rng.normal(5, 3)
        elif age < 35:
            base_hours = 20 + demographics_rng.normal(4, 3)
        elif age < 45:
            base_hours = 15 + demographics_rng.normal(5, 2)
        else:
            base_hours = 10 + demographics_rng.normal(5, 2)

        # Gen Z women tend to listen more (30 hrs vs 24 hrs for men)
        if age < 25 and gender == "Female":
            base_hours *= 1.2

        monthly_hours = max(5, min(60, base_hours))  # Cap between 5 and 60 hours

        user_demographics.append(
            {
                "user_id": user_id,
                "age": age,
                "gender": gender,
                "region": region,
                "country": country,
                "monthly_hours": monthly_hours,
                "user_cluster": user_cluster_row["user_cluster_id"],
                "listening_diversity": user_diversity,
                "top_genres": user_cluster_row["top_genres"],
            }
        )

# Create final demographics dataframe
user_demographics_df = pd.DataFrame(user_demographics)

# Generate some visualizations for demographic distribution (2 x 2 grid)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Age distribution
sns.histplot(user_demographics_df["age"], bins=20, ax=axes[0, 0])
axes[0, 0].set_title("Age Distribution")

# Gender distribution
sns.countplot(x="gender", data=user_demographics_df, ax=axes[0, 1])
axes[0, 1].set_title("Gender Distribution")

# Region distribution
sns.countplot(y="region", data=user_demographics_df, ax=axes[1, 0])
axes[1, 0].set_title("Region Distribution")

# Listening hours by age group.
# pd.cut bins are right-closed, so without include_lowest=True the first bin is
# (18, 24] and every 18-year-old would get a missing age_group.
user_demographics_df["age_group"] = pd.cut(
    user_demographics_df["age"],
    [18, 24, 34, 44, 54, 64, 100],
    labels=["18-24", "25-34", "35-44", "45-54", "55-64", "65+"],
    include_lowest=True,
)
sns.boxplot(x="age_group", y="monthly_hours", data=user_demographics_df, ax=axes[1, 1])
axes[1, 1].set_title("Listening Hours by Age Group")

fig.tight_layout()
fig.savefig("demographic_distributions.png")

# Save the final synthetic demographics
user_demographics_df.to_csv("rich_synthetic_user_demographics.csv", index=False)

print(f"\nCreated rich synthetic demographics for {len(user_demographics_df)} users")
print(user_demographics_df.head())

# Additional analysis - show listening patterns by demographic
print("\nAverage listening diversity by age group:")
diversity_by_age_group = user_demographics_df.groupby("age_group")[
    "listening_diversity"
].mean()
print(diversity_by_age_group)

# Users per (gender, user cluster), largest clusters first within each gender
print("\nTop user clusters by gender:")
cluster_sizes_by_gender = (
    user_demographics_df.groupby(["gender", "user_cluster"])
    .size()
    .reset_index(name="count")
)
print(cluster_sizes_by_gender.sort_values(["gender", "count"], ascending=[True, False]))

### Grouping the songs of each user

In [8]:
# user_id -> list of (track_id, playcount) pairs, one pair per listening row
user_song_list = {
    listener: list(zip(plays['track_id'], plays['playcount']))
    for listener, plays in users_history.groupby('user_id', observed=True)
}

In [9]:
# Show the first five users and their (track_id, playcount) lists.
dict(list(user_song_list.items())[:5])

### Removing the users who have listened to fewer than 50 songs

In [10]:
# Keep only users with at least 50 (track_id, playcount) entries
user_song_list = {
    listener: plays
    for listener, plays in user_song_list.items()
    if len(plays) >= 50
}

In [11]:
# Show the first two users that are still in user_song_list.
dict(list(user_song_list.items())[:2])

In [12]:
# Number of users currently in user_song_list.
len(user_song_list)

### Removing the data related to users that have listened to fewer than 50 songs

In [13]:
# Keep only the listening rows whose user is still a key of user_song_list
is_retained_user = users_history['user_id'].isin(user_song_list.keys())
users_history = users_history[is_retained_user]
users_history.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix,coo_matrix
from annoy import AnnoyIndex

### Selecting the columns that represent the numerical values for each song

In [None]:
# music_info columns used as the content-based feature vector of each track
# (standardised in the next cell and then indexed with Annoy). This is the
# audio_features list from the clustering section without 'key' and 'mode'.
feature_columns = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                   'instrumentalness', 'liveness', 'valence', 'tempo']
#numerical_features = music_info[feature_columns].values



### Normalizing the features

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column; rows stay in music_info order.
# Note that this rebinds `scaler`, which earlier held the song-clustering scaler.
content_features = music_info[feature_columns]
scaler = StandardScaler().fit(content_features)
normalized_numerical_features = scaler.transform(content_features)

### Building the annoy index

In [None]:
# One Annoy item per row of the feature matrix; the item id is the row position
n_tracks, feature_length = normalized_numerical_features.shape
annoy_index = AnnoyIndex(feature_length, 'angular')

for idx in range(n_tracks):
    annoy_index.add_item(idx, normalized_numerical_features[idx])

# 50 is the number of trees passed to build()
annoy_index.build(50)

### The 6 closest songs to the first song

In [None]:
# Ask for the 7 nearest items to item 0 and drop the first result, leaving 6 ids.
# NOTE(review): this presumably relies on item 0 being returned as its own
# nearest neighbour - confirm.
annoy_index.get_nns_by_item(0, 7)[1:]

### Splitting the data

In [None]:
# Random 80/20 split of individual listening rows (not of users), with a fixed
# seed; the same user can therefore have rows in both train and test.
train, test = train_test_split(users_history, test_size=0.2, random_state=42)

### Creating mappings so we can find a user_id or track_id by index and vice versa

In [None]:
# For collaborative filtering (using users_history)
# Work on an explicit copy: `train` was sliced out of users_history by
# train_test_split, and assigning columns on such a slice can trigger pandas'
# SettingWithCopyWarning.
train = train.copy()
train['user_id'] = train['user_id'].astype('category')
train['track_id'] = train['track_id'].astype('category')

# Create mappings for user_id and track_id.
# The user-item matrix is indexed by category codes, and a code is the position
# of the value in `.cat.categories`, so enumerating the categories is correct here.
cf_user_id_mapping = dict(enumerate(train['user_id'].cat.categories))
cf_track_id_mapping = dict(enumerate(train['track_id'].cat.categories))
cf_user_id_reverse_mapping = {v: k for k, v in cf_user_id_mapping.items()}
cf_track_id_reverse_mapping = {v: k for k, v in cf_track_id_mapping.items()}

# For content-based filtering (using music_info)
music_info['track_id'] = music_info['track_id'].astype('category')

# Annoy items were added with idx = row position in music_info, so this mapping
# must follow row order. Enumerating `.cat.categories` (the sorted unique ids)
# only matches that order if music_info happens to be sorted by track_id;
# otherwise neighbours would be looked up for, and translated to, the wrong tracks.
cb_track_id_mapping = dict(enumerate(music_info['track_id']))
cb_track_id_reverse_mapping = {v: k for k, v in cb_track_id_mapping.items()}



### Using SVD

In [None]:
# Create Sparse User-Item Interaction Matrix:
# rows are user category codes, columns are track category codes, values are playcounts
user_codes = train['user_id'].cat.codes
track_codes = train['track_id'].cat.codes
user_item_sparse = coo_matrix((train['playcount'], (user_codes, track_codes)))

# Apply SVD on the Sparse Matrix (10 latent components)
svd = TruncatedSVD(n_components=10, random_state=42)
user_factors = svd.fit_transform(user_item_sparse)  # one row per user code
item_factors = svd.components_.T  # one row per track code

In [None]:
# Shape of the interaction matrix as inferred by coo_matrix from the user and track codes.
user_item_sparse.shape

In [None]:
def recommend_songs_hybrid(user_id, user_item_matrix, user_factors, item_factors, music_info, annoy_index, n_recommendations=5):
    """Recommend tracks from SVD scores, expanded with audio-feature neighbours.

    The user's ``n_recommendations`` highest-scoring tracks are taken from the
    latent factors, then up to three Annoy neighbours of each are appended.

    Relies on the notebook-level mappings ``cf_user_id_reverse_mapping``,
    ``cf_track_id_mapping``, ``cb_track_id_reverse_mapping`` and
    ``cb_track_id_mapping``.

    Args:
        user_id: Identifier looked up in ``cf_user_id_reverse_mapping``.
        user_item_matrix: Not used by this function; kept for interface compatibility.
        user_factors: Array with one row of latent factors per user code.
        item_factors: Array with one row of latent factors per track code.
        music_info: Not used by this function; kept for interface compatibility.
        annoy_index: Index queried with ``get_nns_by_item``.
        n_recommendations: Number of collaborative-filtering tracks to start from.

    Returns:
        list: Track ids without duplicates, collaborative-filtering picks first
        (best score first) followed by their neighbours. Empty if the user is
        unknown.
    """
    # Check if the user_id exists in the mapping
    user_code = cf_user_id_reverse_mapping.get(user_id)
    if user_code is None:
        print(f"User ID {user_id} not found in the user-item matrix.")
        return []

    # Collaborative filtering: score every track for this user, keep the best ones
    cf_predictions = np.dot(user_factors[user_code, :], item_factors.T)
    top_codes = np.argsort(cf_predictions)[::-1][:n_recommendations]
    cf_recommended_tracks = [cf_track_id_mapping[code] for code in top_codes]

    # Content-based filtering: audio neighbours of each collaborative pick
    cb_recommended_tracks = []
    for track in cf_recommended_tracks:
        track_code = cb_track_id_reverse_mapping.get(track)
        if track_code is None:
            continue
        # Request 4 items and drop the query item by id (not by position), so a
        # tie or duplicate vector cannot cause a real neighbour to be discarded;
        # at most 3 neighbours are kept.
        neighbours = [
            i for i in annoy_index.get_nns_by_item(track_code, 4) if i != track_code
        ][:3]
        for i in neighbours:
            neighbour_track = cb_track_id_mapping.get(i)
            if neighbour_track is None:
                print(f"KeyError: Index {i} not found in cb_track_id_mapping")
                continue
            cb_recommended_tracks.append(neighbour_track)

    # dict.fromkeys removes duplicates while keeping the ranking order; a set
    # would return the tracks in an arbitrary, run-dependent order.
    return list(dict.fromkeys(cf_recommended_tracks + cb_recommended_tracks))

In [None]:
def evaluate_model_hybrid_user(user, user_test_data, user_train_data, user_item_matrix, user_factors, item_factors, music_info, annoy_index, n_recommendations=5):
    """Precision and recall of the hybrid recommendations for a single user.

    Args:
        user: User id to evaluate.
        user_test_data: Dict mapping user id to the held-out track ids.
        user_train_data: Dict mapping user id to the training track ids; only
            membership of ``user`` is checked.
        user_item_matrix, user_factors, item_factors, music_info, annoy_index,
        n_recommendations: Passed through to ``recommend_songs_hybrid``.

    Returns:
        tuple: ``(precision, recall)``. Both are 0.0 when the user is missing
        from either dictionary.
    """
    # A user without training rows cannot be scored, and a user without test
    # rows has nothing to score against (previously this raised KeyError).
    if user not in user_train_data or user not in user_test_data:
        return 0.0, 0.0

    true_tracks = user_test_data[user]
    recommended_tracks = recommend_songs_hybrid(user, user_item_matrix, user_factors, item_factors, music_info, annoy_index, n_recommendations)

    # Calculate precision and recall
    true_positives = len(set(recommended_tracks) & set(true_tracks))
    precision = true_positives / len(recommended_tracks) if recommended_tracks else 0.0
    recall = true_positives / len(true_tracks) if true_tracks else 0.0
    return precision, recall
            
        

In [None]:
def evaluate_model_hybrid(user_test_data, user_train_data, user_item_matrix, user_factors, item_factors, music_info, annoy_index, n_recommendations=5):
    """Average precision and recall of the hybrid recommender over test users.

    Only users present in both ``user_test_data`` and ``user_train_data`` are
    evaluated.

    Args:
        user_test_data: Dict mapping user id to the held-out track ids.
        user_train_data: Dict mapping user id to the training track ids; only
            membership is checked.
        user_item_matrix, user_factors, item_factors, music_info, annoy_index,
        n_recommendations: Passed through to ``recommend_songs_hybrid``.

    Returns:
        tuple: ``(avg_precision, avg_recall)`` as floats; ``(0.0, 0.0)`` when
        no user could be evaluated.
    """
    precisions = []
    recalls = []

    for user, true_tracks in user_test_data.items():
        if user not in user_train_data:
            continue
        recommended_tracks = recommend_songs_hybrid(user, user_item_matrix, user_factors, item_factors, music_info, annoy_index, n_recommendations)

        # Calculate precision and recall
        true_positives = len(set(recommended_tracks) & set(true_tracks))
        precisions.append(true_positives / len(recommended_tracks) if recommended_tracks else 0.0)
        recalls.append(true_positives / len(true_tracks) if true_tracks else 0.0)

    # np.mean of an empty list is nan (with a RuntimeWarning); report zeros instead.
    if not precisions:
        return 0.0, 0.0

    avg_precision = float(np.mean(precisions))
    avg_recall = float(np.mean(recalls))

    return avg_precision, avg_recall

In [None]:
# user_id -> list of track_ids, built separately for the train and test rows
train_tracks_by_user = train.groupby('user_id', observed=True)['track_id']
test_tracks_by_user = test.groupby('user_id', observed=True)['track_id']

user_train_data = train_tracks_by_user.apply(list).to_dict()
user_test_data = test_tracks_by_user.apply(list).to_dict()

In [None]:
#user_train_data["0000f88f8d76a238c251450913b0d070e4a77d19"]

In [None]:
#precision, recall = evaluate_model_hybrid_user("0000f88f8d76a238c251450913b0d070e4a77d19", user_test_data, user_train_data, user_item_sparse, user_factors, item_factors, music_info, annoy_index, n_recommendations=5)
#print(f"Average Precision: {precision}")
#print(f"Average Recall: {recall}")

In [None]:
#recommend_songs_hybrid("0000f88f8d76a238c251450913b0d070e4a77d19", user_item_sparse, user_factors, item_factors, music_info, annoy_index, n_recommendations=5)

### Evaluating the model

In [None]:
# Evaluate on every test user that also has training rows, starting from the
# 11 best collaborative-filtering tracks per user.
precision, recall = evaluate_model_hybrid(
    user_test_data,
    user_train_data,
    user_item_sparse,
    user_factors,
    item_factors,
    music_info,
    annoy_index,
    n_recommendations=11,
)
print(f"Average Precision: {precision}")
print(f"Average Recall: {recall}")