In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the raw per-country table. The file carries no header row, so the
# three columns are labelled explicitly right after reading.
source_csv = "../data/top_artists_genres.csv"
column_labels = ["Country Code", "Top artists", "Top genres"]

df = pd.read_csv(source_csv, header=None, encoding="utf-8")
df.columns = column_labels
df.head()

Unnamed: 0,Country Code,Top artists,Top genres
0,at,"Jazeek, Bruno Mars, Billie Eilish, Linkin Park...","german hip hop, afro house, german pop, nu met..."
1,be,"Bruno Mars, Billie Eilish, GIMS, Gracie Abrams...","pop urbaine, french pop, afro house, chanson, ..."
2,dk,"Annika, Lamin, Gilli, Benny Jamz, Tobias Rahim...","dansktop, norwegian rap, west coast hip hop, n..."
3,fr,"GIMS, SDM, Bruno Mars, Stromae, Pomme, Arcane,...","pop urbaine, french pop, chanson, zouk, variét..."
4,de,"Jazeek, Luciano, Bruno Mars, Linkin Park, LACA...","german hip hop, german pop, afro house, nu met..."


In [4]:
# Only this many leading entries of each country's artist list are compared.
MAX_ARTISTS = 30


def _split_names(cell, limit=None):
    """Turn one ", "-separated CSV cell into a set of names.

    Parameters
    ----------
    cell : str or missing value
        Raw cell content with entries separated by ", ".
    limit : int, optional
        If given, only the first ``limit`` entries of the cell are considered.

    Returns
    -------
    set of str
        Distinct, whitespace-stripped, non-empty entries. A missing cell
        (NaN/None) yields an empty set instead of the bogus entry "nan", so
        countries with missing data do not appear to share an item.
    """
    if pd.isna(cell):
        return set()
    parts = str(cell).split(", ")[:limit]  # [:None] keeps every entry
    return {part.strip() for part in parts if part.strip()}


# Convert artists and genres into sets for easy comparison.
# Columns are addressed by position: 0 = country code, 1 = artists, 2 = genres.
df["Top Artists"] = df.iloc[:, 1].apply(lambda cell: _split_names(cell, MAX_ARTISTS))
df["Top Genres"] = df.iloc[:, 2].apply(lambda cell: _split_names(cell))

# Extract country codes
countries = df.iloc[:, 0].values
num_countries = len(countries)

artists_similarity = np.zeros((num_countries, num_countries), dtype=int)
genres_similarity = np.zeros((num_countries, num_countries), dtype=int)

# Pull the sets out of the frame once; repeated .iloc lookups inside the
# double loop are needlessly slow.
artist_sets = df["Top Artists"].tolist()
genre_sets = df["Top Genres"].tolist()

# Compute similarities: entry (i, j) is the number of shared items.
# The diagonal needs no special case because len(s & s) == len(s).
for i in range(num_countries):
    for j in range(i, num_countries):  # Only compute upper triangle (symmetry)
        artists_similarity[i, j] = artists_similarity[j, i] = len(artist_sets[i] & artist_sets[j])
        genres_similarity[i, j] = genres_similarity[j, i] = len(genre_sets[i] & genre_sets[j])


# Persist the raw overlap counts, labelled by country code on both axes.
artists_similarity_df = pd.DataFrame(data=artists_similarity, index=countries, columns=countries)
artists_similarity_df.to_csv("../data/artists_similarity.csv", encoding="utf-8")

genres_similarity_df = pd.DataFrame(data=genres_similarity, index=countries, columns=countries)
genres_similarity_df.to_csv("../data/genres_similarity.csv", encoding="utf-8")

print("Similarity matrices saved.")

Similarity matrices saved.


In [5]:
# Sanity check: show what the first two countries in the table have in common.
country_0, country_1 = countries[0], countries[1]

shared_artists = df["Top Artists"].iloc[0].intersection(df["Top Artists"].iloc[1])
shared_genres = df["Top Genres"].iloc[0].intersection(df["Top Genres"].iloc[1])

print(f"Shared artists between {country_0} and {country_1}:")
print(shared_artists)

print(f"\nShared genres between {country_0} and {country_1}:")
print(shared_genres)



Shared artists between at and be:
{'Gigi Perez', 'HUGEL', 'ROSÉ', 'David Guetta', 'Billie Eilish', 'Lady Gaga', 'The Weeknd', 'Linkin Park', 'Myles Smith', 'Chappell Roan', 'Bruno Mars', 'Gracie Abrams'}

Shared genres between at and be:
{'edm', 'tropical house', 'melodic house', 'pop', 'art pop', 'tech house', 'afro house', 'rap metal', 'alternative metal', 'k-pop', 'dance', 'soft pop', 'brazilian pop', 'funk pop', 'house', 'rage rap', 'rock', 'nu metal', 'latin house'}


In [7]:
# Jaccard similarity per country pair: shared items / items in either set.
# Starting from the identity matrix puts 1.0 on the diagonal (a country
# compared with itself), so only off-diagonal pairs are filled in below.
normalized_artists_similarity = np.eye(num_countries)
normalized_genres_similarity = np.eye(num_countries)

for i in range(num_countries):
    for j in range(i + 1, num_countries):  # strictly above the diagonal; mirrored below
        artist_union_size = len(df["Top Artists"].iloc[i].union(df["Top Artists"].iloc[j]))
        genre_union_size = len(df["Top Genres"].iloc[i].union(df["Top Genres"].iloc[j]))

        # An empty union means both sets are empty; score the pair as 0.
        if artist_union_size > 0:
            artist_score = artists_similarity[i, j] / artist_union_size
        else:
            artist_score = 0

        if genre_union_size > 0:
            genre_score = genres_similarity[i, j] / genre_union_size
        else:
            genre_score = 0

        normalized_artists_similarity[i, j] = artist_score
        normalized_artists_similarity[j, i] = artist_score
        normalized_genres_similarity[i, j] = genre_score
        normalized_genres_similarity[j, i] = genre_score

# Persist the normalised matrices, labelled by country code on both axes.
normalized_artists_similarity_df = pd.DataFrame(
    data=normalized_artists_similarity, index=countries, columns=countries
)
normalized_artists_similarity_df.to_csv("../data/normalized_artists_similarity.csv", encoding="utf-8")

normalized_genres_similarity_df = pd.DataFrame(
    data=normalized_genres_similarity, index=countries, columns=countries
)
normalized_genres_similarity_df.to_csv("../data/normalized_genres_similarity.csv", encoding="utf-8")