In [None]:
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Read the tab-separated hits file and add a constant label column hit = 1.
df_hits = pd.read_csv("additional/hits_dataset.csv", sep="\t").assign(hit=1)
df_hits

In [None]:
# Read the tab-separated non-hits file and add a constant label column hit = 0.
df_nonhits = pd.read_csv("additional/nonhits_dataset.csv", sep="\t").assign(hit=0)
df_nonhits

In [None]:
# Stack the hit and non-hit rows into one frame and turn every "-" entry into NaN
# so that it is picked up by the NaN checks that follow. No rows are dropped here.
combined_df = pd.concat([df_hits, df_nonhits], axis=0).replace("-", float("nan"))
combined_df

In [None]:
# Collect the names of all columns that hold at least one NaN.
has_nan = combined_df.isna().any()
nan_columns = has_nan[has_nan].index.tolist()

# Show the affected columns (header line, then the list).
print("Columns with NaN values:", nan_columns, sep="\n")


In [None]:
from sklearn.impute import SimpleImputer

# Turn "-" entries into NaN (the same replacement is applied when the frames are combined).
# TODO (translated from the original note): possibly do the mean replacement at this point already?
combined_df = combined_df.replace("-", float("nan"))

# Fill the gaps of every column listed in nan_columns with that column's mean.
# The alternative of dropping incomplete rows (dropna) is deliberately not used.
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split further down -- confirm this is intended.
imputer = SimpleImputer(strategy="mean")
for nan_col in nan_columns:
    combined_df[nan_col] = imputer.fit_transform(combined_df[[nan_col]])

combined_df

In [None]:
# Recompute which columns still hold at least one NaN.
has_nan = combined_df.isna().any()
nan_columns = has_nan[has_nan].index.tolist()

# Show the affected columns (header line, then the list).
print("Columns with NaN values:", nan_columns, sep="\n")


In [None]:
# Not used in this cell (the split below is date-based); the import is kept in
# case other cells rely on it.
from sklearn.model_selection import train_test_split

# Chronological split on the release date: keep only the six years ending at the
# most recent release, then hold out the final year of that window as the test set.
combined_df['date'] = pd.to_datetime(combined_df['release_date'])

# Rows are iterated in this order when the graphs are built, so keep them sorted by date.
combined_df = combined_df.sort_values(by="date")

# max() ignores NaT. Reading the last row of the sorted frame instead would pick
# up a NaT (NaT sorts last) and make every comparison below False, leaving both
# splits empty.
latest_day = combined_df["date"].max()
border_day = latest_day - pd.DateOffset(years=6)
combined_df = combined_df[combined_df["date"] >= border_day].copy()

# Rows dated before split_day form the training set, the rest the test set.
# .copy() makes both splits independent frames instead of views on combined_df.
split_day = latest_day - pd.DateOffset(years=1)
train_df = combined_df[combined_df["date"] < split_day].copy()
test_df = combined_df[combined_df["date"] >= split_day].copy()

In [None]:
# Display the combined frame after it has been sorted by date and limited to the last six years.
combined_df

In [None]:
# Build a table of every distinct artist credited on a training song,
# in order of first appearance.
columns = ['artist_id', 'artist_name']

artist_set = set()     # IDs already recorded; also read by the graph-building cell
artist_records = []    # one (artist_id, artist_name) pair per distinct artist

for raw_ids, raw_names in zip(train_df["id_artists"], train_df["name_artists"]):
    # Both columns hold list literals stored as text. literal_eval parses them
    # without executing arbitrary code, unlike eval().
    song_artist_ids = ast.literal_eval(raw_ids)
    song_artist_names = ast.literal_eval(raw_names)
    for cur_art_id, cur_art_name in zip(song_artist_ids, song_artist_names):
        if cur_art_id not in artist_set:
            artist_set.add(cur_art_id)
            artist_records.append((cur_art_id, cur_art_name))

# Create the frame once from the collected records instead of appending row by
# row, which re-allocates the frame on every insert.
df_artists_train = pd.DataFrame(artist_records, columns=columns)
count = len(df_artists_train)

print("amount of artists:", count)
df_artists_train

In [None]:
import networkx as nx

# Build the training collaboration graph: one node per artist, one edge per pair
# of artists that share at least one training song.
all_artist_ids = artist_set.copy()

G = nx.Graph()

# Nodes are added in the row order of df_artists_train. Iterating the ID set
# instead would give an order that can differ between kernel sessions (string
# hashing is randomised), and would need a full-frame lookup per artist.
for id_art_cur, name_art_cur in zip(df_artists_train["artist_id"], df_artists_train["artist_name"]):
    if id_art_cur in all_artist_ids and id_art_cur not in G:
        G.add_node(id_art_cur, name=name_art_cur)

# Report IDs that are in the set but have no row in the artist table.
known_artist_ids = set(df_artists_train["artist_id"])
for id_art_cur in sorted(all_artist_ids - known_artist_ids, key=str):
    print("Artist info not found for ID:", id_art_cur)

# NOTE(review): nodes are keyed by the raw ID while edges use str(ID); the two
# only coincide when the IDs are already strings -- confirm.
for _, row in train_df.iterrows():
    # The artist columns hold list literals stored as text; literal_eval parses
    # them without executing arbitrary code, unlike eval().
    artist_ids = list(ast.literal_eval(row["id_artists"]))
    artist_names = list(ast.literal_eval(row["name_artists"]))
    # Songs with a single artist produce no pairs and therefore no edges.
    if len(artist_ids) > 1:
        for i in range(len(artist_ids)):
            for j in range(i + 1, len(artist_ids)):
                artist_id_1 = str(artist_ids[i])
                lable1 = str(artist_names[i])
                artist_id_2 = str(artist_ids[j])
                lable2 = str(artist_names[j])
                if G.has_edge(artist_id_1, artist_id_2):
                    # Another shared song for this pair: raise the weight by 2.
                    # NOTE(review): earlier comments spoke of a step of 1 while
                    # the code has always used 2 -- confirm the intended step.
                    G[artist_id_1][artist_id_2]['weight'] += 2
                else:
                    # First shared song for this pair: the edge starts at weight 2
                    # and carries the attributes of this song only.
                    G.add_edge(
                        artist_id_1,
                        artist_id_2,
                        lable1=lable1,
                        lable2=lable2,
                        weight=2,
                        song_id=str(row["song_id"]),
                        song_name=str(row["song_name"]),
                        explicit=bool(row["explicit"]),
                        song_type=str(row["song_type"]),
                        track_number=int(row["track_number"]),
                        num_artists=int(row["num_artists"]),
                        num_available_markets=int(row["num_available_markets"]),
                        duration_ms=int(row["duration_ms"]),
                        popularity=int(row["popularity"]),
                        release_date=str(row["release_date"]),
                        key=int(row["key"]),
                        mode=int(row["mode"]),
                        time_signature=int(row["time_signature"]),
                        acousticness=float(row["acousticness"]),
                        danceability=float(row["danceability"]),
                        energy=float(row["energy"]),
                        instrumentalness=float(row["instrumentalness"]),
                        liveness=float(row["liveness"]),
                        loudness=float(row["loudness"]),
                        speechiness=float(row["speechiness"]),
                        valence=float(row["valence"]),
                        tempo=float(row["tempo"]),
                    )

In [None]:
# Flatten the nodes of graph G into a table: one row per node, one column per
# node attribute, and the node key in the "Spotify ID" column.

# Pair each node's attribute dict with its key.
nodes_with_attributes = [(attrs, node_id) for node_id, attrs in G.nodes(data=True)]
df_nodes_train = pd.DataFrame(nodes_with_attributes, columns=['Attributes', "Spotify ID"])

# Expand the attribute dicts into separate columns.
df_attributes = pd.json_normalize(df_nodes_train['Attributes'])

# Attribute columns first, node key column last.
df_nodes_train = pd.concat([df_attributes, df_nodes_train[['Spotify ID']]], axis=1)

df_nodes_train

In [None]:
# Write the training node table as a tab-separated file without the index column.
# The output folder is created first, because to_csv raises OSError when the
# target directory does not exist.
nodes_train_path = Path("network_created/nodes_real_train.csv")
nodes_train_path.parent.mkdir(parents=True, exist_ok=True)
df_nodes_train.to_csv(nodes_train_path, sep='\t', index=False)

In [None]:
# Flatten the edges of graph G into a table: the two endpoints in "Source" and
# "Target", followed by one column per edge attribute.

# Each entry is (endpoint, endpoint, attribute dict).
edges_with_attributes = list(G.edges(data=True))
df_edges_train = pd.DataFrame(edges_with_attributes, columns=['Source', 'Target', 'Attributes'])

# Expand the attribute dicts into separate columns.
df_edge_attributes = pd.json_normalize(df_edges_train['Attributes'])

# Keep the endpoint columns and append the expanded attributes.
df_edges_train = pd.concat([df_edges_train.drop(columns='Attributes'), df_edge_attributes], axis=1)
df_edges_train

In [None]:
# Write the training edge table as a tab-separated file without the index column.
# The output folder is created first, because to_csv raises OSError when the
# target directory does not exist.
edges_train_path = Path('network_created/edges_real_train.csv')
edges_train_path.parent.mkdir(parents=True, exist_ok=True)
df_edges_train.to_csv(edges_train_path, sep='\t', index=False)

In [None]:
# Report how many nodes and edges graph G contains.
num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")


In [None]:
# Build a table of every distinct artist credited on a test song,
# in order of first appearance.
columns = ['artist_id', 'artist_name']

artist_set = set()     # IDs already recorded; also read by the graph-building cell
artist_records = []    # one (artist_id, artist_name) pair per distinct artist

for raw_ids, raw_names in zip(test_df["id_artists"], test_df["name_artists"]):
    # Both columns hold list literals stored as text. literal_eval parses them
    # without executing arbitrary code, unlike eval().
    song_artist_ids = ast.literal_eval(raw_ids)
    song_artist_names = ast.literal_eval(raw_names)
    for cur_art_id, cur_art_name in zip(song_artist_ids, song_artist_names):
        if cur_art_id not in artist_set:
            artist_set.add(cur_art_id)
            artist_records.append((cur_art_id, cur_art_name))

# Create the frame once from the collected records instead of appending row by
# row, which re-allocates the frame on every insert.
df_artists_test = pd.DataFrame(artist_records, columns=columns)
count = len(df_artists_test)

print("amount of artists:", count)
df_artists_test

In [None]:
import networkx as nx

# Build the test collaboration graph: one node per artist, one edge per pair
# of artists that share at least one test song.
all_artist_ids = artist_set.copy()

G2 = nx.Graph()

# Nodes are added in the row order of df_artists_test. Iterating the ID set
# instead would give an order that can differ between kernel sessions (string
# hashing is randomised), and would need a full-frame lookup per artist.
for id_art_cur, name_art_cur in zip(df_artists_test["artist_id"], df_artists_test["artist_name"]):
    if id_art_cur in all_artist_ids and id_art_cur not in G2:
        G2.add_node(id_art_cur, name=name_art_cur)

# Report IDs that are in the set but have no row in the artist table.
known_artist_ids = set(df_artists_test["artist_id"])
for id_art_cur in sorted(all_artist_ids - known_artist_ids, key=str):
    print("Artist info not found for ID:", id_art_cur)

# NOTE(review): nodes are keyed by the raw ID while edges use str(ID); the two
# only coincide when the IDs are already strings -- confirm.
for _, row in test_df.iterrows():
    # The artist columns hold list literals stored as text; literal_eval parses
    # them without executing arbitrary code, unlike eval().
    artist_ids = list(ast.literal_eval(row["id_artists"]))
    artist_names = list(ast.literal_eval(row["name_artists"]))
    # Songs with a single artist produce no pairs and therefore no edges.
    if len(artist_ids) > 1:
        for i in range(len(artist_ids)):
            for j in range(i + 1, len(artist_ids)):
                artist_id_1 = str(artist_ids[i])
                lable1 = str(artist_names[i])
                artist_id_2 = str(artist_ids[j])
                lable2 = str(artist_names[j])
                if G2.has_edge(artist_id_1, artist_id_2):
                    # Another shared song for this pair: raise the weight by 2.
                    # NOTE(review): earlier comments spoke of a step of 1 while
                    # the code has always used 2 -- confirm the intended step.
                    G2[artist_id_1][artist_id_2]['weight'] += 2
                else:
                    # First shared song for this pair: the edge starts at weight 2
                    # and carries the attributes of this song only.
                    G2.add_edge(
                        artist_id_1,
                        artist_id_2,
                        lable1=lable1,
                        lable2=lable2,
                        weight=2,
                        song_id=str(row["song_id"]),
                        song_name=str(row["song_name"]),
                        explicit=bool(row["explicit"]),
                        song_type=str(row["song_type"]),
                        track_number=int(row["track_number"]),
                        num_artists=int(row["num_artists"]),
                        num_available_markets=int(row["num_available_markets"]),
                        duration_ms=int(row["duration_ms"]),
                        popularity=int(row["popularity"]),
                        release_date=str(row["release_date"]),
                        key=int(row["key"]),
                        mode=int(row["mode"]),
                        time_signature=int(row["time_signature"]),
                        acousticness=float(row["acousticness"]),
                        danceability=float(row["danceability"]),
                        energy=float(row["energy"]),
                        instrumentalness=float(row["instrumentalness"]),
                        liveness=float(row["liveness"]),
                        loudness=float(row["loudness"]),
                        speechiness=float(row["speechiness"]),
                        valence=float(row["valence"]),
                        tempo=float(row["tempo"]),
                    )

In [None]:
# Flatten the nodes of graph G2 into a table: one row per node, one column per
# node attribute, and the node key in the "Spotify ID" column.

# Pair each node's attribute dict with its key.
nodes_with_attributes = [(attrs, node_id) for node_id, attrs in G2.nodes(data=True)]
df_nodes_test = pd.DataFrame(nodes_with_attributes, columns=['Attributes', "Spotify ID"])

# Expand the attribute dicts into separate columns.
df_attributes = pd.json_normalize(df_nodes_test['Attributes'])

# Attribute columns first, node key column last.
df_nodes_test = pd.concat([df_attributes, df_nodes_test[['Spotify ID']]], axis=1)

df_nodes_test

In [None]:
# Write the test node table as a tab-separated file without the index column.
# The output folder is created first, because to_csv raises OSError when the
# target directory does not exist.
nodes_test_path = Path("network_created/nodes_real_test.csv")
nodes_test_path.parent.mkdir(parents=True, exist_ok=True)
df_nodes_test.to_csv(nodes_test_path, sep='\t', index=False)

In [None]:
# Flatten the edges of graph G2 into a table: the two endpoints in "Source" and
# "Target", followed by one column per edge attribute.

# Each entry is (endpoint, endpoint, attribute dict).
edges_with_attributes = list(G2.edges(data=True))
df_edges_test = pd.DataFrame(edges_with_attributes, columns=['Source', 'Target', 'Attributes'])

# Expand the attribute dicts into separate columns.
df_edge_attributes = pd.json_normalize(df_edges_test['Attributes'])

# Keep the endpoint columns and append the expanded attributes.
df_edges_test = pd.concat([df_edges_test.drop(columns='Attributes'), df_edge_attributes], axis=1)
df_edges_test

In [None]:
# Write the test edge table as a tab-separated file without the index column.
# The output folder is created first, because to_csv raises OSError when the
# target directory does not exist.
edges_test_path = Path('network_created/edges_real_test.csv')
edges_test_path.parent.mkdir(parents=True, exist_ok=True)
df_edges_test.to_csv(edges_test_path, sep='\t', index=False)

In [None]:
# Report how many nodes and edges graph G2 contains.
num_nodes, num_edges = G2.number_of_nodes(), G2.number_of_edges()
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")


In [None]:
# Display the training split (rows whose date lies before split_day).
train_df

In [None]:
# Display the test split (rows whose date is on or after split_day).
test_df