In [60]:
import pandas as pd
import numpy as np

# Load the song-level dataset and derive a proper datetime column from
# `release_date` so the rows can be ordered chronologically.
data = pd.read_csv("data_superstar_v1_0.csv", delimiter=",")
data["date"] = pd.to_datetime(data["release_date"])
data = data.sort_values(by="date")

# Everything released before the final year of the data forms the training
# split. `.max()` skips missing dates, whereas `iloc[-1]` would return NaT
# (missing values sort last) and silently produce an empty training set.
split_day = data["date"].max() - pd.DateOffset(years=1)
data_train = data[data["date"] < split_day].copy()

In [None]:
import matplotlib.pyplot as plt

# One stacked panel per DataFrame column, each showing how often every
# distinct value of that column occurs.
num_columns = len(data.columns)
fig, axes = plt.subplots(nrows=num_columns, ncols=1, figsize=(8, 4 * num_columns))

# plt.subplots hands back a bare Axes (not an array) when there is one panel.
panels = axes if num_columns > 1 else [axes]

for ax, column in zip(panels, data.columns):
    value_counts = data[column].value_counts().sort_index()
    value_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Distribution of column {column}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Number of Occurrences')
    ax.grid(True)

plt.tight_layout()
plt.show()  # TODO: use this plus a box plot to check which superstar variable is best

In [None]:
import matplotlib.pyplot as plt

# Total number of hits, split by the value of the `superstar_x` flag.
is_superstar = data['superstar_x'] == 1
is_non_superstar = data['superstar_x'] == 0
superstar_hits = data.loc[is_superstar, 'hit'].sum()
non_superstar_hits = data.loc[is_non_superstar, 'hit'].sum()

# Bar labels and their heights, in matching order.
categories = ['Superstar', 'Non-Superstar']
to_plot = [superstar_hits, non_superstar_hits]

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(categories, to_plot, color=['gold', 'lightskyblue'])
ax.set_xlabel('Superstar Status')
ax.set_ylabel('Number of Hits')
ax.set_title('Hits by Superstar Status')
ax.tick_params(axis='x', labelrotation=0)  # keep the two labels horizontal
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Split the songs by superstar status.
superstars = data[data['superstar_x'] == 1]
non_superstars = data[data['superstar_x'] == 0]

# Hits are the sum of the 'hit' column; non-hits are the remaining rows,
# which assumes every row has a value in 'hit'.
superstar_hits = superstars['hit'].sum()
superstar_non_hits = len(superstars) - superstar_hits

non_superstar_hits = non_superstars['hit'].sum()
non_superstar_non_hits = len(non_superstars) - non_superstar_hits

# Bar heights. Deliberately NOT named `data`: reusing that name would replace
# the DataFrame with a plain list and break every cell that indexes
# `data[...]` when it is run (or re-run) afterwards.
song_counts = [superstar_hits, superstar_non_hits, non_superstar_hits, non_superstar_non_hits]
categories = ['Superstar Hits', 'Superstar Non-Hits', 'Non-Superstar Hits', 'Non-Superstar Non-Hits']

# Narrower bars (width=0.4) keep the four categories visually separated.
plt.figure(figsize=(10, 6))
plt.bar(categories, song_counts, color=['gold', 'lightcoral', 'lightskyblue', 'lightblue'], width=0.4)
plt.xlabel('Superstar Status & Hits')
plt.ylabel('Number of Songs')
plt.title('Hits and Non-Hits by Superstar Status')
plt.xticks(rotation=15)  # slight tilt so the long labels do not overlap
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()



In [None]:
# Uses `superstars` / `non_superstars` as defined by the preceding bar-chart cell.

# Count hits and non-hits (assumes every row has a value in 'hit').
superstar_hits = superstars['hit'].sum()
superstar_non_hits = len(superstars) - superstar_hits

non_superstar_hits = non_superstars['hit'].sum()
non_superstar_non_hits = len(non_superstars) - non_superstar_hits

print(superstar_hits)

# Group sizes used as denominators.
total_superstar_songs = len(superstars)
total_non_star_songs = len(non_superstars)

# Within-group percentages: each pair (hit, non-hit) adds up to 100.
superstar_hit_pct = (superstar_hits / total_superstar_songs) * 100
superstar_non_hit_pct = (superstar_non_hits / total_superstar_songs) * 100
non_star_hit_pct = (non_superstar_hits / total_non_star_songs) * 100
non_star_non_hit_pct = (non_superstar_non_hits / total_non_star_songs) * 100

print(superstar_hit_pct)

labels = ['Superstar Hits', 'Superstar Non-Hits', 'Non-Superstar Hits', 'Non-Superstar Non-Hits']
sizes = [superstar_hit_pct, superstar_non_hit_pct, non_star_hit_pct, non_star_non_hit_pct]

# plt.pie rescales the wedges to sum to 100 %, but `sizes` sums to 200 (two
# groups of 100 % each). A plain "%1.1f%%" would therefore print half of
# every within-group rate; convert the wedge share back before formatting.
total_pct = sum(sizes)


def within_group_pct(wedge_share):
    """Format a wedge's share of the whole pie as its within-group percentage."""
    return f"{wedge_share * total_pct / 100:.1f}%"


plt.figure(figsize=(8, 8))
plt.pie(sizes, labels=labels, autopct=within_group_pct, startangle=90)
plt.title('Distribution of Hits and Non-Hits by Superstar Status (Percentages)')
plt.axis('equal')  # equal aspect ratio keeps the pie circular

plt.tight_layout()
plt.show()

In [None]:
# Hit / non-hit counts per group; non-hits are the rows left after summing
# 'hit', which assumes every row has a value in that column.
superstar_hits = superstars['hit'].sum()
superstar_non_hits = len(superstars) - superstar_hits

non_superstar_hits = non_superstars['hit'].sum()
non_superstar_non_hits = len(non_superstars) - non_superstar_hits

# Wedge labels and sizes for the superstar pie.
superstar_labels = ['Superstar Hits', 'Superstar Non-Hits']
superstar_sizes = [superstar_hits, superstar_non_hits]

# Wedge labels and sizes for the non-superstar pie.
non_star_labels = ['Non-Superstar Hits', 'Non-Superstar Non-Hits']
non_star_sizes = [non_superstar_hits, non_superstar_non_hits]

# Shared rotation so both pies start their first wedge at the same angle.
startangle = 140

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# (axes, sizes, labels, title) for the left and right pie, drawn in that order.
pie_specs = [
    (ax1, superstar_sizes, superstar_labels, 'Superstar Hits Distribution (Percentages)'),
    (ax2, non_star_sizes, non_star_labels, 'Non-Superstar Hits Distribution (Percentages)'),
]
for pie_ax, wedge_sizes, wedge_labels, pie_title in pie_specs:
    pie_ax.pie(wedge_sizes, labels=wedge_labels, autopct="%1.1f%%", startangle=startangle)
    pie_ax.set_title(pie_title)
    pie_ax.axis('equal')  # equal aspect ratio keeps the pie circular

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation analysis: the target ("hit") followed
# by the selected features.
#
# Other candidates that were tried and are currently switched off:
#   hits_in_past_{x,y}, success_rate_{x,y}, pagerank_{x,y},
#   superstar_y, superstar_v1..v5_{x,y}, and the graph measures
#   degree, weighted degree, eccentricity, closnesscentrality, clustering,
#   betweenesscentrality, eigencentrality (each as _x and _y).
columns = [
    "hit",
    "explicit",
    "num_available_markets",
    "instrumentalness",
    "loudness",
    "years_on_charts",
    "superstar_x",
]

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlations between the selected `columns` on the training split.
correlation_matrix = data_train[columns].corr()

print("Correlation Matrix:")
print(correlation_matrix)

# Hide only the strict upper triangle (k=1), so the diagonal and the lower
# left half remain visible.
real_mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, mask=real_mask, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Pearson Correlation Matrix (Lower Left Half)')
plt.show()

In [None]:
# Spearman (rank) correlations between the selected `columns` on the training split.
correlation_matrix = data_train[columns].corr(method="spearman")

print("Correlation Matrix:")
print(correlation_matrix)

# Hide only the strict upper triangle (k=1), so the diagonal and the lower
# left half remain visible.
real_mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, mask=real_mask, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Spearman Correlation Matrix (Lower Left Half)')
plt.show()

Why is pagerank_x negatively correlated?
Maybe because less successful artists cooperate with hit artists?

In [None]:
# One-hot encode both cluster assignments; the first level of each is dropped
# so the dummy columns are not perfectly collinear.
df_one_hot = pd.get_dummies(
    data_train,
    columns=["Cluster_x", "Cluster_y"],
    drop_first=True,
)
df_one_hot

In [None]:
# Names of the dummy columns that get_dummies added, i.e. every column of
# df_one_hot that data_train does not have, in df_one_hot's column order.
one_hot_columns = [
    name
    for name in df_one_hot.columns
    if name not in data_train.columns
]

In [None]:
# Restrict the correlation analysis to the target plus the cluster dummies
# (this replaces the earlier feature list rather than extending it).
columns = ["hit", *one_hot_columns]

In [None]:
# Pearson correlations between "hit" and the one-hot cluster columns.
correlation_matrix = df_one_hot[columns].corr()

print("Correlation Matrix:")
print(correlation_matrix)

# Hide only the strict upper triangle (k=1), so the diagonal and the lower
# left half remain visible.
real_mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)

# Larger canvas than the feature heatmaps: one row/column per dummy column.
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(correlation_matrix, mask=real_mask, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Pearson Correlation Matrix (Lower Left Half)')
plt.show()

In [None]:
# Spearman (rank) correlations between "hit" and the one-hot cluster columns.
correlation_matrix = df_one_hot[columns].corr(method="spearman")

# Print the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)

# Mask the strict upper triangle (k=1) so the diagonal and the lower left
# half remain visible.
real_mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)

# This matrix has the same dimensions as the Pearson one-hot heatmap, so use
# the same (20, 16) canvas; at (10, 8) the annotated cells are too cramped.
plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, mask=real_mask, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Spearman Correlation Matrix (Lower Left Half)')
plt.show()

TODO: plot the number of hit songs and the success rate of artists by cluster and PageRank, as well as by the superstar variable.

## Explanation using the example of Taylor Swift

In [None]:
# Artist id used to look up the songs for the Taylor Swift example.
swift_id = '06HL4z0CvFAxyc27GXpf02'

In [None]:
# Training songs on which swift_id appears as either the first or the second artist.
swift_mask = data_train[["artist1_id", "artist2_id"]].eq(swift_id).any(axis=1)
swift_songs = data_train[swift_mask]
swift_songs

Taylor Swift is part of Cluster 1.

In [None]:
# How many of the selected songs fall under each `num_artists` value.
swift_songs["num_artists"].value_counts()

In [None]:
# How many of the selected songs fall under each `hit` value.
swift_songs["hit"].value_counts()

Alan Walker

In [None]:
# Artist id for the Alan Walker example.
walker_id = '7vk5e3vY1uw9plTHJAMwjN'

# Training songs on which walker_id appears as either the first or the second artist.
walker_mask = data_train[["artist1_id", "artist2_id"]].eq(walker_id).any(axis=1)
walker_songs = data_train[walker_mask]
walker_songs

In [None]:
# How many of walker_songs fall under each `num_artists` value.
walker_songs["num_artists"].value_counts()

In [None]:
# How many of walker_songs fall under each `hit` value.
walker_songs["hit"].value_counts()

Get one of the most successful artists:


In [None]:
# Show the training rows ordered by `hits_in_past_x`, largest first
# (the result is only displayed, data_train itself is not modified).
data_train.sort_values(by="hits_in_past_x", ascending=False)

Since Glee Cast is not really a single artist, we chose to use Drake instead:

In [None]:
# Artist id for the Drake example.
drake_id = '3TVXtAsR1Inumwj472S9r4'

# Training songs on which drake_id appears as either the first or the second artist.
drake_mask = data_train[["artist1_id", "artist2_id"]].eq(drake_id).any(axis=1)
drake_songs = data_train[drake_mask]
drake_songs

In [None]:
# How many of drake_songs fall under each `num_artists` value.
drake_songs["num_artists"].value_counts()

Almost half of the songs were collaborations.

In [None]:
# How many of drake_songs fall under each `hit` value.
drake_songs["hit"].value_counts()

In [None]:
# Keep only the rows of drake_songs whose `hit` value equals 1.0.
is_hit = drake_songs["hit"].eq(1.0)
drake_hits = drake_songs[is_hit]
drake_hits

In [None]:
# How many of drake_hits fall under each `num_artists` value.
drake_hits["num_artists"].value_counts()

Of these x hit songs, y are collaborations (TODO: fill in x and y).

In [None]:
# Keep only the rows of drake_songs with at least two artists (num_artists >= 2.0).
has_multiple_artists = drake_songs["num_artists"].ge(2.0)
drake_collab = drake_songs[has_multiple_artists]
drake_collab

In [None]:
# How many of drake_collab fall under each `hit` value.
drake_collab["hit"].value_counts()

…% of the hit songs are collaborations (TODO: fill in the percentage).

TODO: pick a better example artist by finding those with the highest PageRank.