# Data Loading

In [None]:
import pandas as pd

# Read the dataset from disk; the first column of the CSV becomes the row index.
new_df = pd.read_csv(
    filepath_or_buffer='database/output.csv',
    index_col=0,
)

# Print a summary of the loaded frame via DataFrame.info().
new_df.info()

# Feature Extraction

Feature extraction in this project differs from a typical machine learning problem: there is no "target value" to tune against during a training phase.

Rather than measuring how effective each feature is with respect to a result (there is none in this case), we will take the correlation between the values within each feature space and rank the features in order, starting from the one with the least variance.

With this in mind, the initial **assumption** is that a user shows similarities in the songs they pick for their playlist. The features with the least variance will be the main parameters for picking song recommendations inside their **genre bubble**.

In [None]:
# Feature extraction using Correlation Matrix (Pearson)

import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlation is only defined for numeric data. `new_df` also holds
# text (the genre-count cell calls string methods on column 1), and calling
# DataFrame.corr() on such a frame raises in pandas >= 2.0, where
# `numeric_only` defaults to False. Restricting the frame to numeric and
# boolean columns up front works on every pandas version and matches what
# older versions silently did.
numeric_df = new_df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr(method='pearson')

# Draw the pairwise correlations as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

# Set plot title
plt.title('Correlation Matrix')

# Show the plot
plt.show()

In [None]:
# Song Genre Count
import re

# Tally how many rows of `new_df` list each genre.
#
# The genres are read from the column at position 1. Each cell is presumably
# the text form of a Python list, e.g. "['pop', 'rock']" (TODO confirm against
# the CSV); the quotes and brackets are stripped and the rest is split on ", ".
#
# The column is selected once with `.iloc[:, 1]` instead of `new_df.iloc[i][1]`
# per row: an integer key on a label-indexed row Series is treated positionally
# only through a fallback that pandas has deprecated, and building a row Series
# for every record is needlessly slow.
genreDict = {}

for _raw_genres in new_df.iloc[:, 1]:
    _genres = _raw_genres.replace("'", "").replace('[', '').replace(']', '')
    _genres = _genres.split(", ")

    for _genre in _genres:
        # An empty list ("[]") collapses to a single empty string; skip it so
        # "no genre" is not counted as a genre named ''.
        if not _genre:
            continue
        genreDict[_genre] = genreDict.get(_genre, 0) + 1

print(genreDict)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the genre tallies held in `genreDict` (genre -> count).

# Genres counted more than GENRE_MIN_COUNT times get their own bar; all the
# remaining genres are summed into a single "Other" bar. Keeping the cut-off
# in one place stops the two expressions below from drifting apart.
GENRE_MIN_COUNT = 30

filtered_data = {key: value for key, value in genreDict.items() if value > GENRE_MIN_COUNT}
other_value = sum(value for value in genreDict.values() if value <= GENRE_MIN_COUNT)

# Add the "Other" category and its summed up value to the filtered data
filtered_data['Other'] = other_value

# Extract the filtered keys and values
categories = list(filtered_data.keys())
values = list(filtered_data.values())

# Create a bar chart
plt.bar(categories, values)

# Rotate the x-axis labels by 85 degrees so long genre names do not overlap
plt.xticks(rotation=85)

# Add count labels just above each bar
for i, v in enumerate(values):
    plt.text(i, v + 2, str(v), ha='center')

# Set labels and title
plt.xlabel('Genre')
plt.ylabel('Count')
plt.title('Song Genre Count')
plt.show()
