Note: Each section can be run independently of others.

# Imports, Installs, Downloads

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import clear_output

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/aditiroy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aditiroy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aditiroy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Emotion Classification

In [17]:
def load_nrc_lexicon():
    emotion_lexicon = defaultdict(list)
    lexicon_file = "NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

    with open(lexicon_file, 'r') as file:
        for line in file:
            word, emotion, association = line.strip().split('\t')
            if int(association) == 1:
                emotion_lexicon[word].append(emotion)
    return emotion_lexicon

def preprocess_lyrics(lyrics):
    punctuations = '\'"\\,<>./?@#$%^&*_~/!()-[]{};:'
    # Remove punctuation and any content within brackets (e.g., [chorus])
    lyrics = ''.join([char for char in lyrics if char not in punctuations])
    lyrics = lyrics.split('[')[0]  # Remove anything between [ and ]

    tokens = nltk.word_tokenize(lyrics.lower())
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

def normalize_distribution(distribution):
    total = sum(distribution.values())
    if total > 0:
        return {emotion: value / total for emotion, value in distribution.items()}
    else:
        return {emotion: 0 for emotion in distribution.keys()}  # Return zero for all if total is zero

def assign_emotion_to_song(lyrics, emotion_lexicon, tfidf_vector, tfidf_words):
    emotion_count = defaultdict(int)
    sentiment_count = defaultdict(int)
    tokens = preprocess_lyrics(lyrics)

    # For each token, check if it's in the emotion lexicon
    for word in tokens:
        if word in emotion_lexicon:
            # Use np.where to get the index of the word in tfidf_words
            word_index_array = np.where(tfidf_words == word)[0]
            if len(word_index_array) > 0:
                word_index = word_index_array[0]
                word_tfidf_score = tfidf_vector[word_index]
                for emotion in emotion_lexicon[word]:
                    if emotion in ['positive', 'negative']:
                        sentiment_count[emotion] += word_tfidf_score
                    else:
                        emotion_count[emotion] += word_tfidf_score

    # Normalize distributions
    normalized_emotion_count = normalize_distribution(emotion_count)
    normalized_sentiment_count = normalize_distribution(sentiment_count)

    # Determine dominant emotion and dominant sentiment
    dominant_emotion = max(normalized_emotion_count, key=normalized_emotion_count.get) if normalized_emotion_count else None
    dominant_sentiment = max(normalized_sentiment_count, key=normalized_sentiment_count.get) if normalized_sentiment_count else None

    return dominant_emotion, dominant_sentiment, normalized_emotion_count, normalized_sentiment_count

def assign_emotions_to_dataset(df, emotion_lexicon, tfidf_scores, tfidf_words):
    emotion_results = []

    # Loop over the songs in the dataset
    for index, row in df.iterrows():
        song_id = row['id']
        artist = row['artist']
        title = row['title']
        lyrics = row['lyrics']

        # Get the corresponding TF-IDF vector for the song
        if index < len(tfidf_scores):
            tfidf_vector = tfidf_scores[index]
        else:
            continue  # Skip if the index is out of bounds

        dominant_emotion, dominant_sentiment, emotion_count, sentiment_count = assign_emotion_to_song(lyrics, emotion_lexicon, tfidf_vector, tfidf_words)

        emotion_results.append({
            'song_id': song_id,
            'artist': artist,
            'title': title,
            'dominant_emotion': dominant_emotion,
            'dominant_sentiment': dominant_sentiment,
            'emotion_distribution': emotion_count,
            'sentiment_distribution': sentiment_count
        })

    return pd.DataFrame(emotion_results)

In [None]:
# df = pd.read_csv('song_lyrics.csv')
# df = df[df['language'] == 'en']
# df = df.dropna()
# sample_df = df.sample(n = 1000)
# sample_df.to_csv('sample_df.csv', index=False)


# Load the dataset
df = pd.read_csv('song_lyrics.csv')
df_songs = df[df['language'] == 'en']
df_songs = df_songs[(df_songs['year'] >= 1950) & (df_songs['year'] <= 2025) & (df_songs['views'] >= 100)]
df_songs = df_songs.dropna()

# Load NRC emotion lexicon
emotion_lexicon = load_nrc_lexicon()

# Preprocess the lyrics and create a list of all lyrics
lyrics_list = df_songs['lyrics'].apply(lambda x: ' '.join(preprocess_lyrics(x)))

# Compute TF-IDF for the lyrics
vectorizer = TfidfVectorizer(max_features=10000)  # Limit to top 10,000 words
tfidf_scores = vectorizer.fit_transform(lyrics_list).toarray()
tfidf_words = vectorizer.get_feature_names_out()  # List of words

# Assign emotions to all songs
emotion_results_df = assign_emotions_to_dataset(df_songs, emotion_lexicon, tfidf_scores, tfidf_words)
emotion_results_df = emotion_results_df.dropna()

# Save the results to a CSV file
emotion_results_df.to_csv('emotion_assigned_songs.csv', index=False)

# Emotion Classification Evaluation

In [6]:
# get all survey data and format
survey_data = {
    'song_id' : [6636486] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.15, 0.10, 0.20, 0.05, 0.10, 0.05, 0.20, 0.15, 0.05, 0.10],
    'Joy': [0.10, 0.05, 0.05, 0.05, 0.15, 0.10, 0.05, 0.05, 0.10, 0.10],
    'Surprise': [0.05, 0.05, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05, 0.10, 0.10],
    'Anger': [0.30, 0.40, 0.25, 0.35, 0.25, 0.30, 0.25, 0.35, 0.30, 0.30],
    'Disgust': [0.10, 0.05, 0.10, 0.15, 0.05, 0.10, 0.10, 0.05, 0.05, 0.05],
    'Fear': [0.15, 0.10, 0.15, 0.10, 0.20, 0.15, 0.10, 0.10, 0.10, 0.10],
    'Sadness': [0.10, 0.20, 0.10, 0.10, 0.10, 0.15, 0.15, 0.15, 0.20, 0.15],
    'Trust': [0.05, 0.05, 0.10, 0.10, 0.10, 0.10, 0.10, 0.10, 0.10, 0.10]
}
survey_1 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [5955393] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.10, 0.10, 0.15, 0.05, 0.10, 0.05, 0.20, 0.05, 0.10, 0.15],
    'Joy': [0.05, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05, 0.10, 0.05, 0.05],
    'Surprise': [0.05, 0.05, 0.10, 0.05, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05],
    'Anger': [0.25, 0.25, 0.30, 0.35, 0.30, 0.35, 0.25, 0.30, 0.30, 0.30],
    'Disgust': [0.10, 0.05, 0.10, 0.15, 0.05, 0.10, 0.10, 0.05, 0.05, 0.05],
    'Fear': [0.20, 0.15, 0.15, 0.20, 0.25, 0.20, 0.20, 0.15, 0.20, 0.25],
    'Sadness': [0.20, 0.30, 0.25, 0.15, 0.20, 0.15, 0.25, 0.25, 0.25, 0.20],
    'Trust': [0.05, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
}
survey_2 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [4191823] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.25, 0.30, 0.35, 0.30, 0.25, 0.20, 0.30, 0.35, 0.30, 0.25],
    'Joy': [0.15, 0.10, 0.15, 0.10, 0.10, 0.15, 0.10, 0.10, 0.15, 0.10],
    'Surprise': [0.20, 0.15, 0.20, 0.20, 0.15, 0.20, 0.25, 0.20, 0.15, 0.20],
    'Anger': [0.30, 0.35, 0.40, 0.35, 0.40, 0.35, 0.30, 0.40, 0.35, 0.30],
    'Disgust': [0.05, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],
    'Fear': [0.15, 0.20, 0.15, 0.20, 0.15, 0.15, 0.20, 0.15, 0.20, 0.15],
    'Sadness': [0.05, 0.10, 0.05, 0.05, 0.10, 0.05, 0.05, 0.10, 0.05, 0.05],
    'Trust': [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
}
survey_3 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [1062758] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.45, 0.40, 0.50, 0.55, 0.50, 0.45, 0.40, 0.50, 0.45, 0.50],
    'Joy': [0.35, 0.40, 0.35, 0.30, 0.35, 0.40, 0.40, 0.35, 0.35, 0.30],
    'Surprise': [0.05, 0.10, 0.05, 0.05, 0.05, 0.10, 0.05, 0.05, 0.10, 0.05],
    'Anger': [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],
    'Disgust': [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],
    'Fear': [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],
    'Sadness': [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],
    'Trust': [0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25]
}
survey_4 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [7402191] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.55, 0.60, 0.50, 0.45, 0.50, 0.55, 0.60, 0.50, 0.45, 0.50],
    'Joy': [0.25, 0.20, 0.25, 0.30, 0.25, 0.20, 0.20, 0.25, 0.30, 0.25],
    'Surprise': [0.15, 0.10, 0.15, 0.15, 0.10, 0.15, 0.10, 0.15, 0.15, 0.10],
    'Anger': [0.60, 0.55, 0.60, 0.65, 0.60, 0.55, 0.55, 0.60, 0.65, 0.60],
    'Disgust': [0.50, 0.45, 0.50, 0.55, 0.50, 0.45, 0.45, 0.50, 0.55, 0.50],
    'Fear': [0.35, 0.30, 0.35, 0.40, 0.35, 0.30, 0.30, 0.35, 0.40, 0.35],
    'Sadness': [0.30, 0.25, 0.30, 0.35, 0.30, 0.25, 0.25, 0.30, 0.35, 0.30],
    'Trust': [0.20, 0.15, 0.20, 0.25, 0.20, 0.15, 0.15, 0.20, 0.25, 0.20]
}
survey_5 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [6047243] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.25, 0.20, 0.15, 0.25, 0.20, 0.30, 0.20, 0.25, 0.30, 0.20],
    'Joy': [0.10, 0.05, 0.15, 0.10, 0.05, 0.10, 0.05, 0.10, 0.05, 0.05],
    'Surprise': [0.05, 0.10, 0.05, 0.05, 0.10, 0.05, 0.05, 0.10, 0.05, 0.05],
    'Anger': [0.50, 0.55, 0.50, 0.55, 0.60, 0.55, 0.50, 0.55, 0.60, 0.55],
    'Disgust': [0.35, 0.40, 0.35, 0.40, 0.45, 0.40, 0.35, 0.40, 0.45, 0.40],
    'Fear': [0.60, 0.55, 0.60, 0.65, 0.60, 0.65, 0.60, 0.65, 0.60, 0.65],
    'Sadness': [0.70, 0.75, 0.70, 0.75, 0.80, 0.75, 0.70, 0.75, 0.80, 0.75],
    'Trust': [0.15, 0.10, 0.15, 0.10, 0.05, 0.10, 0.15, 0.10, 0.05, 0.10]
}
survey_6 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [4313071] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.25, 0.20, 0.30, 0.20, 0.25, 0.30, 0.25, 0.20, 0.15, 0.25],
    'Joy': [0.20, 0.30, 0.25, 0.30, 0.25, 0.20, 0.15, 0.20, 0.30, 0.25],
    'Surprise': [0.10, 0.10, 0.05, 0.10, 0.10, 0.10, 0.05, 0.10, 0.10, 0.05],
    'Anger': [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.10, 0.05, 0.05, 0.05],
    'Disgust': [0.05, 0.05, 0.05, 0.05, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05],
    'Fear': [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.10, 0.05, 0.05, 0.05],
    'Sadness': [0.20, 0.20, 0.20, 0.15, 0.20, 0.15, 0.15, 0.20, 0.15, 0.15],
    'Trust': [0.10, 0.05, 0.05, 0.05, 0.10, 0.05, 0.10, 0.05, 0.10, 0.10]
}
survey_7 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [6850734] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.20, 0.25, 0.20, 0.30, 0.25, 0.20, 0.15, 0.25, 0.20, 0.20],
    'Joy': [0.15, 0.10, 0.15, 0.20, 0.10, 0.15, 0.20, 0.15, 0.10, 0.15],
    'Surprise': [0.10, 0.15, 0.10, 0.05, 0.15, 0.10, 0.10, 0.10, 0.15, 0.10],
    'Anger': [0.05, 0.05, 0.05, 0.10, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05],
    'Disgust': [0.05, 0.10, 0.05, 0.05, 0.05, 0.05, 0.10, 0.05, 0.10, 0.05],
    'Fear': [0.05, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.10],
    'Sadness': [0.20, 0.15, 0.20, 0.15, 0.20, 0.15, 0.20, 0.20, 0.20, 0.15],
    'Trust': [0.20, 0.15, 0.15, 0.10, 0.15, 0.15, 0.15, 0.15, 0.15, 0.20]
}
survey_8 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [6736901] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],
    'Joy': [0.01, 0.00, 0.00, 0.01, 0.00, 0.01, 0.00, 0.00, 0.01, 0.00],
    'Surprise': [0.05, 0.10, 0.05, 0.00, 0.00, 0.05, 0.05, 0.10, 0.05, 0.05],
    'Anger': [0.10, 0.15, 0.20, 0.15, 0.10, 0.10, 0.15, 0.15, 0.10, 0.15],
    'Disgust': [0.20, 0.25, 0.20, 0.30, 0.20, 0.25, 0.20, 0.20, 0.25, 0.20],
    'Fear': [0.25, 0.20, 0.25, 0.25, 0.30, 0.30, 0.25, 0.20, 0.25, 0.25],
    'Sadness': [0.30, 0.25, 0.25, 0.25, 0.30, 0.25, 0.30, 0.25, 0.25, 0.25],
    'Trust': [0.04, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.05, 0.00, 0.00]
}
survey_9 = pd.DataFrame(survey_data)

survey_data = {
    'song_id': [6184434] * 10,
    'Participant': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Anticipation': [0.10, 0.15, 0.10, 0.10, 0.15, 0.10, 0.15, 0.10, 0.15, 0.10],
    'Joy': [0.05, 0.00, 0.05, 0.00, 0.00, 0.05, 0.00, 0.05, 0.00, 0.05],
    'Surprise': [0.10, 0.05, 0.05, 0.10, 0.10, 0.05, 0.05, 0.10, 0.05, 0.05],
    'Anger': [0.20, 0.25, 0.30, 0.20, 0.25, 0.20, 0.25, 0.25, 0.20, 0.20],
    'Disgust': [0.05, 0.05, 0.05, 0.05, 0.05, 0.10, 0.05, 0.05, 0.05, 0.05],
    'Fear': [0.25, 0.20, 0.25, 0.30, 0.25, 0.25, 0.20, 0.20, 0.25, 0.25],
    'Sadness': [0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25],
    'Trust': [0.00, 0.05, 0.00, 0.10, 0.10, 0.05, 0.05, 0.05, 0.10, 0.05]
}
survey_10 = pd.DataFrame(survey_data)

survey_df = pd.concat([survey_1, survey_2, survey_3, survey_4, survey_5, survey_6, survey_7, survey_8, survey_9, survey_10], ignore_index=True)

aggregated_df = survey_df.drop('Participant', axis=1).groupby('song_id').mean().reset_index()

# Normalize the distributions so that they sum to 1
emotions = ['Anticipation', 'Joy', 'Surprise', 'Anger', 'Disgust', 'Fear', 'Sadness', 'Trust']
aggregated_df[emotions] = aggregated_df[emotions].div(aggregated_df[emotions].sum(axis=1), axis=0)

survey_df = aggregated_df.sort_values(by='song_id').reset_index()

In [7]:
# get all emotion data and format
list_of_song_ids = survey_df['song_id'].unique().tolist()

emotion_df = pd.read_csv('old_emotion_assigned_songs.csv')
emotion_df = emotion_df[emotion_df['song_id'].isin(list_of_song_ids)].reset_index()

def convert_defaultdict_string(s):
    dict_string = s.replace("defaultdict(<class 'int'>, ", "").strip("()")
    return ast.literal_eval(dict_string)

emotion_df['emotion_distribution'] = emotion_df['emotion_distribution'].apply(convert_defaultdict_string)

emotions = ['anticipation', 'joy', 'surprise', 'anger', 'disgust', 'fear', 'sadness', 'trust']

result_df = pd.DataFrame()
result_df['song_id'] = emotion_df['song_id']
for emotion in emotions:
    result_df[emotion.capitalize()] = emotion_df['emotion_distribution'].apply(lambda x: x.get(emotion, 0))

emotion_df = result_df.sort_values(by='song_id').reset_index()

In [8]:
#calculate KL Divergence
def kl_divergence(p, q):
    p = np.array(p)
    q = np.array(q)
    p = p + 1e-10
    q = q + 1e-10
    return np.sum(p * np.log(p / q))

results = []
for _, row in survey_df.iterrows():
    song_id = row['song_id']
    true_dist = row[1:].values
    predicted_dist = emotion_df[emotion_df['song_id'] == song_id].iloc[0, 1:].values
    kl_div = kl_divergence(true_dist, predicted_dist)
    results.append({'song_id': song_id, 'KL_Divergence': kl_div})

kl_results_df = pd.DataFrame(results)

kl_results_df

Unnamed: 0,song_id,KL_Divergence
0,1062758.0,0.946462
1,4191823.0,0.155166
2,4313071.0,0.422628
3,5955393.0,0.292585
4,6047243.0,0.492483
5,6184434.0,0.228473
6,6636486.0,0.188995
7,6736901.0,0.379533
8,6850734.0,0.936562
9,7402191.0,0.327137


In [9]:
#calculate Cosine Similarity
emotion_features = emotion_df.drop(columns='song_id')
survey_features = survey_df.drop(columns='song_id')

similarity_matrix = cosine_similarity(emotion_features, survey_features)

similarity_df = pd.DataFrame(similarity_matrix, index=emotion_df['song_id'], columns=survey_df['song_id'])

self_similarity = similarity_df.values.diagonal()

self_similarity_df = pd.DataFrame({
    'song_id': emotion_df['song_id'],
    'self_similarity': self_similarity
})

self_similarity_df

Unnamed: 0,song_id,self_similarity
0,1062758,0.126628
1,4191823,0.969375
2,4313071,0.986858
3,5955393,0.963733
4,6047243,0.996586
5,6184434,0.998435
6,6636486,0.054223
7,6736901,0.999222
8,6850734,0.998101
9,7402191,0.997302


# Recommendation System

In [3]:
def load_nrc_lexicon():
    emotion_lexicon = defaultdict(list)
    lexicon_file = "NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
    

    with open(lexicon_file, 'r') as file:
        for line in file:
            word, emotion, association = line.strip().split('\t')
            if int(association) == 1:
                emotion_lexicon[word].append(emotion)
    return emotion_lexicon

def preprocess_text(text):
    punctuations = '\'"\\,<>./?@#$%^&*_~/!()-[]{};:'
    text = ''.join([char for char in text if char not in punctuations])
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

def normalize_distribution(distribution):
    total = sum(distribution.values())
    return {emotion: value / total for emotion, value in distribution.items()} if total > 0 else distribution

def process_user_emotion_input(user_input, emotion_lexicon):
    tokens = preprocess_text(user_input)
    emotion_count = defaultdict(int)

    for word in tokens:
        if word in emotion_lexicon:
            for emotion in emotion_lexicon[word]:
                emotion_count[emotion] += 1

    return normalize_distribution(emotion_count)

def get_emotion_vector(emotion_dict, emotions_list):
    return np.array([emotion_dict.get(emotion, 0) for emotion in emotions_list])

def recommend_songs(user_vector, emotion_df, top_n=10):
    # Recommend songs based on cosine similarity between the user's emotion vector and song emotion vectors
    emotion_matrix = np.array(emotion_df['emotion_vector'].tolist())
    similarities = cosine_similarity(user_vector.reshape(1, -1), emotion_matrix).flatten()
    emotion_df['cosine_similarity'] = similarities
    return emotion_df.nlargest(top_n, 'cosine_similarity')[['title', 'artist', 'cosine_similarity']]

# Load the NRC lexicon
emotion_lexicon = load_nrc_lexicon()

# Load the dataset of songs and convert emotion distributions to dictionaries
emotions_df = pd.read_csv('emotion_assigned_songs.csv')
emotions_df['emotion_distribution'] = emotions_df['emotion_distribution'].apply(ast.literal_eval)

# Define the list of emotions used in the recommendation system
emotions = ['anticipation', 'joy', 'surprise', 'anger', 'disgust', 'fear', 'sadness', 'trust']

# Convert song emotion distributions to vectors
emotions_df['emotion_vector'] = emotions_df['emotion_distribution'].apply(lambda x: get_emotion_vector(x, emotions))

In [7]:
# Ask the user for input and process it into an emotion vector
user_input = input("How are you feeling today? ")
user_emotion_distribution = process_user_emotion_input(user_input, emotion_lexicon)
user_vector = get_emotion_vector(user_emotion_distribution, emotions)

# Get the top 10 recommended songs based on similarity
recommended_songs = recommend_songs(user_vector, emotions_df).reset_index()

# Print the recommendations
print("\n" + "-" * 50 + "\n")
print("Top 10 Songs Based on Similarity:")
for idx, row in recommended_songs.iterrows():
    print(f"{idx + 1}. {row['title']} by {row['artist']} - Similarity: {row['cosine_similarity']:.4f}")



--------------------------------------------------

Top 10 Songs Based on Similarity:
1. Sweet Scarlet by Cat Stevens - Similarity: 1.0000
2. Savannah by LP - Similarity: 1.0000
3. Nails Done by Amanda Lepore - Similarity: 1.0000
4. Hometown Farewell Kiss by The Triffids - Similarity: 1.0000
5. Shes Your Baby by Ween - Similarity: 1.0000
6. Good Luck by Kristy Lee Cook - Similarity: 0.9999
7. Rainy Day Women 12  35 by Bob Dylan - Similarity: 0.9999
8. Dream a Little Dream of Me by Louis Armstrong - Similarity: 0.9998
9. When You Came by Rockapella - Similarity: 0.9996
10. Black Coffee by Heavy D - Similarity: 0.9996
