# Description requests

## Setup

In [2]:
# Import modules
import os

import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


In [None]:
# API connection
# Credentials are read from the environment so that no secret is stored in the
# notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names
# spotipy itself looks up; a KeyError here means the variable is not set.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and should be rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']

client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


## Get playlist descriptions

In [None]:
# Read the cleaned playlist metadata from Excel
df = pd.read_excel(
    '/Users/julienmbarki/Documents/Doctorat/Publications/Article 2/Data/Code/data_requests/playlists_info_clean.xlsx'
)

# Drop every playlist whose name contains "This Is" or "Written By".
# Rows with a missing name never match (na=False) and are therefore kept.
df = df[
    ~(
        df['playlist_name'].str.contains('This Is', na=False)
        | df['playlist_name'].str.contains('Written By', na=False)
    )
]


In [None]:
# API scraping
# For every playlist id in `df`, fetch the playlist's name and description
# from the Spotify API, then save the collected records to Excel.
playlists_ids = df['playlist_id'].tolist()

playlists = []

for playlist_id in playlists_ids:
    print(f"Processing playlist: {playlist_id}")

    # Request only the two fields used below. Without `fields` the API returns
    # the full playlist object (track listing included), which is discarded here.
    playlists_results = sp.playlist(playlist_id, fields="name,description")

    playlist_name = playlists_results['name']
    playlist_description = playlists_results['description']

    playlists.append({
        "playlist_id": playlist_id,
        "playlist_name": playlist_name,
        "playlist_description": playlist_description
    })

# Build one DataFrame from the collected records (a single construction, not
# row-by-row growth) and persist it for the later sections.
df_playlists = pd.DataFrame(playlists, columns=["playlist_id", "playlist_name", "playlist_description"])
df_playlists.to_excel("playlists_descriptions.xlsx", index=False)

print(df_playlists)


## Process playlist descriptions with text analysis

In [None]:
# Read the playlist descriptions workbook into `df`
df = pd.read_excel(io='playlists_descriptions.xlsx')


In [None]:
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer tables from 'punkt_tab';
# without it word_tokenize raises LookupError even when 'punkt' is installed.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Step 1: Preprocess text
# Stopword set = NLTK English list + NLTK French list + words specific to
# this corpus that should be excluded ('cover', 'photo').
custom_stopwords = (
    set(stopwords.words('english'))
    | set(stopwords.words('french'))
    | {'cover', 'photo'}
)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn a raw description into a list of cleaned, lemmatized tokens.

    The text is lowercased and tokenized; only purely alphabetic tokens are
    kept and lemmatized, and any resulting lemma present in
    ``custom_stopwords`` is dropped.

    Args:
        text: the description to process.

    Returns:
        A list of lemmas, or an empty list when ``text`` is not a string
        (e.g. NaN for a missing description).
    """
    if not isinstance(text, str):
        return []

    lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha()
    )
    # Stopwords are filtered after lemmatization, so the comparison is made on
    # the lemma rather than on the surface form.
    return [lemma for lemma in lemmas if lemma not in custom_stopwords]

# Tokenize / lemmatize every description (non-string cells become empty lists)
df['processed_description'] = df['playlist_description'].apply(preprocess_text)

# Step 2: Create a dictionary and corpus
# The dictionary maps each token to an integer id; the corpus is the
# bag-of-words encoding of every description under that dictionary.
dictionary = corpora.Dictionary(df['processed_description'])
corpus = list(map(dictionary.doc2bow, df['processed_description']))

# Step 3: Apply LDA model
num_topics = 4  # Number of topics to extract
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    random_state=0,  # fixed seed so the fitted topics are reproducible
)

# Display topics and top words in each topic
topics = lda_model.print_topics(num_words=5)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

# Step 4: Assign the dominant topic to each description
def get_dominant_topic(text):
    """Return the id of the most probable LDA topic for a tokenized description.

    Args:
        text: list of preprocessed tokens for one description.

    Returns:
        The id of the topic with the highest probability, or ``None`` when no
        topic can be meaningfully assigned (no tokens, no token known to the
        dictionary, or no topic reported by the model).
    """
    # An empty description carries no evidence: the model could only fall back
    # on its prior and max() would then label the playlist with an arbitrary
    # topic. Report "no topic" instead.
    if not text:
        return None

    bow = dictionary.doc2bow(text)
    if not bow:  # none of the tokens is in the dictionary
        return None

    topic_probs = lda_model.get_document_topics(bow)
    if not topic_probs:  # max() on an empty sequence would raise ValueError
        return None

    # topic_probs is a list of (topic_id, probability) pairs
    dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
    return dominant_topic

# Label every description with its dominant topic
df['dominant_topic'] = df['processed_description'].apply(get_dominant_topic)

# Show each description next to the topic it was assigned
print(df.loc[:, ['playlist_description', 'dominant_topic']])


## Merge with panel data

In [3]:
# Read the two workbooks that the merge cell below combines
df = pd.read_excel(io='playlists_descriptions.xlsx')
df_2 = pd.read_excel(io='df_final.xlsx')


In [4]:
# Attach the 'playlist_curator' column from df to df_2, matching rows on
# 'playlist_name' (left merge: every row of df_2 is kept).
# The lookup is reduced to one row per playlist name first: with a left merge,
# a name that appears several times in df would duplicate every matching row
# of df_2 and silently inflate the dataset.
# NOTE(review): if the same name carries different curator values in df, the
# first occurrence wins - confirm this is acceptable.
playlist_curators = (
    df[['playlist_name', 'playlist_curator']]
    .drop_duplicates(subset='playlist_name')
)
df_2 = df_2.merge(playlist_curators, on='playlist_name', how='left')

# Display the merged DataFrame
display(df_2)

# Export the merged DataFrame to an Excel file
df_2.to_excel('df_final_bis.xlsx', index=False)


Unnamed: 0,playlist_name,playlist_followers,playlist_owner,track_name,track__popularity,track_release_date,artist_name,artist_popularity,danceability,energy,...,hh_index,hh_index_2,distances,distances_2,distances_3,stirling_index,stirling_index_2,type,curator,playlist_curator
0,&ME's track IDs,61240,Spotify,Je Préfère Ça,0,2024-02-23,Marine Neuilly,16,0.743,0.85400,...,5102.040816,1767.346939,0.294148,0.892910,0.899610,[[0.14407242]],[[0.3124742]],genre,spotify,external
1,&ME's track IDs,61240,Spotify,A.M.,13,2011-03-06,Flowers and Sea Creatures,19,0.644,0.53700,...,5102.040816,1767.346939,0.294148,0.892910,0.899610,[[0.14407242]],[[0.3124742]],genre,spotify,external
2,&ME's track IDs,61240,Spotify,After Saturday Night - Manoo Alternative Remix,30,2023-10-27,Sparrow & Barbossa,52,0.683,0.86000,...,5102.040816,1767.346939,0.294148,0.892910,0.899610,[[0.14407242]],[[0.3124742]],genre,spotify,external
3,&ME's track IDs,61240,Spotify,Alf Youm,28,2024-05-03,ReiRei,17,0.691,0.64000,...,5102.040816,1767.346939,0.294148,0.892910,0.899610,[[0.14407242]],[[0.3124742]],genre,spotify,external
4,&ME's track IDs,61240,Spotify,All I Got,66,2024-09-27,Masšh,52,0.611,0.85500,...,5102.040816,1767.346939,0.294148,0.892910,0.899610,[[0.14407242]],[[0.3124742]],genre,spotify,external
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124381,zen garden,134555,Spotify,Voltana Stream,43,2022-10-21,Bellezza Eterna,45,0.122,0.07370,...,6220.060967,2149.804035,0.318654,0.809618,0.853568,[[0.1204494]],[[0.28349945]],mood,spotify,spotify
124382,zen garden,134555,Spotify,Whispering Dryad,50,2022-11-19,Ashcan Johansen,41,0.196,0.16000,...,6220.060967,2149.804035,0.318654,0.809618,0.853568,[[0.1204494]],[[0.28349945]],mood,spotify,spotify
124383,zen garden,134555,Spotify,With the Flow,53,2024-03-10,Park Leaf,40,0.107,0.14000,...,6220.060967,2149.804035,0.318654,0.809618,0.853568,[[0.1204494]],[[0.28349945]],mood,spotify,spotify
124384,zen garden,134555,Spotify,Zen Forest (Binaural),40,2022-10-15,Textures of Nature,37,0.335,0.14500,...,6220.060967,2149.804035,0.318654,0.809618,0.853568,[[0.1204494]],[[0.28349945]],mood,spotify,spotify
