# Description requests

## Setup

In [None]:
# Import modules
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


In [None]:
# API connexion
client_id = 'cec8979c027344f98b471a991aa415ad'
client_secret = 'ad69bcba55b349f98c1344b006c708bd'

client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


## Get playlists description

In [None]:
# Load data
df = pd.read_excel('/Users/julienmbarki/Documents/Doctorat/Publications/Article 2/Data/Code/data_requests/playlists_info_clean.xlsx')

# Remove playlists with "This is" and "Written By" in the name
df = df[~df['playlist_name'].str.contains('This Is', na=False)]
df = df[~df['playlist_name'].str.contains('Written By', na=False)]


In [None]:
# API scraping
playlists_ids = df['playlist_id'].tolist()

playlists = []

for playlist_id in playlists_ids:
    print(f"Processing playlist: {playlist_id}")

    playlists_results = sp.playlist(playlist_id)

    playlist_name = playlists_results['name']
    playlist_description = playlists_results['description']

    playlists.append({
        "playlist_id": playlist_id, 
        "playlist_name": playlist_name,
        "playlist_description": playlist_description
    })

# Create dataframe from playlists
df_playlists = pd.DataFrame(playlists, columns=["playlist_id", "playlist_name", "playlist_description"])
df_playlists.to_excel("playlists_descriptions.xlsx", index=False)

print(df_playlists)


## Process playlists descrption with text analysis

In [None]:
# Load data
df = pd.read_excel('playlists_descriptions.xlsx')


In [9]:
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Step 1: Preprocess text
# Combine English and French stopwords and add custom words
custom_stopwords = set(stopwords.words('english'))  # Default English stopwords
custom_stopwords.update(stopwords.words('french'))  # Add French stopwords
custom_stopwords.update(['cover', 'photo'])  # Add specific words to exclude
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):  # Check if the text is a string
        tokens = word_tokenize(text.lower())  # Lowercase and tokenize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens
        tokens = [word for word in tokens if word not in custom_stopwords]  # Remove stopwords
        return tokens
    else:
        return []  # Return an empty list if the text is not a string (e.g., NaN)

df['processed_description'] = df['playlist_description'].apply(preprocess_text)

# Step 2: Create a dictionary and corpus
dictionary = corpora.Dictionary(df['processed_description'])
corpus = [dictionary.doc2bow(text) for text in df['processed_description']]

# Step 3: Apply LDA model
num_topics = 4  # Number of topics to extract
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=0)

# Display topics and top words in each topic
topics = lda_model.print_topics(num_words=5)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

# Step 4: Assign the dominant topic to each description
def get_dominant_topic(text):
    bow = dictionary.doc2bow(text)
    topic_probs = lda_model.get_document_topics(bow)
    dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
    return dominant_topic

df['dominant_topic'] = df['processed_description'].apply(get_dominant_topic)

# Display the DataFrame with dominant topics
print(df[['playlist_description', 'dominant_topic']])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/julienmbarki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julienmbarki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/julienmbarki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/julienmbarki/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Topic 0: 0.016*"sound" + 0.014*"playlist" + 0.010*"music" + 0.010*"listen" + 0.007*"song"
Topic 1: 0.017*"song" + 0.014*"music" + 0.008*"piano" + 0.007*"netflix" + 0.007*"instrumental"
Topic 2: 0.021*"music" + 0.018*"new" + 0.015*"best" + 0.009*"hit" + 0.008*"rock"
Topic 3: 0.064*"track" + 0.056*"right" + 0.052*"update" + 0.035*"played" + 0.034*"daily"
                                   playlist_description  dominant_topic
0     The hottest Afropop, Afrobeats and Afro-Caribb...               0
1     Tracks popping off in the Afro scene. Cover: Q...               1
2     Le meilleur du rap africain francophone. Photo...               0
3     The tracks heating up the continent right now!...               3
4     Les plus belles voix de la musique africaine. ...               3
...                                                 ...             ...
1560  Your daily update of the most viral tracks rig...               3
1561  Your daily update of the most viral tracks rig...              

## Merge with panel data