Filter out non-lyrics -> Preprocess the raw lyrics -> Detect language -> Keep only English-language songs -> Add polarity -> Remove stopwords -> Add polarity again

In [None]:
# Imports
import pandas as pd
import re
import spacy
from langdetect import detect
from textblob import TextBlob
nlp = spacy.load('en_core_web_sm')

# Functions
def filter_lyrics_by_keywords(df):
    """Keep only the rows whose ``lyrics`` text contains a song-section tag.

    A row is kept when its ``lyrics`` value contains, case-insensitively,
    one of the markers ``[Intro``, ``[Chorus``, ``[Verse``, ``[Bridge`` or
    ``[Outro``. Rows with missing lyrics are dropped.

    Args:
        df: DataFrame with a ``lyrics`` column.

    Returns:
        An independent copy of the matching rows (original index kept), so
        callers can add or overwrite columns without touching ``df`` and
        without triggering pandas' chained-assignment warning.
    """
    # Colon-suffixed variants such as '[Intro:' are already matched by the
    # bare prefix, so only the prefixes are listed.
    section_markers = ('[Intro', '[Chorus', '[Verse', '[Bridge', '[Outro')
    pattern = '|'.join(re.escape(marker) for marker in section_markers)

    has_marker = df['lyrics'].str.contains(pattern, case=False, na=False, regex=True)

    return df[has_marker].copy()

def clean_lyrics(lyrics):
    """Normalise raw lyrics to lowercase ASCII letters separated by single spaces.

    Steps: drop bracketed tags such as ``[Chorus]``, remove every character
    that is neither an ASCII letter nor whitespace, collapse whitespace runs
    (including line breaks) to one space, lowercase, and strip both ends.

    Args:
        lyrics: Raw lyrics. Non-string values are converted with ``str()``;
            missing values (``None``, NaN, ``pd.NA``) are treated as empty.

    Returns:
        The cleaned text, or ``''`` for a missing value.
    """
    # str(nan) would otherwise leak the literal word "nan" into the output
    if pd.api.types.is_scalar(lyrics) and pd.isna(lyrics):
        return ''

    text = str(lyrics)

    # Remove everything in []; non-greedy so each tag is removed on its own
    text = re.sub(r'\[.*?\]', '', text)

    # Remove everything that is not a letter a-z, A-Z (whitespace is kept)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # \s already matches \n and \r, so one pass also flattens line breaks
    text = re.sub(r'\s+', ' ', text)

    return text.lower().strip()

def detect_language_in_df(df):
    """Drop short lyrics and tag the remaining rows with a detected language.

    Args:
        df: DataFrame with a ``lyrics`` column.

    Returns:
        A copy of the rows whose lyrics are longer than 350 characters, with
        a new ``language_detected`` column holding the code returned by
        ``langdetect.detect``, or ``'none'`` when the value is not a string
        or no language could be detected.
    """
    # Imported locally so the function does not depend on an extra
    # file-level import being present.
    from langdetect import LangDetectException

    # NOTE(review): langdetect documents its results as non-deterministic
    # unless DetectorFactory.seed is set — consider seeding it once at startup.
    def _detect_or_none(text):
        if not isinstance(text, str):
            return 'none'
        try:
            return detect(text)
        except LangDetectException:
            # langdetect raises this when it cannot extract features from the
            # text; a single such row should not abort the whole run.
            return 'none'

    # Filter out short lyrics
    long_enough = df.loc[df['lyrics'].str.len() > 350].copy()

    # Create a new column with the detected language
    long_enough['language_detected'] = long_enough['lyrics'].apply(_detect_or_none)

    return long_enough

def get_polarity(lyrics):
    """Return the TextBlob sentiment polarity score of ``lyrics``."""
    return TextBlob(lyrics).sentiment.polarity

# Single place to repoint the pipeline at another machine or dataset
DATA_DIR = '/Users/juliusriss/Desktop/data-science-project-local/data'

# Read CSV files
df = pd.read_csv(f'{DATA_DIR}/usa_17-24_with_lyrics.csv')

# Filter out non-lyric rows (no section markers in the text)
df = filter_lyrics_by_keywords(df)

# Clean the lyrics (empty spaces, etc.)
df['lyrics'] = df['lyrics'].apply(clean_lyrics)

# Detect the language of the lyrics and save that intermediate result
df = detect_language_in_df(df)
df.to_csv(f'{DATA_DIR}/usa_17-24_with_language.csv', index=False)

# Only keep English-language lyrics; copy so the column assignments below
# operate on an independent frame rather than a slice
df = df[df['language_detected'] == 'en'].copy()

# Add polarity
df['polarity'] = df['lyrics'].apply(get_polarity)

# Add location
# NOTE(review): the input/output files are named usa_*, yet the location is
# set to 'Global' — confirm this is intended.
df['location'] = 'Global'

# Save the cleaned data
df.to_csv(f'{DATA_DIR}/usa_17-24_with_polarity.csv', index=False)