# Load lyrics data

In [1]:
import string
import csv
import pandas as pd

# Filter for English songs

In [None]:
# Stream the raw lyrics CSV in fixed-size chunks and keep only the rows whose
# 'language' column equals 'en'. The filtered chunks are collected in a list
# and concatenated once at the end: concatenating onto a growing DataFrame
# inside the loop re-copies every previously kept row on each iteration.
chunk_size = 100000  # Number of rows per chunk
english_chunks = []
rows_kept = 0

for chunk in pd.read_csv('song_lyrics.csv', chunksize=chunk_size):
    # Filter the chunk to keep only rows where language is 'en'
    filtered_chunk = chunk[chunk['language'] == 'en']
    english_chunks.append(filtered_chunk)

    # Print the running number of rows kept so far
    rows_kept += len(filtered_chunk)
    print(f'Number of rows after processing chunk: {rows_kept}')

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced no chunks at all.
full_df = pd.concat(english_chunks, ignore_index=True) if english_chunks else pd.DataFrame()

full_df.to_csv('english songs.csv', index=False)

# Clean data

In [None]:
# Load the English-only songs.
# NOTE(review): this is an absolute, machine-specific path, whereas the cell
# that writes the file uses the relative name 'english songs.csv' — confirm
# both refer to the same location.
df = pd.read_csv('/Users/jonnycodd/Documents/MASTERS/Text mining/Project/english songs.csv')

# Display value counts for the 'year' column without row truncation.
# option_context restores the previous 'display.max_rows' value on exit, even
# if the print raises, instead of unconditionally resetting it to the default.
with pd.option_context('display.max_rows', None):
    print(df['year'].value_counts())

In [None]:
# Normalise separators: turn every hyphen into a space in the title and
# artist columns of both the 'charts' and the 'df' DataFrames.
# NOTE(review): `charts` is not defined in any cell visible in this notebook —
# presumably it is loaded elsewhere; confirm before running on a fresh kernel.
for frame, columns in ((charts, ('Title', 'Main_Artist')), (df, ('title', 'artist'))):
    for column in columns:
        frame[column] = frame[column].str.replace('-', ' ', regex=False)

# Deletion table for every character in string.punctuation. Built once here
# rather than on every call, since the function is mapped over whole columns.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(s):
    """Return `s` with all `string.punctuation` characters removed.

    Only the ASCII punctuation listed in `string.punctuation` is stripped;
    any other character is left untouched.

    Parameters
    ----------
    s : object
        Value to clean. Non-string values (e.g. NaN or numbers) are
        returned unchanged.

    Returns
    -------
    object
        The cleaned string, or `s` itself when it is not a string.
    """
    if isinstance(s, str):
        return s.translate(_PUNCTUATION_TABLE)
    # Return as is if not a string (e.g., NaN or numeric)
    return s

# Lower-case, strip punctuation from, and trim the title and artist columns
# of both the 'charts' and the 'df' DataFrames.
for frame, columns in ((charts, ('Title', 'Main_Artist')), (df, ('title', 'artist'))):
    for column in columns:
        lowered = frame[column].str.lower()
        frame[column] = lowered.map(remove_punctuation).str.strip()

In [None]:
# Keep only the relevant columns of 'df'
columns_to_keep = ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics']
df = df.loc[:, columns_to_keep]


In [None]:
# One row per (title, artist) pair; the first occurrence is the one kept.
df = df.drop_duplicates(subset=['title', 'artist'], keep='first')

In [None]:
# Rename the 'year' column to 'release year'.
df = df.rename({'year': 'release year'}, axis='columns')

In [None]:
# For every release year keep the 4000 rows with the largest 'views' value,
# then flatten the result back to a plain 0..n-1 index.
songs_by_year = df.groupby('release year')
most_viewed = songs_by_year.apply(lambda year_rows: year_rows.nlargest(4000, 'views'))
top_songs_per_year = most_viewed.reset_index(drop=True)
top_songs_per_year

In [None]:
# Persist the per-year selection without the index column.
top_songs_path = 'top songs.csv'
top_songs_per_year.to_csv(top_songs_path, index=False)

# Define functions

In [11]:
#additional functions
def strip(word):
    """Return `word` with every run of non-word characters (regex \\W) deleted."""
    return re.sub(pattern=r'\W+', repl='', string=word)

#keeps the original casing of abbreviation-like words, lower-cases the rest
#(words kept as-is will be ignored when using standard stemming)
def abbr_or_lower(word):
    """Return `word` unchanged if it looks like an abbreviation, else lower-cased.

    A word is kept as-is when, from its first character, it consists of at
    least two consecutive chunks of "one or more uppercase letters followed
    by optional lowercase letters" (e.g. 'USA', 'McDonald'). Everything
    else, including ordinary capitalised words such as 'Hello', is
    lower-cased.
    """
    looks_like_abbreviation = re.match('([A-Z]+[a-z]*){2,}', word) is not None
    return word if looks_like_abbreviation else word.lower()

#modular pipeline for stemming, lemmatizing and lowercasing
#note this is NOT lemmatizing using grammar pos

def tokenize(text, modulation):
    """Normalise `text` into a single space-separated string of tokens.

    Parameters
    ----------
    text : str
        Raw text to process.
    modulation : int
        Selects the pipeline:
        * 0  -- split on non-word characters, apply `abbr_or_lower`, drop
                stop words and tokens containing no ASCII letter.
        * 1  -- as 0, then stem each token with `porter.stem`.
        * >=2 -- run the spaCy pipeline `sp`, take each token's `lemma_`,
                remove non-word characters with `strip`, apply
                `abbr_or_lower`, and drop empty results and stop words.
        Any other value below 2 yields an empty string, because no token
        is ever appended.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    stems = []
    if modulation < 2:
        for token in re.split(r'\W+', text):
            lowers = abbr_or_lower(token)
            # filter out stop words and any tokens not containing letters
            # (e.g., numeric tokens, raw punctuation, empty split results)
            if lowers in stop_words or not re.search('[a-zA-Z]', lowers):
                continue
            if modulation == 0:
                stems.append(lowers)
            elif modulation == 1:
                stems.append(porter.stem(lowers))
    else:
        for word in sp(text):
            # Normalise each lemma once and reuse the result for both the
            # emptiness check and the stop-word check.
            cleaned = abbr_or_lower(strip(word.lemma_))
            if cleaned and cleaned not in stop_words:
                stems.append(cleaned)
    return " ".join(stems)


def vectorize(tokens, vocab):
    """Return, for each entry of `vocab` in order, `tokens.count(entry)` as a list."""
    return [tokens.count(term) for term in vocab]

In [12]:
# Define preprocessing pipelines
def preprocess_text_data(corpus_data, mod, column):
    """Tokenize one text column of `corpus_data` in place and return the frame.

    Parameters
    ----------
    corpus_data : pandas.DataFrame
        Frame holding the text; it is modified in place (rows with a missing
        value in `column` are dropped and the index is reset).
    mod : int
        Pipeline selector forwarded to `tokenize` as its `modulation`.
    column : str
        Name of the text column to process.

    Returns
    -------
    pandas.DataFrame
        The same `corpus_data` object, with `column` replaced by the
        tokenized strings.
    """
    # Drop missing values BEFORE the string cast: astype(str) turns NaN into
    # the literal text 'nan', which would then be tokenized as a real word
    # and could never be removed by a later dropna.
    corpus_data.dropna(subset=[column], inplace=True)
    corpus_data.reset_index(drop=True, inplace=True)

    # progress_apply is registered on pandas objects by tqdm.pandas()
    text_preproc = corpus_data[column].astype(str).progress_apply(lambda row: tokenize(row, mod))

    # Convert to string
    corpus_data[column] = text_preproc.astype(str)

    return corpus_data


# Load data

In [10]:
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()
import numpy as np
from matplotlib import pyplot as plt

from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer 
from nltk.corpus import stopwords

import spacy
sp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [14]:
#English stop words from NLTK, held in a set for membership checks
stop_words = set(stopwords.words('english'))

#stemmer (a Snowball stemmer bound to the name `porter`) and a WordNet lemmatizer
porter = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()

In [6]:
# Load the per-year top songs from disk.
top_songs_csv = '/Users/jonnycodd/Documents/MASTERS/Text mining/Project/top songs.csv'
lyrics_df = pd.read_csv(top_songs_csv)

In [7]:
# Bare last expression: the notebook renders the loaded frame as the cell output.
lyrics_df 

Unnamed: 0,title,tag,artist,release year,views,features,lyrics
0,let it be,rock,the beatles,1970,1481859,{},[Verse 1]\nWhen I find myself in times of trou...
1,your song,rock,elton john,1970,1323166,{},"[Verse 1]\nIt's a little bit funny, this feeli..."
2,paranoid,rock,black sabbath,1970,508767,{},[Intro]\n\n[Verse 1]\nFinished with my woman\n...
3,immigrant song,rock,led zeppelin,1970,494355,{},"[Intro]​\n(2, 3, 4)\n\n[Chorus]\nAhh! Ahh!\nWe..."
4,have you ever seen the rain,rock,creedence clearwater revival,1970,468949,{},[Verse 1]\nSomeone told me long ago\nThere's a...
...,...,...,...,...,...,...,...
191995,attention female perspective,pop,andie case,2017,34129,{},"[Verse 1]\nI been runnin' 'round, runnin' 'rou..."
191996,​nobody 😔,rap,shinigami,2017,34115,{​shinigami},[Verse 1]\nTake me away from this path that I ...
191997,my time,rap,wolves,2017,34114,{},I'm ready to play now\nPut me in the game now\...
191998,down in flames,pop,ella vos,2017,34113,{},[Verse 1]\nSlippin' off the edge\nOut of phase...


# Clean song lyrics

In [8]:
# Two clean-up passes over the lyrics, applied in this order:
#   1. delete anything enclosed in square brackets (non-greedy, so each
#      bracket pair is removed separately)
#   2. turn every newline into a single space
bracket_free = lyrics_df['lyrics'].str.replace(r'\[.*?\]', '', regex=True)
lyrics_df['lyrics'] = bracket_free.str.replace('\n', ' ', regex=False)

In [15]:
# Run the lyrics column through the mod=2 (spaCy lemma) branch of tokenize.
lyrics_df = preprocess_text_data(corpus_data=lyrics_df, mod=2, column='lyrics')

 13%|█▎        | 24836/192000 [12:57<1:44:19, 26.70it/s]

In [None]:
# Discard rows whose lyrics are missing, then write the processed frame to
# disk without the index column.
has_lyrics = lyrics_df['lyrics'].notna()
lyrics_df = lyrics_df[has_lyrics]
lyrics_df.to_csv('../../data/top songs processed.csv', index=False)