# Load lyrics data

In [1]:
import string
import csv
import pandas as pd

# Filter for English songs

In [None]:
# Stream the raw CSV in chunks and keep only the rows whose language is 'en'.
# The filtered chunks are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies every accumulated row on each
# iteration, which grows quadratically with the number of chunks.
chunk_size = 100000  # Number of rows per chunk
english_chunks = []
rows_kept = 0
for chunk in pd.read_csv('song_lyrics.csv', chunksize=chunk_size):
    # Filter the chunk to keep only rows where language is 'en'
    filtered_chunk = chunk[chunk['language'] == 'en']
    english_chunks.append(filtered_chunk)

    # Print the running number of rows kept so far
    rows_kept += len(filtered_chunk)
    print(f'Number of rows after processing chunk: {rows_kept}')

# pd.concat raises on an empty list, so fall back to an empty frame
full_df = pd.concat(english_chunks, ignore_index=True) if english_chunks else pd.DataFrame()

full_df.to_csv('english songs.csv', index=False)

# Clean data

In [16]:
# Read the filtered file using the same relative path the filtering step wrote
# it to, so the notebook does not depend on one machine's directory layout.
df = pd.read_csv('english songs.csv')

# Display value counts for the 'year' column without row truncation.
# option_context lifts the limit only for this print and then restores the
# previously configured value, even if the print raises.
with pd.option_context('display.max_rows', None):
    print(df['year'].value_counts())

year
2015    267503
2013    116348
2016    109859
2014     97957
2017     91676
2012     67062
2011     56778
2010     44410
2009     43843
2008     43026
2006     41787
2007     40757
2005     39348
2004     37494
2003     36273
2002     32877
2001     31007
2000     28602
1999     27684
1998     24503
1997     22342
1996     21119
1995     18925
1994     16935
1993     15155
1992     12716
1991     11979
1990     10371
1989      9889
1988      8750
1987      7922
1986      7040
1985      6589
1984      6275
1983      6209
1982      6179
1981      5766
1980      5497
1979      5196
1978      4837
1973      4504
1977      4504
1972      4457
1970      4418
1975      4364
1971      4356
1976      4195
1974      4141
Name: count, dtype: int64


In [18]:
# Turn every hyphen into a space in the 'title' and 'artist' columns of 'df'
for text_column in ('title', 'artist'):
    df[text_column] = df[text_column].str.replace('-', ' ', regex=False)

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call, because the function is mapped
# over whole DataFrame columns.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(s):
    """Return `s` with all `string.punctuation` characters removed.

    Values that are not `str` (e.g. NaN or numbers) are returned unchanged.
    """
    if not isinstance(s, str):
        # Return as is if not a string (e.g., NaN or numeric)
        return s
    return s.translate(_PUNCTUATION_TABLE)



# Normalise 'title' and 'artist' in 'df': lowercase, remove punctuation, trim whitespace
for text_column in ('title', 'artist'):
    lowered = df[text_column].str.lower()
    df[text_column] = lowered.map(remove_punctuation).str.strip()

In [19]:
# Columns kept for the rest of the notebook; all others are discarded
columns_to_keep = ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics']

# Restrict 'df' to exactly those columns
df = df.loc[:, columns_to_keep]


In [20]:
# One row per (title, artist) pair; the first occurrence in 'df' is the one kept
df = df.drop_duplicates(subset=['title', 'artist'], keep='first')

In [21]:
# Rename the 'year' column to 'release year'
df = df.rename({'year': 'release year'}, axis='columns')

In [22]:
# Keep the TOP_SONGS_PER_YEAR most-viewed songs of each release year.
# The limit is a named constant so the description and the code stay in sync.
TOP_SONGS_PER_YEAR = 1000
top_songs_per_year = (
    df.groupby('release year')
    .apply(lambda x: x.nlargest(TOP_SONGS_PER_YEAR, 'views'))
    .reset_index(drop=True)
)
top_songs_per_year

Unnamed: 0,title,tag,artist,release year,views,features,lyrics
0,let it be,rock,the beatles,1970,1481859,{},[Verse 1]\nWhen I find myself in times of trou...
1,your song,rock,elton john,1970,1323166,{},"[Verse 1]\nIt's a little bit funny, this feeli..."
2,paranoid,rock,black sabbath,1970,508767,{},[Intro]\n\n[Verse 1]\nFinished with my woman\n...
3,immigrant song,rock,led zeppelin,1970,494355,{},"[Intro]​\n(2, 3, 4)\n\n[Chorus]\nAhh! Ahh!\nWe..."
4,have you ever seen the rain,rock,creedence clearwater revival,1970,468949,{},[Verse 1]\nSomeone told me long ago\nThere's a...
...,...,...,...,...,...,...,...
47995,glory,rock,dermot kennedy,2017,166391,{},[Verse 1]\nFor all the moments never known\n'C...
47996,these girls,pop,why dont we,2017,165958,"{""Why Don\\'t We""}",[Verse: Daniel Seavey & Jonah Marais]\nI've be...
47997,aap ferg,rap,nav metro boomin,2017,165924,"{""Lil Uzi Vert""}","[Intro: Lil Uzi Vert, NAV & Future]\nIf Young ..."
47998,tomorrow til infinity,rap,young thug,2017,165884,{Gunna},[Intro: Young Thug & Gunna]\nHope I see you (Y...


In [23]:
# Write the per-year selection to disk without the index column
top_songs_per_year.to_csv(path_or_buf='top songs.csv', index=False)

# Define functions

In [24]:
#additional functions
def strip(word):
    """Return `word` with every run of non-word characters removed.

    "Non-word" means anything that is not a letter, digit or underscore.
    """
    return re.sub(r'\W+', '', word)

# Lower-cases a word unless it begins with at least two consecutive groups of
# "one or more capitals followed by optional lowercase letters"
# (e.g. "USA", "McDonald"); such words are returned untouched.
# Per the original note, the preserved casing is ignored when standard stemming is used.
def abbr_or_lower(word):
    keeps_case = re.match('([A-Z]+[a-z]*){2,}', word) is not None
    return word if keeps_case else word.lower()

#modular pipeline for stemming, lemmatizing and lowercasing
#note this is NOT lemmatizing using grammar pos
    
def tokenize(text, modulation):
    """Normalise `text` and return the kept tokens joined by single spaces.

    `modulation` selects the pipeline:
      * 0    -- split on non-word characters and case-normalise each token
                with `abbr_or_lower`
      * 1    -- as 0, then stem each token with `porter`
      * >= 2 -- lemmatise with the spaCy pipeline `sp`, remove non-word
                characters from each lemma with `strip`, then case-normalise
    Tokens found in `stop_words` are dropped in every mode. In modes 0 and 1,
    tokens without an ASCII letter are dropped as well. Any other value below 2
    yields an empty string, because no token is appended.
    """
    stems = []
    if modulation < 2:
        for token in re.split(r'\W+', text):
            lowers = abbr_or_lower(token)
            # skip stop words and tokens not containing letters
            # (e.g., numeric tokens, empty strings produced by the split)
            if lowers in stop_words or not re.search('[a-zA-Z]', lowers):
                continue
            if modulation == 0:
                stems.append(lowers)
            elif modulation == 1:
                stems.append(porter.stem(lowers))
    else:
        for word in sp(text):
            # normalise each lemma once and reuse the result for both checks
            cleaned = abbr_or_lower(strip(word.lemma_))
            if cleaned and cleaned not in stop_words:
                stems.append(cleaned)
    return " ".join(stems)


def vectorize(tokens, vocab):
    """Return, for each entry of `vocab` in order, the value of `tokens.count(entry)`."""
    return [tokens.count(term) for term in vocab]

In [25]:
# Define preprocessing pipelines
def preprocess_text_data(corpus_data, mod, column):
    """Tokenize one text column of `corpus_data` in place and return the frame.

    Args:
        corpus_data: DataFrame holding the text; it is modified in place.
        mod: pipeline selector passed to `tokenize` as `modulation`.
        column: name of the text column to process.

    Returns:
        The same `corpus_data` object, with rows missing `column` dropped,
        the index reset, and `column` replaced by the tokenized strings.
    """
    # Drop missing values BEFORE converting to str: astype(str) would turn NaN
    # into the literal text "nan", after which dropna can no longer find it.
    corpus_data.dropna(subset=[column], inplace=True)
    corpus_data.reset_index(drop=True, inplace=True)

    # progress_apply is the pandas method registered by tqdm.pandas()
    text_preproc = corpus_data[column].astype(str).progress_apply(lambda row: tokenize(row, mod))

    # Convert to string
    corpus_data[column] = text_preproc.astype(str)

    return corpus_data


# Load data

In [26]:
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()
import numpy as np
from matplotlib import pyplot as plt

from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer 
from nltk.corpus import stopwords

import spacy
sp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [27]:
# English stop-word set looked up by the tokenizer
stop_words = set(stopwords.words('english'))

# Stemmer and lemmatizer objects; note that `porter` holds a Snowball stemmer
porter = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()

In [32]:
# Load the per-year selection from 'top songs.csv'
lyrics_df = pd.read_csv(filepath_or_buffer='top songs.csv')

In [33]:
lyrics_df 

Unnamed: 0,title,tag,artist,release year,views,features,lyrics
0,let it be,rock,the beatles,1970,1481859,{},[Verse 1]\nWhen I find myself in times of trou...
1,your song,rock,elton john,1970,1323166,{},"[Verse 1]\nIt's a little bit funny, this feeli..."
2,paranoid,rock,black sabbath,1970,508767,{},[Intro]\n\n[Verse 1]\nFinished with my woman\n...
3,immigrant song,rock,led zeppelin,1970,494355,{},"[Intro]​\n(2, 3, 4)\n\n[Chorus]\nAhh! Ahh!\nWe..."
4,have you ever seen the rain,rock,creedence clearwater revival,1970,468949,{},[Verse 1]\nSomeone told me long ago\nThere's a...
...,...,...,...,...,...,...,...
47995,glory,rock,dermot kennedy,2017,166391,{},[Verse 1]\nFor all the moments never known\n'C...
47996,these girls,pop,why dont we,2017,165958,"{""Why Don\\'t We""}",[Verse: Daniel Seavey & Jonah Marais]\nI've be...
47997,aap ferg,rap,nav metro boomin,2017,165924,"{""Lil Uzi Vert""}","[Intro: Lil Uzi Vert, NAV & Future]\nIf Young ..."
47998,tomorrow til infinity,rap,young thug,2017,165884,{Gunna},[Intro: Young Thug & Gunna]\nHope I see you (Y...


# Clean song lyrics

In [34]:
# Strip bracketed annotations such as "[Verse 1]", then turn each newline into a space
lyrics_df['lyrics'] = (
    lyrics_df['lyrics']
    .str.replace(r'\[.*?\]', '', regex=True)
    .str.replace('\n', ' ', regex=False)
)

In [35]:
# mod=2 selects the spaCy lemmatizing branch of tokenize
lyrics_df = preprocess_text_data(corpus_data=lyrics_df, mod=2, column='lyrics')

  0%|          | 0/48000 [00:00<?, ?it/s]

100%|██████████| 48000/48000 [46:32<00:00, 17.19it/s]  


In [36]:
# Discard rows whose lyrics are missing, then write the processed corpus to disk
has_lyrics = lyrics_df['lyrics'].notna()
lyrics_df = lyrics_df[has_lyrics]
lyrics_df.to_csv('../../data/top songs processed.csv', index=False)