In [57]:
import pandas as pd
from json import loads, dumps, dump
import unidecode
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the lyrics table from disk; the first CSV row holds the column names
# and the file is decoded as windows-1252.
df = pd.read_csv(
    'data/taylor_swift_lyrics.csv',
    sep=',',
    header=0,
    encoding='windows-1252',
)


In [30]:
# Serialise the frame row-by-row to JSON and parse it back, giving a plain
# dict of row records keyed by the row label as a string ('0', '1', ...).
songs_json = df.to_json(orient="index")
songs = loads(songs_json)

In [145]:
# Peek at the first row record to see which fields each lyric line carries.
songs["0"]

{'artist': 'Taylor Swift',
 'album': 'Taylor Swift',
 'track_title': 'Tim McGraw',
 'track_n': 1,
 'lyric': 'He said the way my blue eyes shined',
 'line': 1,
 'year': 2006}

## Chunking

In [159]:
def make_chunks(full_lyrics, chunk_size=150, chunk_overlap_part=4):
    """Split a song's full lyrics into overlapping text chunks.

    Args:
        full_lyrics: The complete lyrics of one song as a single string.
        chunk_size: Maximum chunk length, measured in characters (``len``).
        chunk_overlap_part: Divisor of ``chunk_size`` that sets the overlap
            between consecutive chunks (4 means a quarter of a chunk).

    Returns:
        A list of ``{"line": chunk_text}`` dicts, one per chunk, in the order
        produced by the splitter.
    """
    # The splitter expects an integer overlap; floor division avoids handing
    # it a float such as 37.5. Lengths are integers, so the floor selects the
    # same chunks as the fractional value would.
    chunk_overlap = chunk_size // chunk_overlap_part

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
        keep_separator=False,
        # Separator candidates handed to the splitter: newline, sentence end,
        # space, and finally the empty string.
        separators=["\n", ". ", " ", ""],
    )

    docs = text_splitter.create_documents([full_lyrics])
    return [{"line": doc.page_content} for doc in docs]

In [160]:
# Collapse the one-row-per-lyric-line records in `songs` into one record per
# track, each with its individual lines, the concatenated lyrics and chunks.
max_ind = len(songs)
artist = "Taylor Swift"

processed_songs = []


def _close_song(song, lyrics, full_lyrics):
    """Attach the accumulated lyric data to `song` and return it."""
    song["lyrics"] = lyrics
    song["full_lyrics"] = {"line": full_lyrics}
    song["chunks"] = make_chunks(full_lyrics)
    return song


song = None          # record currently being assembled; None before the first row
lyrics = []
full_lyrics = ""
previous_key = None  # (album, track_n) of the track currently being assembled

for i in range(max_ind):
    row = songs[str(i)]

    # `unidecode` is imported as a module, so call the function inside it.
    name = unidecode.unidecode(row["track_title"])
    album = unidecode.unidecode(row["album"])
    track_n = row["track_n"]

    # Keying on the album as well as the track number keeps two adjacent
    # tracks that share a number on different albums from being merged.
    key = (album, track_n)

    if key != previous_key:
        # A new track starts on this row: store the finished one first.
        if song is not None:
            processed_songs.append(_close_song(song, lyrics, full_lyrics))
        song = {"artist": artist, "id": track_n - 1, "name": name, "album": album}
        lyrics = []
        full_lyrics = ""
        previous_key = key

    # Every row contributes its lyric, including the first row of a new track.
    text = unidecode.unidecode(row["lyric"])
    lyrics.append({"line": text})
    full_lyrics = full_lyrics + text + " \n "

# The final track has no following row to trigger a flush, so store it here.
if song is not None:
    processed_songs.append(_close_song(song, lyrics, full_lyrics))


In [165]:
import numpy as np

# Compare chunk sizes against raw lyric-line sizes for the first processed song.
example = processed_songs[0]

total_chars = len(example["full_lyrics"]["line"])
print(f'The lyrics of this song have a length of {total_chars} characters')

sizes = [len(chunk["line"]) for chunk in example["chunks"]]
chunk_count = len(example["chunks"])
print(f'We have {chunk_count} chunks with an average lenght of {np.round(np.mean(sizes), 2)} characters')

sizes = [len(lyric_line["line"]) for lyric_line in example["lyrics"]]
line_count = len(example["lyrics"])
print(f'We have {line_count} lines of lyrics with an average lenght of {np.round(np.mean(sizes), 2)} characters')

The lyrics of this song have a length of 1888 characters
We have 17 chunks with an average lenght of 130.12 characters
We have 55 lines of lyrics with an average lenght of 31.33 characters


In [166]:
# Persist the processed songs as a single JSON document.
with open('data/ts_song.json', 'w') as out_file:
    out_file.write(dumps(processed_songs))

In [147]:
from pydantic import BaseModel

class Song(BaseModel):
    """Schema for one song record.

    Attributes:
        name: Track title.
        artist: Performing artist.
        lyrics: The song's lyric lines (element type is not constrained).
        id: Integer identifier of the song.
        album: Album the track belongs to.
    """

    name: str
    artist: str
    lyrics: list
    id: int
    album: str

    # NOTE(review): the records built earlier in this notebook store each
    # lyric as {"line": text}, while this example lists bare strings.
    # Confirm which shape consumers expect.
    # The example must contain only JSON-serializable values; a literal `...`
    # is the Ellipsis object and breaks serializing the generated schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "name": "Take me to church",
                    "artist": "Hozier",
                    "album": "Hozier",
                    "lyrics": [
                        "Her eyes and words are so icy",
                        "Oh but she burns",
                        "Like rum on the fire",
                    ],
                    "id": 0,
                }
            ]
        }
    }
    

In [148]:
# Check each processed record against the Song schema; the first record that
# does not conform raises a validation error.
for candidate in processed_songs:
    Song.model_validate(candidate)