- comprobar monorrimo (a+)
- comprobar sin rima (\-+)
- dividir cadena rima en grupos de 4
- por cada grupo, comprobar:
    + enclosed rhyme - rima abrazada abba 
    + cross - rima cruzada abab
- dividir en grupos de 2
    + pair - rima pareada aa
- contar porcentaje de cada tipo
- definir tipos de canciones por rima


In [1]:
import re
import pandas as pd
from collections import Counter

In [2]:
# Patterns applied to a rhyme-scheme string. Letters (a-z and ñ) are matched
# case-insensitively through re.I. re.VERBOSE is set on every pattern; none of
# them contains whitespace or "#", which are what that flag would ignore.
MONO_RE = re.compile(r"a+", re.VERBOSE | re.I)  # one or more "a", e.g. aaaa
# A letter followed by at least one repetition of itself, e.g. aa, bbb.
PAIR_RE = re.compile(r"([a-zñ])\1{1,}", re.VERBOSE | re.I)
# Two different letters alternating (the lookahead forbids a == b), e.g. abab.
CROSS_RE = re.compile(r"([a-zñ])(?!\1)([a-zñ])\1\2", re.VERBOSE | re.I)
# Two different letters mirrored (the lookahead forbids a == b), e.g. abba.
ENCLOSED_RE = re.compile(r"([a-zñ])(?!\1)([a-zñ])\2\1", re.VERBOSE | re.I)
# One or more hyphens, e.g. ----. Not referenced by the functions in the
# visible cells; count_rhyme counts "-" characters directly instead.
NO_RHYME_RE = re.compile(r"\-+", re.VERBOSE | re.I)

In [3]:
def calc_percents(rhyme: str, counts: dict) -> dict:
    """Convert raw counts into percentages of the rhyme string's length.

    Args:
        rhyme: Rhyme-scheme string; only its length is used.
        counts: Mapping from a label to a count of characters.

    Returns:
        A new dict with the same keys in the same order. Each value is
        ``count / len(rhyme) * 100``; when ``rhyme`` is empty every value
        is the integer ``0``.
    """
    length = len(rhyme)
    if length <= 0:
        # Nothing to divide by: report zero for every label.
        return {label: 0 for label in counts}
    return {label: (hits/length) * 100 for label, hits in counts.items()}

In [4]:
def split_and_count(rhyme, step, regex):
    """Count the characters covered by fixed-size chunks that match a pattern.

    The string is cut into consecutive, non-overlapping chunks of ``step``
    characters starting at offset 0 (the last chunk may be shorter). Every
    chunk that fully matches ``regex`` contributes ``step`` to the result.

    Args:
        rhyme: Rhyme-scheme string to split.
        step: Chunk size.
        regex: Pattern (string or compiled) tested with ``re.fullmatch``.

    Returns:
        ``step`` times the number of matching chunks.
    """
    offsets = range(0, len(rhyme), step)
    matching_chunks = sum(
        1 for start in offsets if re.fullmatch(regex, rhyme[start:start + step])
    )
    return matching_chunks * step

In [5]:
def count_couplet(rhyme):
    """Count the characters that belong to even-length runs matched by PAIR_RE.

    Every non-overlapping PAIR_RE match in ``rhyme`` is measured; runs with
    an even number of characters add their full length to the total, while
    odd-length runs are ignored entirely.

    Args:
        rhyme: Rhyme-scheme string to scan.

    Returns:
        The summed length of all even-length matches (0 when there are none).
    """
    run_lengths = (len(hit.group(0)) for hit in re.finditer(PAIR_RE, rhyme))
    return sum(size for size in run_lengths if size % 2 == 0)

In [6]:
def count_rhyme(rhyme: str):
    """Return the percentage of a rhyme scheme covered by each rhyme type.

    A string that fully matches MONO_RE is reported as 100% monorhyme (all
    other types stay at zero). Otherwise:

    * ``no_rhyme`` counts the ``-`` characters;
    * ``enclosed_rhyme`` and ``crossed_rhyme`` are counted on consecutive
      4-character chunks via ``split_and_count``;
    * ``couplet`` is only computed when neither enclosed nor crossed
      chunks were found.

    Args:
        rhyme: Rhyme-scheme string.

    Returns:
        The dict produced by ``calc_percents`` for the keys ``monorhyme``,
        ``crossed_rhyme``, ``enclosed_rhyme``, ``couplet`` and ``no_rhyme``
        (in that order).
    """
    labels = ("monorhyme", "crossed_rhyme", "enclosed_rhyme", "couplet", "no_rhyme")
    tallies = dict.fromkeys(labels, 0)

    # Whole-string monorhyme short-circuits every other check.
    if re.fullmatch(MONO_RE, rhyme) is not None:
        tallies["monorhyme"] = len(rhyme)
        return calc_percents(rhyme, tallies)

    tallies["no_rhyme"] = rhyme.count("-")
    enclosed = split_and_count(rhyme, 4, ENCLOSED_RE)
    crossed = split_and_count(rhyme, 4, CROSS_RE)
    tallies["enclosed_rhyme"] = enclosed
    tallies["crossed_rhyme"] = crossed

    # Couplets are only looked for when no 4-chunk pattern was found.
    if enclosed == 0 and crossed == 0:
        tallies["couplet"] = count_couplet(rhyme)

    return calc_percents(rhyme, tallies)

In [7]:
# Load the lyrics table; later cells read its "spotify_id" and "rhyme" columns.
df = pd.read_parquet("../rantanplan-data/rhyme-stress/spotify-rantanplan-lyrics.parquet")

In [8]:
# Keep only the rows whose "spotify_id" is not null.
df = df[df["spotify_id"].notna()]

In [9]:
# Renumber the rows from 0 and discard the old index. The later df.join(...)
# aligns on index labels, so the filtered rows need a fresh 0..n-1 index to
# line up with the per-row results.
df = df.reset_index(drop=True)

In [10]:
# One percentage dict per row, in the same order as df["rhyme"].
counts_rhyme = list(map(count_rhyme, df["rhyme"]))

In [11]:
# Expand the list of per-row dicts into one column per rhyme type and attach
# those columns to df (the join aligns on the index).
total = df.join(pd.json_normalize(counts_rhyme))

In [12]:
# Save the combined table as gzip-compressed parquet, without the index.
total.to_parquet("../spotify-rhyme-groups.parquet", compression="gzip", index=False)

In [13]:
# Preview the first rows of the combined table.
total.head()

Unnamed: 0,id,artist_name,track_name,lyrics,danceability,energy,key,loudness,mode,speechiness,...,valence,tempo,spotify_id,rhyme,stress,monorhyme,crossed_rhyme,enclosed_rhyme,couplet,no_rhyme
0,0,Los Chunguitos,Dame veneno,Dame veneno q quiero morir dame veneno\nq ante...,0.504,0.817,10.0,-4.918,1.0,0.134,...,0.787,175.747,1pFQSYFg9xZSUZLA6Om0R5,abcabc-d-dee--fghfgh-i-ijk-kjlmjlm,"[+--+--+--++--+-, -+--+--+-, --+-+-+--+----+-,...",0.0,0.0,0.0,5.882353,20.588235
1,1,Los Chunguitos,Cuando La Veo Pasar,"Olee, vamos con ella:\nTodos los días paso por...",0.661,0.702,6.0,-8.717,0.0,0.0452,...,0.873,84.906,1fg4jlqtMqvY1cTje7Omx8,-a-a-b-bccc-ddeefff-,"[-++--+-, +--++---+-, -+--+-, -+-+-+--+-, --+-...",0.0,0.0,0.0,20.0,30.0
2,2,Alianza,Voy De Frente,Me propuse nadar contra la corriente.\n(Jamás ...,0.633,0.791,9.0,-7.939,1.0,0.0289,...,0.589,127.23,5iryPxl257pwf7mkRhIgoc,abcbacb---bd-edededededededed-,"[--+--+----+-, -+-+--+-, +-+--++--+-, ---+--+-...",0.0,40.0,0.0,0.0,16.666667
3,3,Alianza,Ven a Calmar Mi Dolor,¡Hey! ¡Hey!\n¡Hey! ¡Hey!\n¡Hey! ¡Hey!\nSi vien...,0.589,0.856,4.0,-5.063,0.0,0.033,...,0.728,109.242,6DsjIv44nrx6WCvym63Bcv,aaa-a-abbababaabbbaabbbaa-,"[+-, +-, +-, -+--+-, +++--+-, -+-++-, -+-+++-,...",0.0,15.384615,15.384615,0.0,11.538462
4,4,Alianza,Veni a Escuchar Rock And Roll,"Todo ya te pudrió,\nel diario de ayer ya te ca...",0.595,0.873,2.0,-6.106,1.0,0.03,...,0.608,110.976,4sjZfPsz45FgmzyqZJtgAU,aaabbabbaaaa-cccccccddddddddedeeddddddddd-,"[+-+--+-, -+--++--+-, -+----+-, -+--+-+++-, -+...",0.0,0.0,0.0,42.857143,4.761905
