Se desea obtener un modelo de ML para clasificar canciones entre trova y reguetón.

In [15]:
letra_hacha='''
Ropa de marca, una cadena
La cartera llena y salió pa' la pista
Lleva la puteria en vena
Llega y exagera y anda fuera de vista
Los que hablen de ella le resbala
Esa jeva es bala de la que detona
Ella cierra con su mirada
Carita de malvada y boquita enfermona
Y cuando se emborracha, hacha
Que rico lo pacha, hacha
Esta pa' darle brocha, ocha
Está pa' darle hacha, hacha
Y cuando se emborracha
Que rico lo pacha
Esta pa' darle brocha
Está pa' darle hacha
Ella está pa' darle brocha
Pintarla yo como un pintor
Y despues darle leña como un leñador
Dandola, viendola y to' el mundo mirandola
Y unos dicen a moverse que llegó el avión
'''

In [2]:
# Count how often each character occurs in the lyrics
from unidecode import unidecode

# Fix: the lyrics are stored in `letra_hacha`; `lyrics_hacha` is never defined
# in this notebook, so the cell failed with NameError on a fresh kernel.
tally = {}
for ch in unidecode(letra_hacha).lower():
    tally[ch] = tally.get(ch, 0) + 1

# Parallel lists in first-seen order, as consumed by the following cells
letter = list(tally)
cnt = list(tally.values())

In [3]:
# Pair every count with its character as (count, character) tuples
l_cnt = list(zip(cnt, letter))
l_cnt[:4]

[(22, '\n'), (33, 'r'), (35, 'o'), (14, 'p')]

In [4]:
# Most frequent first; tuples compare by count, then by character
l_cnt = sorted(l_cnt, reverse=True)
l_cnt[:5]

[(100, ' '), (98, 'a'), (60, 'e'), (41, 'l'), (35, 'o')]

In [5]:
# Lets create a function for that
def count_chars(lines):
    """Count character occurrences in ``lines``, most frequent first.

    Spaces and newlines are removed first; the remaining text is passed
    through ``unidecode`` and lower-cased before counting.

    Returns:
        A list of ``(count, character)`` tuples sorted in descending order
        (by count; ties are ordered by character, also descending).
    """
    stripped = lines.replace(" ", "").replace("\n", "")
    tally = {}
    for ch in unidecode(stripped).lower():
        tally[ch] = tally.get(ch, 0) + 1
    return sorted(((n, ch) for ch, n in tally.items()), reverse=True)

In [7]:
# Fix: pass `letra_hacha`; `lyrics_hacha` is not defined anywhere in the notebook
cnt_hacha = count_chars(letra_hacha)
cnt_hacha[:10]

[(98, 'a'),
 (60, 'e'),
 (41, 'l'),
 (35, 'o'),
 (33, 'r'),
 (26, 'n'),
 (26, 'd'),
 (26, 'c'),
 (19, 's'),
 (19, 'h')]

In [8]:
letra_angel = '''
Cuentan que cuando un silencio
Aparecía entre dos
Era que pasaba un ángel
Que les robaba la voz
Y hubo tal silencio el día
Que nos tocaba olvidar
Que de tal suerte yo todavía
No terminé de callar
Todo empezó en la sorpresa
En un encuentro casual
Pero la noche es traviesa
Cuando se teje el azar
Sin querer se hace una ofrenda
Que pacta con el dolor
O pasa un ángel
Se hace leyenda
Y se convierte en amor
Ahora comprendo
Cual era el ángel
Que entre nosotros pasó
Era el más terrible, el implacable
El más feroz
Ahora comprendo en total
Este silencio mortal
Ángel que pasa
Besa y te abraza
Ángel para un
Final'''

In [9]:
# Character ranking for letra_angel; show the ten most frequent entries
cnt_angel = count_chars(letra_angel)
cnt_angel[:10]

[(77, 'e'),
 (70, 'a'),
 (42, 'o'),
 (39, 'n'),
 (34, 'r'),
 (34, 'l'),
 (27, 's'),
 (23, 'u'),
 (22, 't'),
 (21, 'c')]

¿Cómo podríamos clasificar automáticamente los géneros usando ML?

Usemos ahora otra representación: la letra de cada canción se representará como un vector con las frecuencias de cada carácter.

In [11]:
caracts = list('abcdefghijklmnopqrstuvwxyz')

# One counter per letter in `caracts`, all starting at zero
vect_hacha = [0] * len(caracts)

# Fix: iterate over `letra_hacha`; `lyrics_hacha` is never defined in this
# notebook, so the cell failed with NameError on a fresh kernel.
for c in unidecode(letra_hacha).lower():
    if c in caracts:
        vect_hacha[caracts.index(c)] += 1

In [12]:
# Show the per-letter counts collected in vect_hacha
print(vect_hacha)

[98, 9, 26, 26, 60, 2, 3, 19, 16, 1, 0, 41, 11, 26, 35, 14, 6, 33, 19, 15, 17, 8, 0, 1, 10, 0]


In [13]:
# Wrap the letter counting in a function
caracts = list('abcdefghijklmnopqrstuvwxyz')

def get_lyrics_vector(text):
    """Return how many times each letter of ``caracts`` occurs in ``text``.

    The text is passed through ``unidecode`` and lower-cased first; characters
    not listed in ``caracts`` are ignored. The result holds one integer per
    entry of ``caracts``, in the same order.
    """
    folded = unidecode(text).lower()
    return [folded.count(ch) for ch in caracts]

In [16]:
# Print the alphabet, then the raw count vector of each song, so that
# positions in the vectors can be matched to letters
print(caracts)
print(get_lyrics_vector(letra_hacha))
print(get_lyrics_vector(letra_angel))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[98, 9, 26, 26, 60, 2, 3, 19, 16, 1, 0, 41, 11, 26, 35, 14, 6, 33, 19, 15, 17, 8, 0, 1, 10, 0]
[70, 9, 21, 14, 77, 3, 5, 6, 17, 1, 0, 34, 9, 39, 42, 13, 9, 34, 27, 22, 23, 5, 0, 0, 5, 5]


In [17]:
# Normalize the vectors
def normalize(vect):
    """Scale ``vect`` so that its entries sum to 1.

    Args:
        vect: sequence of numbers (e.g. per-letter counts).

    Returns:
        A new list with every entry divided by the sum of ``vect``. When the
        entries sum to zero (empty vector, or text without any counted
        letter) a list of ``0.0`` of the same length is returned instead of
        raising ``ZeroDivisionError``.
    """
    total = sum(vect)
    if total == 0:
        # Nothing to scale by; keep the shape and report zero frequencies
        return [0.0 for _ in vect]
    return [v / total for v in vect]

In [18]:
# Same comparison, now with the counts scaled to relative frequencies
print(caracts)
print(normalize(get_lyrics_vector(letra_hacha)))
print(normalize(get_lyrics_vector(letra_angel)))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[0.1975806451612903, 0.018145161290322582, 0.05241935483870968, 0.05241935483870968, 0.12096774193548387, 0.004032258064516129, 0.006048387096774193, 0.038306451612903226, 0.03225806451612903, 0.0020161290322580645, 0.0, 0.08266129032258064, 0.02217741935483871, 0.05241935483870968, 0.07056451612903226, 0.028225806451612902, 0.012096774193548387, 0.06653225806451613, 0.038306451612903226, 0.03024193548387097, 0.034274193548387094, 0.016129032258064516, 0.0, 0.0020161290322580645, 0.020161290322580645, 0.0]
[0.14285714285714285, 0.018367346938775512, 0.04285714285714286, 0.02857142857142857, 0.15714285714285714, 0.006122448979591836, 0.01020408163265306, 0.012244897959183673, 0.03469387755102041, 0.0020408163265306124, 0.0, 0.06938775510204082, 0.018367346938775512, 0.07959183673469387, 0.08571428571428572, 0.026530612244897958, 0.018367346938775512, 0.06938

In [19]:
# Oops ... let's format the output
def print_formatted(values):
    """Print ``values`` on one line, each with two decimals and followed by a comma."""
    print("".join(f"{v:.2f}," for v in values))

In [20]:
# Keep the normalized vectors for later cells and print them with two decimals.
# Note: `vect_hacha` is rebound here; earlier in the notebook it held raw counts.
print(caracts)
vect_hacha = normalize(get_lyrics_vector(letra_hacha))
vect_angel = normalize(get_lyrics_vector(letra_angel))
print_formatted(vect_hacha)
print_formatted(vect_angel)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
0.20,0.02,0.05,0.05,0.12,0.00,0.01,0.04,0.03,0.00,0.00,0.08,0.02,0.05,0.07,0.03,0.01,0.07,0.04,0.03,0.03,0.02,0.00,0.00,0.02,0.00,
0.14,0.02,0.04,0.03,0.16,0.01,0.01,0.01,0.03,0.00,0.00,0.07,0.02,0.08,0.09,0.03,0.02,0.07,0.06,0.04,0.05,0.01,0.00,0.00,0.01,0.01,


In [21]:
# Implementemos la distancia euclideana entre dos vectores, para comparar entre canciones ...
import math 

def dist_euclid(v1, v2):
    """Return the Euclidean distance between two numeric vectors.

    Args:
        v1, v2: sequences of numbers with the same length.

    Returns:
        The square root of the sum of squared component differences
        (``0.0`` for two empty vectors).

    Raises:
        ValueError: if the vectors differ in length. Previously a longer
            ``v2`` was silently truncated and a shorter one raised a bare
            ``IndexError``.
    """
    if len(v1) != len(v2):
        raise ValueError(
            f"vectors must have the same length, got {len(v1)} and {len(v2)}"
        )
    total = 0
    for a, b in zip(v1, v2):
        total += (a - b) ** 2
    return math.sqrt(total)

# Distance between the two normalized song vectors
dist_euclid(vect_hacha, vect_angel)

0.08819817731051234

In [22]:
letra_bajanda = '''
Cuando el gato no está en casa
Empiezan los carnavales y las comparsas
Los ratones guarachean
Y se botan pa' la calle pa' que to' el mundo los vea
Una pila 'e ratas flacas de cloaca
Arrollando por to' el muro 'e la Malenca'
Los ratones calentando La Piragüa
Y las ratas La Tribuna 'siéndose los repas
Gatos dicen miau
Miau, miau, miau
Eso es el gato y él es súper asfixia'o
Anda con un tigre, y con dos leones
Barriendo a las ratas
Y a to' los ratones
Gatos dicen miau
Miau, miau, miau
Eso es el gato y él es súper asfixiao'
Anda con un tigre, y con dos leones
Barriendo a las ratas
Y a to' los ratones
'''

In [23]:
# Vectorize the third song and measure its distance to the two earlier ones
vect_bajanda = normalize(get_lyrics_vector(letra_bajanda))
print(dist_euclid(vect_bajanda, vect_hacha))
print(dist_euclid(vect_bajanda, vect_angel))

0.09556458794684385
0.08655737488376432


In [25]:
letra_el_sol = '''
Al tibio amparo de la 214
Se desnudaba mi canción de amor
Llegaba el día, indiscreto y torpe
Y la belleza nos hacía más pobres
Más esclavos de la ronda del reloj
Así pasaron los momentos pocos
Así pasaba la felicidad
Huyendo siempre de mirada de otros
Entretejiendo un universo loco
De caricias, dudas y complicidad
Toma de mí, todo
Bébetelo bien
Hay que ayunar al filo
Del amanecer
Toma de mí, todo
Y todavía más
Hay que esperar un largo
No de claridad
Toma de mí, todo
Cuanto pueda ser
El sol no da de beber'''

In [26]:
vect_el_sol = normalize(get_lyrics_vector(letra_el_sol))

# All pairwise distances between the four songs, printed one per line
song_pairs = [
    (vect_bajanda, vect_hacha),
    (vect_bajanda, vect_angel),
    (vect_hacha, vect_angel),
    (vect_el_sol, vect_angel),
    (vect_el_sol, vect_bajanda),
    (vect_el_sol, vect_hacha),
]
for first, second in song_pairs:
    print(dist_euclid(first, second))


0.09556458794684385
0.08655737488376432
0.08819817731051234
0.09427936900244724
0.0923574646683549
0.10158230278717023


Otra representación ...

In [27]:
def count_bigrams(text):
    """Count two-character sequences (bigrams) in ``text``, most frequent first.

    The text is passed through ``unidecode`` and lower-cased, then spaces and
    newlines are removed, so bigrams may span what used to be a word or line
    boundary.

    Args:
        text: the lyrics as a single string.

    Returns:
        A list of ``(count, bigram)`` tuples sorted in descending order
        (by count; ties are ordered by bigram, also descending). Texts with
        fewer than two remaining characters yield an empty list.
    """
    text = unidecode(text).lower()
    text = text.replace(" ", "").replace("\n", "")
    tally = {}
    # Fix: windows start at index 0 and stop one short of the end. The former
    # range(1, len(text)) skipped the first bigram and counted the trailing
    # single character as if it were a bigram.
    for i in range(len(text) - 1):
        bi_gram = text[i:i + 2]
        tally[bi_gram] = tally.get(bi_gram, 0) + 1
    return sorted(((n, bg) for bg, n in tally.items()), reverse=True)

In [28]:
# Ten most frequent bigrams in letra_hacha
bigrams_hacha = count_bigrams(letra_hacha)
bigrams_hacha[:10]

[(19, 'ha'),
 (15, 'ac'),
 (14, 'le'),
 (14, 'la'),
 (13, 'ch'),
 (11, 'al'),
 (11, 'ae'),
 (10, 'ta'),
 (10, 'es'),
 (10, 'er')]

In [29]:
# Ten most frequent bigrams in letra_angel
bigrams_angel = count_bigrams(letra_angel)
bigrams_angel[:10]

[(18, 'en'),
 (14, 'el'),
 (12, 'ue'),
 (11, 'er'),
 (10, 'an'),
 (9, 'ra'),
 (9, 'qu'),
 (9, 'le'),
 (9, 'al'),
 (8, 're')]

¿Tarea de ML?

## Para ser real, buscar bases de datos en internet

In [30]:
# Location of the lyrics CSV.
# NOTE(review): absolute local path; adjust it to your environment before running.
file_name = "/mnt/hdd/__Docencia/DataAnalysisWithPython/!!2023SepUH/lectures/large_datasets/lyrics/lyrics-data.csv"
# IPython shell escape: list the file with its size reported in megabytes
!ls -l --block-size=M {file_name}

-rw-rw-r-- 1 milton milton 415M mar 17  2022 '/mnt/hdd/__Docencia/DataAnalysisWithPython/!!2023SepUH/lectures/large_datasets/lyrics/lyrics-data.csv'


In [31]:
import pandas as pd
# Load the whole lyrics CSV into a DataFrame
data = pd.read_csv(file_name)

In [32]:
# (rows, columns) of the loaded DataFrame
data.shape

(379931, 5)

In [33]:
# Peek at the first five rows
data.head(5)

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


In [34]:
# Number of non-null lyrics per language code
data.groupby('language')['Lyric'].count()

language
af         19
ar          4
ca         13
cs          3
cy         23
da         13
de        844
en     191814
es       9917
et         13
eu          4
fa          1
fi        145
fr       1225
ga         32
gd          4
gl         36
hmn         1
hr          1
ht          5
hu          2
id         26
is         86
it       1432
iw          1
ja          7
jw          2
ko         17
ku          3
lg          2
lv          1
mg          3
ms          8
nl         14
no         89
ny          3
pl         47
pt     157393
ro         97
ru          4
rw       1679
sl          1
sq          1
sr          1
st          7
su         19
sv        112
sw         19
tl         69
tr         32
vi          1
zh          1
Name: Lyric, dtype: int64

Ideas:
- Clasificar automáticamente el idioma

Para identificar el género de una canción, utilizaremos otro fichero de la misma base de datos

In [35]:
import csv
# newline="" lets the csv module handle line endings itself (as its docs
# require); the explicit UTF-8 encoding keeps accented artist/genre names
# readable regardless of the platform's default encoding.
with open(
    "/mnt/hdd/__Docencia/DataAnalysisWithPython/!!2023SepUH/lectures/large_datasets/lyrics/artists-data.csv",
    newline="",
    encoding="utf-8",
) as f:
    reader = csv.reader(f)
    # Materialize every row (header included) while the file is still open
    raw_data = list(reader)
raw_data[:5]

[['Artist', 'Genres', 'Songs', 'Popularity', 'Link'],
 ['Ivete Sangalo', 'Pop; Axé; Romântico', '313', '4.4', '/ivete-sangalo/'],
 ['Chiclete com Banana', 'Axé', '268', '3.8', '/chiclete-com-banana/'],
 ['Banda Eva', 'Axé; Romântico; Reggae', '215', '2.3', '/banda-eva/'],
 ['É O Tchan', 'Axé', '129', '1.6', '/e-o-tchan/']]

Noten que el género no está en la canción, sino en el autor, y que cada autor puede tener varios géneros. ¿Cómo procesamos esto entonces?

In [37]:
"aaaa, bbb, ccc".split(",")

['aaaa', ' bbb', ' ccc']

In [38]:
# Turn each CSV row into (popularity, artist, [genres]).
# Note: this rebinds `data`, which earlier in the notebook held the lyrics DataFrame.
data = []
for row in raw_data[1:]:  # skip the header row
    artist, genres_field, _, popularity, _ = row
    # "NA" marks a missing popularity; treat it as zero
    score = 0.0 if popularity == "NA" else float(popularity)
    # Genres come as one ';'-separated string; trim and lower-case each one
    genres = [g.strip().lower() for g in genres_field.split(";")]
    data.append((score, artist, genres))
data[:5]

[(4.4, 'Ivete Sangalo', ['pop', 'axé', 'romântico']),
 (3.8, 'Chiclete com Banana', ['axé']),
 (2.3, 'Banda Eva', ['axé', 'romântico', 'reggae']),
 (1.6, 'É O Tchan', ['axé']),
 (1.5, 'Claudia Leitte', ['pop', 'axé', 'romântico'])]

¿Cuál es el género más común?

In [39]:
# Tally how many artists list each genre
tally = {}
for _, _, genres in data:
    for g in genres:
        tally[g] = tally.get(g, 0) + 1

# Parallel lists in first-seen order, as consumed by the next cell
generos = list(tally)
count = list(tally.values())

In [40]:
# (count, genre) pairs, most common genre first
together = sorted(zip(count, generos), reverse=True)

In [41]:
# Ten most common genres as (count, genre) pairs
together[:10]

[(726, 'rock'),
 (590, 'pop'),
 (562, 'romântico'),
 (557, 'gospel/religioso'),
 (409, 'pop/rock'),
 (325, 'hip hop'),
 (306, 'rap'),
 (297, 'sertanejo'),
 (291, 'indie'),
 (227, 'mpb')]

¿Cuáles géneros son más populares?

In [42]:
def count_genres(data):
    """Count how many records list each genre, most common first.

    Args:
        data: iterable of 3-item records whose last item is a list of genre
            names; the first two items are ignored.

    Returns:
        A list of ``(count, genre)`` tuples sorted in descending order
        (by count; ties are ordered by genre name, also descending).
    """
    tally = {}
    for _, _, genres in data:
        for genre in genres:
            tally[genre] = tally.get(genre, 0) + 1
    return sorted(((n, genre) for genre, n in tally.items()), reverse=True)

In [43]:
# Range of the popularity values: largest one found (never below 0)
max_popularity = max([0] + [float(pop) for pop, *_ in data])
max_popularity

205.5

In [45]:
# Pick a popularity threshold and count the artists strictly above it
threshold = 40
count = sum(1 for pop, *_ in data if pop > threshold)
count, len(data)

(37, 4168)

In [46]:
# Five most common genres over all artists
count_genres(data)[:5]

[(726, 'rock'),
 (590, 'pop'),
 (562, 'romântico'),
 (557, 'gospel/religioso'),
 (409, 'pop/rock')]

In [47]:
# Keep only artists whose popularity exceeds 20, then rank their genres
populares = [
    (pop, artist, genres)
    for pop, artist, genres in data
    if pop > 20
]
count_genres(populares)[:5]

[(24, 'pop'), (23, 'rock'), (21, 'pop/rock'), (19, 'romântico'), (10, 'dance')]

¿Otras ideas para hacer machine learning con estos datos?