## Imports

In [2]:
%pip install -r requirements.txt
# Load, explore and plot data
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
import re

######## SEED setzen
import random
# Random Seed setzen:
random.seed(49)
np.random.seed(49)
SEED = 49

Note: you may need to restart the kernel to use updated packages.


## Load Data

In [3]:
lyrics_df = pd.read_csv('lyrics-data.csv')
lyrics_df.head(2)

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt


In [4]:
artist_df = pd.read_csv('artists-data.csv')
artist_df.head(2)

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/


## Merge Data

In [5]:
# Zusammenführen der DataFrames anhand der 'ALink' Spalte in lyrics_df und der 'Link' Spalte in artist_df
df = pd.merge(lyrics_df, artist_df, left_on='ALink', right_on='Link')

# Anzeigen der ersten paar Zeilen des resultierenden DataFrames
df.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language,Artist,Genres,Songs,Popularity,Link
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/


In [6]:
df.nunique()

ALink           4161
SName         266677
SLink         378949
Lyric         370290
language          51
Artist          4161
Genres          1747
Songs            406
Popularity       216
Link            4161
dtype: int64

In [7]:
df["Genres"].value_counts()

Genres
Gospel/Religioso                            20095
Sertanejo                                   10847
Rock                                         5870
Romântico                                    5233
Forró                                        5114
                                            ...  
Rap; Hip Hop; Infantil                          1
Progressivo; House; Electronica                 1
Pop/Rock; Classic Rock; Gospel/Religioso        1
Rap; Gospel/Religioso                           1
Funk Carioca; Blues                             1
Name: count, Length: 1747, dtype: int64

In [54]:
df["language"].value_counts()

language
en     191387
pt     156941
es       9916
rw       1679
it       1426
fr       1221
de        843
fi        145
sv        112
ro         97
no         89
is         86
tl         69
pl         47
gl         36
ga         32
tr         32
id         26
cy         23
su         19
af         19
sw         19
ko         17
nl         14
da         13
ca         13
et         12
ms          8
ja          7
st          6
ht          5
ar          4
ru          4
eu          4
gd          4
cs          3
ku          3
ny          3
mg          3
lg          2
jw          2
hu          2
iw          1
sr          1
hmn         1
hr          1
vi          1
fa          1
sq          1
zh          1
sl          1
Name: count, dtype: int64

Wir verwenden nur engosche Lieder:

In [55]:
df = df[df["language"] == "en"]

## Data Preprocessing

In [56]:
df = df.drop(["ALink","SLink", "Songs", "Popularity", "Link", "language"], axis=1)
df.rename(columns={'SName': 'Titel'}, inplace=True)
df.head(5)

Unnamed: 0,Titel,Lyric,Artist,Genres
69,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,Ivete Sangalo,Pop; Axé; Romântico
86,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",Ivete Sangalo,Pop; Axé; Romântico
88,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",Ivete Sangalo,Pop; Axé; Romântico
111,Easy,"Know it sounds funny\nBut, I just can't stand ...",Ivete Sangalo,Pop; Axé; Romântico
140,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,Ivete Sangalo,Pop; Axé; Romântico


In [57]:
len(df)

191387

In [58]:
df.isna().sum()

Titel     2
Lyric     0
Artist    0
Genres    9
dtype: int64

Alle Observation (Zeilen) mit nan Werten rausschmeißen (nur Genre wegen Label, Name ist uns egal, da nurnur die L):

In [59]:
df = df.dropna(subset=['Genres'])

In [60]:
df.isna().sum()

Titel     2
Lyric     0
Artist    0
Genres    0
dtype: int64

Entfernen von \n und \t weil dass durch Scrapen entstanden ist, sehe ich schon von 20 Metern auf die ersten Daten:

In [61]:
df['Lyric'] = df['Lyric'].str.replace('\n', ' ', regex=False)
df['Lyric'] = df['Lyric'].str.replace('\t', ' ', regex=False)
df.head(5)

Unnamed: 0,Titel,Lyric,Artist,Genres
69,Careless Whisper,I feel so unsure As I take your hand and lead ...,Ivete Sangalo,Pop; Axé; Romântico
86,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya Or even try to school,...",Ivete Sangalo,Pop; Axé; Romântico
88,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here Don't be co...",Ivete Sangalo,Pop; Axé; Romântico
111,Easy,"Know it sounds funny But, I just can't stand t...",Ivete Sangalo,Pop; Axé; Romântico
140,For Your Babies (The Voice cover),You've got that look again The one I hoped I h...,Ivete Sangalo,Pop; Axé; Romântico


Preprocessing der Daten (lyrics):
- Lowercasing
- Entfernung von Zahlen oder Umwandlung in Text
- Entfernung von Satzzeichen
- Entfernung von Stoppwörtern


In [62]:
# Sicherstellen, dass NLTK-Ressourcen heruntergeladen sind
nltk.download('stopwords')


# Funktion zur Bereinigung des Lyrics-Textes
def clean_lyrics(text):
    # Lowercasing
    text = text.lower()
    # Entfernung von Zahlen
    text = re.sub(r'\d+', '', text)
    # Entfernung von Satzzeichen
    text = re.sub(r'[^\w\s]', '', text)
    # Entfernung von mehrfachen Leerzeichen
    text = re.sub(r'\s+', ' ', text)
    # Entfernung von Stoppwörtern
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Entfernung von Wiederholungen wie "yeah yeah yeah"
    return text

# Anwendung der Bereinigungsfunktion auf die Spalte 'lyrics'
df['Lyric'] = df['Lyric'].apply(clean_lyrics)
df.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ivan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Titel,Lyric,Artist,Genres
69,Careless Whisper,feel unsure take hand lead dance floor music d...,Ivete Sangalo,Pop; Axé; Romântico
86,Could You Be Loved / Citação Musical do Rap: S...,dont let fool ya even try school ya oh weve go...,Ivete Sangalo,Pop; Axé; Romântico
88,Cruisin' (Part. Saulo),baby lets cruise away dont confused way clear ...,Ivete Sangalo,Pop; Axé; Romântico
111,Easy,know sounds funny cant stand pain girl im leav...,Ivete Sangalo,Pop; Axé; Romântico
140,For Your Babies (The Voice cover),youve got look one hoped lad face beaming smil...,Ivete Sangalo,Pop; Axé; Romântico


## Prepairing Datasets

### Datensatz 1:

Alle Songs die mehr als ein Genre zugeordnet sind werden entfernt (Datensatz 1)

In [63]:
single_genre_df = df[df['Genres'].str.count(';') < 1]
single_genre_df.head()

Unnamed: 0,Titel,Lyric,Artist,Genres
1719,I Miss Her,oh lord id like know thinks oh oh lord wanna g...,Olodum,Axé
1766,They Don't Care About Us,skin head dead head everybody gone bad situati...,Olodum,Axé
2586,Loved You Right Away,moment saw minute met wonderful feeling never ...,Carlinhos Brown,Axé
2627,Goodbye Hello,alone time feel blues way inventions invention...,Carlinhos Brown,Axé
2640,Mess In The Freeway,wanna tell wonderful things wanna tell interes...,Carlinhos Brown,Axé


In [64]:
len(single_genre_df)

42760

Plot der genres:

In [65]:
plot_df= single_genre_df.groupby("Genres", axis=0).count().sort_values(by="Lyric",ascending=False)
plot_df.head()

  plot_df= single_genre_df.groupby("Genres", axis=0).count().sort_values(by="Lyric",ascending=False)


Unnamed: 0_level_0,Titel,Lyric,Artist
Genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Country,4774,4774,4774
Rock,4672,4672,4672
Heavy Metal,4393,4394,4394
Indie,4288,4289,4289
Pop,2544,2544,2544


Sortieren nach den mit den meisten werten:

In [66]:

# # Balkendiagramm erstellen
# plt.figure(figsize=(10, 8))
# plt.bar(df['Genres'], df['Lyric'], color='blue')
# plt.xlabel('Genres')
# plt.ylabel('Anzahl der Lyrics')
# plt.title('Anzahl der Lyrics pro Genre')
# plt.xticks(rotation=45)
# plt.tight_layout()  # sorgt dafür, dass die Labels nicht abgeschnitten werden

# # Diagramm anzeigen
# plt.show()

In [67]:
single_genre_df.to_csv("preprocessed_single_label.csv")

### Datensatz 2:

Alle Songs die mehr als ein Genre zugeordnet sind, werden geteilt. Es wird pro Genre eine Copy angelegt (Datensatz 2)

In [68]:
split_genre_df = df.copy()
split_genre_df['Genres'] = split_genre_df['Genres'].str.split(';')
split_genre_df = split_genre_df.explode('Genres')
split_genre_df['Genres'] = split_genre_df['Genres'].str.split('/')
split_genre_df = split_genre_df.explode('Genres')
split_genre_df['Genres'] = split_genre_df['Genres'].str.replace(' ', '', regex=False)

split_genre_df

Unnamed: 0,Titel,Lyric,Artist,Genres
69,Careless Whisper,feel unsure take hand lead dance floor music d...,Ivete Sangalo,Pop
69,Careless Whisper,feel unsure take hand lead dance floor music d...,Ivete Sangalo,Axé
69,Careless Whisper,feel unsure take hand lead dance floor music d...,Ivete Sangalo,Romântico
86,Could You Be Loved / Citação Musical do Rap: S...,dont let fool ya even try school ya oh weve go...,Ivete Sangalo,Pop
86,Could You Be Loved / Citação Musical do Rap: S...,dont let fool ya even try school ya oh weve go...,Ivete Sangalo,Axé
...,...,...,...,...
378986,When The System Has Fallen,sweat heat days end waiting come hear words sp...,Johnny Clegg,Gospel
378986,When The System Has Fallen,sweat heat days end waiting come hear words sp...,Johnny Clegg,Religioso
378987,Woman Be My Country,stand edge day faces melting african rain many...,Johnny Clegg,WorldMusic
378987,Woman Be My Country,stand edge day faces melting african rain many...,Johnny Clegg,Gospel


In [69]:
split_genre_df["Genres"].value_counts()

Genres
Rock            90077
Pop             62970
HeavyMetal      21759
HipHop          20902
Indie           20134
                ...  
Pagode             29
Forró              18
Kizomba             7
ElectroSwing        6
Regional            2
Name: count, Length: 77, dtype: int64

In [70]:
plot_df2= split_genre_df.groupby("Genres", axis=0).count().sort_values(by="Lyric",ascending=False)
plot_df2.head()

  plot_df2= split_genre_df.groupby("Genres", axis=0).count().sort_values(by="Lyric",ascending=False)


Unnamed: 0_level_0,Titel,Lyric,Artist
Genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rock,90077,90077,90077
Pop,62970,62970,62970
HeavyMetal,21758,21759,21759
HipHop,20902,20902,20902
Indie,20133,20134,20134


In [71]:
split_genre_df.to_csv("preprocessed_split_label.csv")