<a href="https://colab.research.google.com/github/kevgam/CAS_IE_Information_Retrieval/blob/main/IR_Test_NEU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Vorbereitung des Datensatzes in Spark

Unser ursprünglicher Spotify-Datensatz, der über [Kaggle](https://www.kaggle.com/discussions/accomplishments/522912) verfügbar ist, umfasste fast eine Million Datensätze. Aufgrund der Grösse des Datensatzes hatten wir bei der Verarbeitung Performanceprobleme. Daher haben wir den Datensatz zunächst in der Spark-Umgebung der ZHAW vorverarbeitet.


Nach dem Standardlogin (inkl. sc.stop() am Schluss) gemäss Anleitung der ZHAW haben wir folgende Schritte ausgeführt:

### Installation und Test der Umgebung

```python
# Installation des notwendigen Pakets
sparky.installpackage('langdetect')

# Alternativ mit pip
pip install langdetect

# Test der RDD-Funktionalität
import os
liste = range(16)
rdd = sc.parallelize(liste)
print(rdd.collect())
print(rdd.glom().collect())

# Überprüfen, ob alle Worker die notwendige Software installiert haben
if len(list(filter(lambda x: x == [], rdd.glom().collect()))):
    raise SystemExit("Nicht gut - einige Worker bleiben ohne Softwareinstallation.")

# Testfunktion für Abhängigkeiten
def testdep(ignore_arg):
    ip = "160.85.252.66"  # Beispiel-IP
    try:
        import lxml
    except:
        return f"lxml FAILED! @ {ip}"
    else:
        return f"lxml worked @ {ip}"

# Installation von Abhängigkeiten
import subprocess
def installdeps(ignore_arg):
    p = subprocess.run("pip install lxml", shell=True, stdout=subprocess.PIPE)
    return p.stdout.decode()

# Ausführen der Installation und Tests
rdd.map(installdeps).collect()
rdd.map(testdep).collect()


### Laden und Verarbeiten der Daten

```python
# Das File wurde vorgängig in unseren Ordner auf dem Server kopiert
filepath = 'songs_with_attributes_and_lyrics.csv'

# Laden der CSV-Datei
import pandas as pd
dfs = pd.read_csv(filepath)

# Spark DataFrame laden
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
dfs = spark.read.csv(filepath, header=True, inferSchema=True)


### Sprache erkennen und Fortschritt protokollieren

```python
from langdetect import detect

# Funktion zur Erkennung der Sprache mit Fortschrittsanzeige
def detect_language_with_progress(partition):
    total_rows = 0
    for row in partition:
        try:
            lang = detect(row['lyrics'])
            yield (row['lyrics'], lang)  # Rückgabe: Originaltext und erkannte Sprache
        except Exception:
            yield (row['lyrics'], 'unknown')
        total_rows += 1
        if total_rows % 1000 == 0:  # Fortschritt alle 1000 Zeilen anzeigen
            print(f"Processed {total_rows} rows in this partition")

# RDD-Transformationen anwenden
rdd = dfs.rdd.mapPartitions(detect_language_with_progress)

# Zurück in ein DataFrame umwandeln
schema = StringType()
result = rdd.toDF(["lyrics", "lyrics_language"])

# Fortschritt anzeigen
result.show()


### Ergebnisse speichern

```python
# Als Excel-Datei speichern
output_path = './processed_songs_with_lyrics.xlsx'
dfs.to_excel(output_path, index=False)
print(f"DataFrame saved to: {output_path}")

# Als CSV-Datei speichern
csv_output_path = './processed_songs_with_lyrics.csv'
dfs.to_csv(csv_output_path, index=False)
print(f"DataFrame saved as CSV file to: {csv_output_path}")


### Filterung und Speicherung der englischen Texte

```python
# Zeilen filtern, in denen die Sprache Englisch ist
dfs_en = dfs[dfs['lyrics_language'] == 'en']

# Gefilterte Daten als Excel- und CSV-Datei speichern
dfs_en.to_excel('./processed_songs_filtered_lyrics_en.xlsx', index=False)
dfs_en.to_csv('./processed_songs_filtered_lyrics_en.csv', index=False)

print("Filtered DataFrame saved as 'filtered_lyrics_en.xlsx' and 'filtered_lyrics_en.csv'")


In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Herausfiltern der Datensätze ohne Albumnamen

```python
# Installation pandas und openpyxl
!pip install pandas openpyxl

import pandas as pd

# Laden der CSV-Datei
file_path = "/content/drive/MyDrive/ie_information_retrieval_dataset/processed_songs_filtered_lyrics_en.csv"
df = pd.read_csv(file_path)

# Filtern aller Zeilen, bei denen 'album_name' nicht NaN ist
df_filtered = df.dropna(subset=['album_name'])

# Speichern der Datei als CSV
output_file_path = '/content/drive/MyDrive/ie_information_retrieval_dataset/processed_songs_filtered_lyrics_with_album_name.csv'
df_filtered.to_csv(output_file_path, index=False)


In [6]:
# Installation der benötigten Bibliotheken
!pip install pandas openpyxl

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [5]:


# Lade die Excel-Datei
file_path = "/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_filtered_lyrics_en.xlsx"
df = pd.read_excel(file_path)

# Entferne die Spalten 'id' und 'album_name'
df = df.drop(columns=['id', 'album_name', 'lyrics_language'], errors='ignore')

# Entferne alle Datensätze, bei denen 'duration_ms' kleiner als 240000 (4 Minuten) oder größer als 300000 (5 Minuten) ist
df = df[(df['duration_ms'] >= 240000) & (df['duration_ms'] <= 300000)]

# Entferne Sonderzeichen in den Spalten 'name' und 'artists'
# Sonderzeichen inkl. [] werden entfernt
df['name'] = df['name'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df['artists'] = df['artists'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))

# Filtere alle Zeilen, bei denen 'name' NaN ist
df = df.dropna(subset=['name'])

# Filtere alle Zeilen, bei denen 'artists' NaN ist
df = df.dropna(subset=['artists'])

# Entferne Duplikate basierend auf der Kombination aus 'name' und 'artists'
df = df.drop_duplicates(subset=['name', 'artists'])

# Sicherstellen, dass die Spalte 'lyrics' nur String-Werte enthält
df = df[df['lyrics'].apply(lambda x: isinstance(x, str))]

# Entferne Zeilenumbrüche innerhalb der 'lyrics' Spalte
df['lyrics'] = df['lyrics'].apply(lambda x: str(x).replace('\n', ' ').replace('\r', ' ') if isinstance(x, str) else x)

# Setze die 'lyrics'-Spalte in Anführungszeichen
df['lyrics'] = df['lyrics'].apply(lambda x: f'"{x}"' if isinstance(x, str) else x)

# Entferne Duplikate basierend auf der Kombination aus 'duration_ms' und 'lyrics'
df = df.drop_duplicates(subset=['duration_ms', 'lyrics'])

# Speichere die bereinigten Daten als CSV-Datei
output_csv_path = '/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_filtered_lyrics_bereinigt.csv'
df.to_csv(output_csv_path, index=False)

print(f"Bereinigte CSV-Datei wurde gespeichert unter: {output_csv_path}")


[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

In [None]:
# NLTK-Resourcen herunterladen
nltk.download('punkt')
nltk.download('stopwords')

# DataFrame laden
df = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_filtered_lyrics_bereinigt.csv')

# Sicherstellen, dass die Spalte 'lyrics' keine fehlenden Werte enthält und Strings sind
df['lyrics'] = df['lyrics'].fillna('').astype(str)

# Funktion zur Textbereinigung
def preprocess_text(text):
    if not isinstance(text, str):  # Sicherstellen, dass die Eingabe ein String ist
        text = str(text)
    tokens = word_tokenize(text.lower())  # Tokenisierung & Kleinschreibung
    tokens = [word for word in tokens if word.isalpha()]  # Sonderzeichen/Zahlen entfernen
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Stopwörter entfernen
    return tokens  # Rückgabe als Liste von Tokens

# Bereinigte Tokens in neuer Spalte speichern
df['clean_lyrics'] = df['lyrics'].apply(preprocess_text)

# Ergebnisse speichern
df.to_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_without_stopwords.csv', index=False)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# DataFrame laden
df = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_without_stopwords.csv')

# Entferne Duplikate in 'clean_lyrics'
df = df.drop_duplicates(['clean_lyrics'])

# Filtere alle Zeilen, bei denen 'clean_lyrics' NaN ist
df = df.dropna(subset=['clean_lyrics'])

# Ergebnisse speichern
df.to_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_without_duplicates.csv', index=False)




In [8]:
display(df)

Unnamed: 0,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,lyrics,clean_lyrics
0,B Movie Box Car Blues Live,The Blues Brothers Joe Gastwirt,0.606,0.819,9,-8.281,1,0.0696,0.564000,0.004040,0.9590,0.5040,93.307,244400,"""Caught a ride into South Dakota With two gir...","['caught', 'ride', 'south', 'dakota', 'two', '..."
1,BabyRock Rock,Clorofila,0.692,0.900,8,-7.059,1,0.0279,0.014300,0.432000,0.1090,0.9510,126.038,280107,"""I can make a choice and never really have a ...","['make', 'choice', 'never', 'really', 'doubt',..."
2,Boom,MY FIRST STORY,0.340,0.978,A,-3.785,Major,0.2090,0.000055,0.000000,0.1990,0.1920,199.918,256000,"""Instrumental All in this whole world now has...","['instrumental', 'whole', 'world', 'already', ..."
3,Dearly Departed Live from Spotify Sxsw 2014 f...,Shakey Graves,0.561,0.491,7.0,-8.812,1.0,0.1210,0.293000,0.000001,0.6850,0.5780,81.138,252798,"""ooo0o0oo0oo0o... ooo0o0oo0oo0o... ooo0o0oo0...","['well', 'know', 'house', 'haunted', 'yeah', '..."
4,E Train,Jonny Lang,0.685,0.771,9,-7.671,1,0.0296,0.000522,0.002370,0.1750,0.7010,112.807,251000,"""Take the ""E"" train babe take the easy train ...","['take', 'e', 'train', 'babe', 'take', 'easy',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142258,caress Me Sweet,Joy Wellboy,0.536,0.613,5,-10.581,0,0.0304,0.086800,0.808000,0.0880,0.7760,159.001,273895,"""Squeeze Caress me sweet and slam the door I...","['squeeze', 'caress', 'sweet', 'slam', 'door',..."
142259,my Heart Ran Away,Joy Wellboy,0.706,0.332,5,-11.015,1,0.0289,0.366000,0.163000,0.0902,0.4250,143.998,271578,"""Looking for it over land and sea My heart ra...","['looking', 'land', 'sea', 'heart', 'ran', 'aw..."
142260,what Baby,Joy Wellboy,0.715,0.544,4,-9.431,0,0.0296,0.028800,0.016300,0.1260,0.3860,122.000,258090,"""When days are short and nights are long I fe...","['days', 'short', 'nights', 'long', 'feel', 'n..."
142261,,Strangeways,0.411,0.854,6.0,-5.075,0.0,0.0371,0.052900,0.000095,0.3020,0.5280,91.092,286280,"""You know it's wrong, you know it's right No ...","['know', 'wrong', 'know', 'right', 'sight', 's..."


In [11]:
# Klassifikation nach Emotionen

# Das NRC Emotion Lexicon ist ein von der National Research Council Canada (NRC) entwickeltes Lexikon,
# das englische Wörter und ihre Assoziationen mit acht grundlegenden Emotionen
# (Wut, Angst, Erwartung, Vertrauen, Überraschung, Traurigkeit, Freude und Ekel)
# sowie zwei Sentiments (negativ und positiv) enthält

# https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm


from collections import Counter

# Lade die Eingabedatei
data = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_without_duplicates.csv')

# Lade das NRC Emotion Lexikon
nrc_lexicon = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', sep='\t', header=None, names=['word', 'emotion', 'value'])  # Enthält Wörter und zugehörige Emotionen
# Nur relevante Einträge
nrc_lexicon = nrc_lexicon[nrc_lexicon['value'] == 1]

# Zuordnung der Emotionen zu den Kategorien
emotion_categories = {
    'anger': ['anger'],
    'fear': ['fear'],
    'anticipation': ['anticipation'],
    'trust': ['trust'],
    'surprise': ['surprise'],
    'sadness': ['sadness'],
    'joy': ['joy'],
    'disgust': ['disgust'],
    'negative': ['negative'],
    'positive': ['positive']
}

# Funktion zur Emotionenzählung und Kategorisierung
def get_emotions_and_category(tokens, lexicon, emotion_categories):
    """
    Zählt Emotionen und kategorisiert sie basierend auf Tokens und NRC-Emotion-Lexikon.

    Args:
        tokens (list): Liste von Tokens aus der Spalte `clean_lyrics`.
        lexicon (DataFrame): Das NRC-Emotion-Lexikon.
        emotion_categories (dict): Mapping der Emotionen zu Kategorien.

    Returns:
        tuple: emotion_counts (Counter), category_counts (dict), sentiment (str)
    """
    if isinstance(tokens, str):  # Wenn Tokens als String vorliegen
        tokens = tokens.split()  # Zerlege den String in Tokens

    # Zähle Emotionen für die Tokens, die im Lexikon vorkommen
    emotions = lexicon[lexicon['word'].isin(tokens)]['emotion'].values
    emotion_counts = Counter(emotions)

    # Kategorisierung der Emotionen
    category_counts = {category: sum(emotion_counts[emotion] for emotion in emotions_list)
                       for category, emotions_list in emotion_categories.items()}

    # Klassifikation in Positiv oder Negativ
    is_positive = category_counts['joy'] > category_counts['sadness']
    sentiment = 'positive' if is_positive else 'negative'

    return emotion_counts, category_counts, sentiment

# Anwenden der Funktion auf die Spalte `clean_lyrics`
data['emotion_counts'], data['category_counts'], data['sentiment'] = zip(*data['clean_lyrics'].apply(lambda x: get_emotions_and_category(str(x), nrc_lexicon, emotion_categories)))

# Optional: Ergebnisse zur Übersicht ausgeben
print(data[['clean_lyrics', 'emotion_counts', 'category_counts', 'sentiment']].head())


# Ergebnis speichern
output_csv_path = '/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_with_emotions.csv'
data.to_csv(output_csv_path, index=False)

                                        clean_lyrics emotion_counts  \
0  ['caught', 'ride', 'south', 'dakota', 'two', '...             {}   
1  ['make', 'choice', 'never', 'really', 'doubt',...             {}   
2  ['instrumental', 'whole', 'world', 'already', ...             {}   
3  ['well', 'know', 'house', 'haunted', 'yeah', '...             {}   
4  ['take', 'e', 'train', 'babe', 'take', 'easy',...             {}   

                                     category_counts sentiment  
0  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
1  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
2  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
3  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
4  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  


In [12]:
from collections import Counter
import pandas as pd

# Lade die Eingabedatei
data = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_without_duplicates.csv')

# Überprüfen, ob die Spalte `clean_lyrics` korrekt ist
if 'clean_lyrics' not in data.columns:
    raise ValueError("Die Spalte 'clean_lyrics' ist nicht in den Daten enthalten. Bitte prüfen Sie die Eingabedatei.")

# Lade das NRC Emotion Lexikon
nrc_lexicon = pd.read_csv(
    '/content/drive/MyDrive/ie_scripting_datasets/Archive/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    header=None,
    names=['word', 'emotion', 'value']
)

# Nur relevante Einträge behalten
nrc_lexicon = nrc_lexicon[nrc_lexicon['value'] == 1]

# Zuordnung der Emotionen zu den Kategorien
emotion_categories = {
    'anger': ['anger'],
    'fear': ['fear'],
    'anticipation': ['anticipation'],
    'trust': ['trust'],
    'surprise': ['surprise'],
    'sadness': ['sadness'],
    'joy': ['joy'],
    'disgust': ['disgust']
}

# Funktion zur Emotionenzählung und Kategorisierung
def get_emotions_and_category(tokens, lexicon, emotion_categories):
    """
    Zählt Emotionen und kategorisiert sie basierend auf Tokens und NRC-Emotion-Lexikon.

    Args:
        tokens (str or list): Tokens oder Text aus der Spalte `clean_lyrics`.
        lexicon (DataFrame): Das NRC-Emotion-Lexikon.
        emotion_categories (dict): Mapping der Emotionen zu Kategorien.

    Returns:
        tuple: emotion_counts (Counter), category_counts (dict), sentiment (str)
    """
    if isinstance(tokens, str):  # Wenn Tokens als String vorliegen
        tokens = tokens.split()  # Zerlege den String in Tokens

    # Zähle Emotionen für die Tokens, die im Lexikon vorkommen
    emotions = lexicon[lexicon['word'].isin(tokens)]['emotion'].values
    emotion_counts = Counter(emotions)

    # Kategorisierung der Emotionen
    category_counts = {
        category: sum(emotion_counts[emotion] for emotion in emotions_list)
        for category, emotions_list in emotion_categories.items()
    }

    # Klassifikation in Positiv oder Negativ
    is_positive = category_counts.get('joy', 0) > category_counts.get('sadness', 0)
    sentiment = 'positive' if is_positive else 'negative'

    return emotion_counts, category_counts, sentiment

# Anwenden der Funktion auf die Spalte `clean_lyrics`
results = data['clean_lyrics'].apply(
    lambda x: get_emotions_and_category(str(x), nrc_lexicon, emotion_categories)
)

# Ergebnisse extrahieren und zu Spalten hinzufügen
data['emotion_counts'] = results.map(lambda x: x[0])
data['category_counts'] = results.map(lambda x: x[1])
data['sentiment'] = results.map(lambda x: x[2])

# Optional: Ergebnisse zur Übersicht ausgeben
print(data[['clean_lyrics', 'emotion_counts', 'category_counts', 'sentiment']].head())

# Ergebnis speichern
output_csv_path = '/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_with_emotions.csv'
data.to_csv(output_csv_path, index=False)
print(f"Ergebnis gespeichert: {output_csv_path}")


                                        clean_lyrics emotion_counts  \
0  ['caught', 'ride', 'south', 'dakota', 'two', '...             {}   
1  ['make', 'choice', 'never', 'really', 'doubt',...             {}   
2  ['instrumental', 'whole', 'world', 'already', ...             {}   
3  ['well', 'know', 'house', 'haunted', 'yeah', '...             {}   
4  ['take', 'e', 'train', 'babe', 'take', 'easy',...             {}   

                                     category_counts sentiment  
0  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
1  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
2  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
3  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
4  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
Ergebnis gespeichert: /content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_with_emotions.csv


In [13]:
from collections import Counter
import pandas as pd
import ast

# Lade die Eingabedatei
data = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_without_duplicates.csv')

# Lade das NRC Emotion Lexikon
nrc_lexicon = pd.read_csv(
    '/content/drive/MyDrive/ie_scripting_datasets/Archive/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    header=None,
    names=['word', 'emotion', 'value']
)

# Nur relevante Einträge behalten
nrc_lexicon = nrc_lexicon[nrc_lexicon['value'] == 1]

# Zuordnung der Emotionen zu den Kategorien
emotion_categories = {
    'anger': ['anger'],
    'fear': ['fear'],
    'anticipation': ['anticipation'],
    'trust': ['trust'],
    'surprise': ['surprise'],
    'sadness': ['sadness'],
    'joy': ['joy'],
    'disgust': ['disgust']
}

# Funktion zur Emotionenzählung und Kategorisierung
def get_emotions_and_category(tokens, lexicon, emotion_categories):
    """
    Zählt Emotionen und kategorisiert sie basierend auf Tokens und NRC-Emotion-Lexikon.

    Args:
        tokens (str or list): Tokens aus der Spalte `clean_lyrics`.
        lexicon (DataFrame): Das NRC-Emotion-Lexikon.
        emotion_categories (dict): Mapping der Emotionen zu Kategorien.

    Returns:
        tuple: emotion_counts (Counter), category_counts (dict), sentiment (str)
    """
    if isinstance(tokens, str):  # Wenn Tokens als String vorliegen
        tokens = ast.literal_eval(tokens)  # Konvertiere den String in eine echte Liste

    # Zähle Emotionen für die Tokens, die im Lexikon vorkommen
    emotions = lexicon[lexicon['word'].isin(tokens)]['emotion'].values
    emotion_counts = Counter(emotions)

    # Kategorisierung der Emotionen
    category_counts = {
        category: sum(emotion_counts[emotion] for emotion in emotions_list)
        for category, emotions_list in emotion_categories.items()
    }

    # Klassifikation in Positiv oder Negativ
    is_positive = category_counts.get('joy', 0) > category_counts.get('sadness', 0)
    sentiment = 'positive' if is_positive else 'negative'

    return emotion_counts, category_counts, sentiment

# Anwenden der Funktion auf die Spalte `clean_lyrics`
results = data['clean_lyrics'].apply(
    lambda x: get_emotions_and_category(x, nrc_lexicon, emotion_categories)
)

# Ergebnisse extrahieren und zu Spalten hinzufügen
data['emotion_counts'] = results.map(lambda x: x[0])
data['category_counts'] = results.map(lambda x: x[1])
data['sentiment'] = results.map(lambda x: x[2])

# Optional: Ergebnisse zur Übersicht ausgeben
print(data[['clean_lyrics', 'emotion_counts', 'category_counts', 'sentiment']].head())

# Ergebnis speichern
output_csv_path = '/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_with_emotions.csv'
data.to_csv(output_csv_path, index=False)
print(f"Ergebnis gespeichert: {output_csv_path}")


                                        clean_lyrics  \
0  ['caught', 'ride', 'south', 'dakota', 'two', '...   
1  ['make', 'choice', 'never', 'really', 'doubt',...   
2  ['instrumental', 'whole', 'world', 'already', ...   
3  ['well', 'know', 'house', 'haunted', 'yeah', '...   
4  ['take', 'e', 'train', 'babe', 'take', 'easy',...   

                                      emotion_counts  \
0  {'sadness': 2, 'joy': 2, 'positive': 4, 'negat...   
1  {'positive': 5, 'anticipation': 4, 'fear': 5, ...   
2  {'joy': 4, 'positive': 9, 'trust': 7, 'anger':...   
3  {'anticipation': 3, 'joy': 3, 'positive': 5, '...   
4  {'joy': 2, 'positive': 5, 'anticipation': 2, '...   

                                     category_counts sentiment  
0  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...  negative  
1  {'anger': 3, 'fear': 5, 'anticipation': 4, 'tr...  negative  
2  {'anger': 1, 'fear': 3, 'anticipation': 6, 'tr...  positive  
3  {'anger': 2, 'fear': 7, 'anticipation': 3, 'tr...  negative  
4

In [24]:
from collections import Counter
import pandas as pd
import ast

# Lade die Eingabedatei
data = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_without_duplicates.csv')

# Lade das NRC Emotion Lexikon
nrc_lexicon = pd.read_csv(
    '/content/drive/MyDrive/ie_scripting_datasets/Archive/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    header=None,
    names=['word', 'emotion', 'value']
)

# Nur relevante Einträge behalten
nrc_lexicon = nrc_lexicon[nrc_lexicon['value'] == 1]

# Zuordnung der Emotionen zu den Kategorien
emotion_categories = {
    'anger': ['anger'],
    'fear': ['fear'],
    'anticipation': ['anticipation'],
    'trust': ['trust'],
    'surprise': ['surprise'],
    'sadness': ['sadness'],
    'joy': ['joy'],
    'disgust': ['disgust']
}

# Berechnung der Sentiment-Ratio
def calculate_sentiment_ratio(row):
    category_counts = eval(row['category_counts'])  # String in Dictionary umwandeln
    positive = category_counts.get('positive', 0)
    negative = category_counts.get('negative', 0)
    if positive + negative > 0:
        return positive / (positive + negative)
    return 0  # Verhindert Division durch Null

# Spalte hinzufügen
data['sentiment_ratio'] = data.apply(calculate_sentiment_ratio, axis=1)



# Funktion zur Emotionenzählung und Kategorisierung
def get_emotions_and_category(tokens, lexicon, emotion_categories):
    if isinstance(tokens, str):  # Wenn Tokens als String vorliegen
        tokens = ast.literal_eval(tokens)  # Konvertiere den String in eine echte Liste

    # Zähle Emotionen für die Tokens, die im Lexikon vorkommen
    emotions = lexicon[lexicon['word'].isin(tokens)]['emotion'].values
    emotion_counts = Counter(emotions)

    # Kategorisierung der Emotionen
    category_counts = {
        category: sum(emotion_counts[emotion] for emotion in emotions_list)
        for category, emotions_list in emotion_categories.items()
    }

    # Klassifikation in Positiv oder Negativ
    is_positive = category_counts.get('joy', 0) > category_counts.get('sadness', 0)
    sentiment = 'positive' if is_positive else 'negative'

    return emotion_counts, category_counts, sentiment

# Anwenden der Funktion auf die Spalte `clean_lyrics`
results = data['clean_lyrics'].apply(
    lambda x: get_emotions_and_category(x, nrc_lexicon, emotion_categories)
)

# Ergebnisse extrahieren und zu Spalten hinzufügen
data['emotion_counts'] = results.map(lambda x: x[0])
data['category_counts'] = results.map(lambda x: x[1])
data['sentiment'] = results.map(lambda x: x[2])

# Funktion zur primären Emotionserkennung
def determine_primary_emotion(category_counts):
    """
    Bestimmt die primäre Emotion basierend auf den Kategoriezählungen.

    Args:
        category_counts (dict): Dictionary mit Emotionenkategorien und deren Zählungen.

    Returns:
        str: Primäre Emotion (Kategorie) oder 'neutral', falls keine Emotion dominiert.
    """
    if not category_counts:
        return 'neutral'
    # Finde die Kategorie mit der höchsten Zählung
    primary_emotion = max(category_counts, key=category_counts.get)
    # Wenn alle Werte 0 sind, return 'neutral'
    if category_counts[primary_emotion] == 0:
        return 'neutral'
    return primary_emotion

# Primäre Emotion bestimmen und als neue Spalte hinzufügen
data['primary_emotion'] = data['category_counts'].apply(determine_primary_emotion)

# Songs nach Emotionen aufteilen
emotion_groups = {
    emotion: data[data['primary_emotion'] == emotion]
    for emotion in emotion_categories.keys()
}

# Ergebnisse speichern (optional)
for emotion, group in emotion_groups.items():
    group.to_csv(f'/content/drive/MyDrive/ie_scripting_datasets/Archive/songs_emotion.csv', index=False)

# Übersicht ausgeben
print(data[['clean_lyrics', 'category_counts', 'primary_emotion']].head())
print(f"Songs wurden nach Emotionen aufgeteilt und gespeichert.")

# Ergebnis speichern
output_csv_path = '/content/drive/MyDrive/ie_scripting_datasets/Archive/songs_with_emotions_category.csv'
data.to_csv(output_csv_path, index=False)
print(f"Ergebnis gespeichert: {output_csv_path}")


KeyError: 'category_counts'

In [26]:
from collections import Counter
import pandas as pd
import ast

# Lade die Eingabedatei
data = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/processed_songs_without_duplicates.csv')

# Lade das NRC Emotion Lexikon
nrc_lexicon = pd.read_csv(
    '/content/drive/MyDrive/ie_scripting_datasets/Archive/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    header=None,
    names=['word', 'emotion', 'value']
)
nrc_lexicon = nrc_lexicon[nrc_lexicon['value'] == 1]

# Emotionenkategorien
emotion_categories = {
    'anger': ['anger'],
    'fear': ['fear'],
    'anticipation': ['anticipation'],
    'trust': ['trust'],
    'surprise': ['surprise'],
    'sadness': ['sadness'],
    'joy': ['joy'],
    'disgust': ['disgust']
}

# Funktion zur Emotionenzählung und Kategorisierung
def get_emotions_and_category(tokens, lexicon, emotion_categories):
    if isinstance(tokens, str):  # Wenn Tokens als String vorliegen
        tokens = ast.literal_eval(tokens)  # Konvertiere den String in eine echte Liste

    emotions = lexicon[lexicon['word'].isin(tokens)]['emotion'].values
    emotion_counts = Counter(emotions)
    category_counts = {
        category: sum(emotion_counts[emotion] for emotion in emotions_list)
        for category, emotions_list in emotion_categories.items()
    }
    is_positive = category_counts.get('joy', 0) > category_counts.get('sadness', 0)
    sentiment = 'positive' if is_positive else 'negative'
    return emotion_counts, category_counts, sentiment

# Anwenden der Funktion
results = data['clean_lyrics'].apply(
    lambda x: get_emotions_and_category(x, nrc_lexicon, emotion_categories)
)

# Ergebnisse extrahieren
data['emotion_counts'] = results.map(lambda x: x[0])
data['category_counts'] = results.map(lambda x: x[1])
data['sentiment'] = results.map(lambda x: x[2])

# Sentiment-Ratio berechnen
def calculate_sentiment_ratio(row):
    # Wenn row['category_counts'] ein String ist, konvertiere es in ein Dictionary
    if isinstance(row['category_counts'], str):
        category_counts = ast.literal_eval(row['category_counts'])
    else:
        category_counts = row['category_counts']  # Bereits ein Dictionary

    positive = category_counts.get('joy', 0)  # Kategorie "joy" als positiv betrachten
    negative = category_counts.get('sadness', 0)  # Kategorie "sadness" als negativ betrachten

    if positive + negative > 0:
        return positive / (positive + negative)
    return 0

data['sentiment_ratio'] = data.apply(calculate_sentiment_ratio, axis=1)


# Primäre Emotion bestimmen
def determine_primary_emotion(category_counts):
    if isinstance(category_counts, str):
        category_counts = ast.literal_eval(category_counts)
    if not category_counts:
        return 'neutral'
    primary_emotion = max(category_counts, key=category_counts.get)
    if category_counts[primary_emotion] == 0:
        return 'neutral'
    return primary_emotion
data['primary_emotion'] = data['category_counts'].apply(determine_primary_emotion)

# Songs nach Emotionen speichern
emotion_groups = {
    emotion: data[data['primary_emotion'] == emotion]
    for emotion in emotion_categories.keys()
}
for emotion, group in emotion_groups.items():
    group.to_csv(f'/content/drive/MyDrive/ie_scripting_datasets/Archive/songs_{emotion}.csv', index=False)

# Übersicht ausgeben
print(data[['clean_lyrics', 'category_counts', 'primary_emotion']].head())

# Gesamtergebnisse speichern
output_csv_path = '/content/drive/MyDrive/ie_scripting_datasets/Archive/songs_with_emotions_category.csv'
data.to_csv(output_csv_path, index=False)
print(f"Ergebnis gespeichert: {output_csv_path}")


                                        clean_lyrics  \
0  ['caught', 'ride', 'south', 'dakota', 'two', '...   
1  ['make', 'choice', 'never', 'really', 'doubt',...   
2  ['instrumental', 'whole', 'world', 'already', ...   
3  ['well', 'know', 'house', 'haunted', 'yeah', '...   
4  ['take', 'e', 'train', 'babe', 'take', 'easy',...   

                                     category_counts primary_emotion  
0  {'anger': 0, 'fear': 0, 'anticipation': 0, 'tr...         sadness  
1  {'anger': 3, 'fear': 5, 'anticipation': 4, 'tr...            fear  
2  {'anger': 1, 'fear': 3, 'anticipation': 6, 'tr...           trust  
3  {'anger': 2, 'fear': 7, 'anticipation': 3, 'tr...            fear  
4  {'anger': 0, 'fear': 0, 'anticipation': 2, 'tr...    anticipation  
Ergebnis gespeichert: /content/drive/MyDrive/ie_scripting_datasets/Archive/songs_with_emotions_category.csv


In [22]:
import pandas as pd

# Datei laden
df = pd.read_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/songs_with_emotions_category.csv')

# Nur die gewünschten Spalten auswählen
selected_columns = df[['name', 'artists', 'sentiment', 'primary_emotion']]

# Neue Datei speichern
selected_columns.to_csv('/content/drive/MyDrive/ie_scripting_datasets/Archive/only_emotions_category.csv.csv', index=False)

