In [1]:
# Configuration
import os
DATA_DIR = './lyrics_lens_data'
DELTA_DIR = os.path.join(DATA_DIR, 'delta_tables')
os.makedirs(DELTA_DIR, exist_ok=True)

MAX_SCRAPED_RECORDS = 100
REQUEST_SLEEP = (1.0, 2.0)  # polite delays (seconds)
SEED_QUERIES = [
    "Adele Hello", "Ed Sheeran Shape of You", "Billie Eilish bad guy", "Coldplay Yellow", "Taylor Swift Love Story",
    "Drake Hotline Bling", "The Weeknd Blinding Lights", "Bruno Mars Uptown Funk"
]
print('Configuration set. Data dir:', DELTA_DIR)

Configuration set. Data dir: ./lyrics_lens_data/delta_tables


In [3]:
# Genius scraper (HTML-based, direct requests, polite delays)
# NOTE: Respect robots.txt and terms of service. This is educational for capstone use.
import requests, time, random
from bs4 import BeautifulSoup
import pandas as pd

def get_request_simple(url, timeout=15):
    headers = {'User-Agent': 'LyricsLensBot/1.0 (+https://example.com)'}
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.text

def genius_search_results(query):
    base = "https://genius.com"
    search_url = f"{base}/search?q={requests.utils.quote(query)}"
    html = get_request_simple(search_url)
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for a in soup.select("a.mini_card"):
        href = a.get('href')
        if href and href.startswith(base):
            links.append(href)
    for a in soup.select("a[href]"):
        href = a.get('href')
        if href and href.startswith(base) and href.endswith("-lyrics"):
            links.append(href)
    seen = set(); out = []
    for l in links:
        if l not in seen:
            seen.add(l); out.append(l)
    return out

def scrape_genius_song(url):
    html = get_request_simple(url)
    soup = BeautifulSoup(html, "html.parser")
    title_tag = soup.find("h1", {"class": "header_with_cover_art-primary_info-title"})
    if not title_tag:
        title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else ""
    artist_tag = soup.select_one("a.header_with_cover_art-primary_info-primary_artist")
    artist = artist_tag.get_text(strip=True) if artist_tag else ""
    lyrics_containers = soup.select("div[data-lyrics-container='true']")
    if lyrics_containers:
        lyrics = "\n".join([c.get_text(separator="\n").strip() for c in lyrics_containers])
    else:
        dug = soup.find("div", class_="lyrics")
        if dug:
            lyrics = dug.get_text(separator="\n").strip()
        else:
            divs = soup.find_all("div")
            lyrics = max(divs, key=lambda d: len(d.get_text())).get_text(separator="\n").strip() if divs else ""
    return {"artist": artist, "title": title, "lyrics": lyrics, "url": url}

# Collect songs (up to MAX_SCRAPED_RECORDS)
collected = []
visited = set()
for q in SEED_QUERIES:
    if len(collected) >= MAX_SCRAPED_RECORDS: break
    try:
        results = genius_search_results(q)
    except Exception as e:
        print('Search failed for', q, e); continue
    for url in results:
        if len(collected) >= MAX_SCRAPED_RECORDS: break
        if url in visited: continue
        try:
            time.sleep(random.uniform(*REQUEST_SLEEP))
            song = scrape_genius_song(url)
            if song and song.get('lyrics'):
                collected.append(song); visited.add(url)
                print('Collected:', song['artist'], '-', song['title'])
        except Exception as e:
            print('Failed:', url, e); continue

songs_df = pd.DataFrame(collected)
print('Total songs collected:', len(songs_df))


Collected:  - Dâ€™Elmar Zaya
Collected:  - Bis Hierhin
Collected:  - Losinâ€™ Streak
Collected:  - Love in a Bottle
Collected:  - VOX POPULI
Total songs collected: 5


In [5]:
# Preprocessing: clean lyrics, tokenize, lemmatize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/devanshisharma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/devanshisharma/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/devanshisharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/devanshisharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# --- Text Preprocessing Function ---
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required resources (safe to run multiple times)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Clean, tokenize, remove stopwords, and lemmatize the input lyrics.
    Returns a processed string ready for NLP tasks.
    """
    if not isinstance(text, str) or text.strip() == "":
        return ""
    text = text.replace('\r', ' ').replace('\n', ' ').lower()
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha() and t not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/devanshisharma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/devanshisharma/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/devanshisharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/devanshisharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
songs_df['clean_lyrics'] = songs_df['lyrics'].fillna('').apply(preprocess_text)


In [15]:
!pip install vaderSentiment





In [17]:
!pip install textblob




In [19]:
# Sentiment analysis (VADER + TextBlob)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
analyzer = SentimentIntensityAnalyzer()

def compute_sentiments(text):
    if not text: return {'vader_compound':0.0,'polarity_tb':0.0,'subjectivity_tb':0.0}
    v = analyzer.polarity_scores(text)
    tb = TextBlob(text)
    return {'vader_compound': v['compound'], 'polarity_tb': tb.sentiment.polarity, 'subjectivity_tb': tb.sentiment.subjectivity}

sentiments = songs_df['lyrics'].apply(compute_sentiments).apply(pd.Series)
songs_df = pd.concat([songs_df, sentiments], axis=1)
songs_df[['artist','title','vader_compound','polarity_tb']].head()


Unnamed: 0,artist,title,vader_compound,polarity_tb
0,,Dâ€™Elmar Zaya,-0.9919,-0.041667
1,,Bis Hierhin,-0.9953,0.2
2,,Losinâ€™ Streak,0.9594,0.0685
3,,Love in a Bottle,0.9941,0.196984
4,,VOX POPULI,0.9943,0.096463


In [21]:
# Topic modeling (LDA) and clustering (TF-IDF + KMeans)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import gensim
from gensim import corpora

texts = [doc.split() for doc in songs_df['clean_lyrics'] if doc]
if texts:
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=min(5, max(1, len(dictionary)//10)), passes=10)
    print('LDA topics:'); print(lda.print_topics(num_words=5))

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(songs_df['clean_lyrics'])
svd = TruncatedSVD(n_components=min(50, X.shape[1]-1 if X.shape[1]>1 else 1))
X_reduced = svd.fit_transform(X)
k = min(5, max(1, len(songs_df)//2))
km = KMeans(n_clusters=k, random_state=42)
songs_df['cluster'] = km.fit_predict(X_reduced)
songs_df[['artist','title','cluster']].head()


LDA topics:
[(0, '0.044*"vox" + 0.027*"spoken" + 0.023*"crowd" + 0.015*"got" + 0.013*"hell"'), (1, '0.002*"er" + 0.002*"die" + 0.002*"der" + 0.002*"bi" + 0.002*"da"'), (2, '0.026*"der" + 0.022*"die" + 0.012*"mir" + 0.012*"wieder" + 0.012*"auf"'), (3, '0.042*"bottle" + 0.037*"love" + 0.032*"la" + 0.019*"husk" + 0.016*"another"'), (4, '0.035*"er" + 0.020*"die" + 0.018*"bi" + 0.015*"sie" + 0.013*"da"')]


Unnamed: 0,artist,title,cluster
0,,Dâ€™Elmar Zaya,0
1,,Bis Hierhin,0
2,,Losinâ€™ Streak,1
3,,Love in a Bottle,1
4,,VOX POPULI,1


In [23]:
songs_df.columns



Index(['artist', 'title', 'lyrics', 'url', 'clean_lyrics', 'vader_compound',
       'polarity_tb', 'subjectivity_tb', 'cluster'],
      dtype='object')

In [25]:
# Create lyric length feature if missing
if 'lyric_len' not in songs_df.columns:
    songs_df['lyric_len'] = songs_df['clean_lyrics'].apply(lambda t: len(str(t).split()))


In [29]:
import numpy as np


In [31]:
songs_df['popularity'] = (songs_df['lyric_len'] * 0.01 + (songs_df['vader_compound'] + 1) * 0.5) + np.random.normal(0, 0.1, len(songs_df))
songs_df['popular_label'] = (songs_df['popularity'] > songs_df['popularity'].median()).astype(int)


In [33]:
# Popularity prediction (proxy popularity + classification)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

songs_df['popularity'] = (songs_df['lyric_len'] * 0.01 + (songs_df['vader_compound'] + 1) * 0.5) + np.random.normal(0, 0.1, len(songs_df))
songs_df['popular_label'] = (songs_df['popularity'] > songs_df['popularity'].median()).astype(int)

features = X_reduced
labels = songs_df['popular_label'].values
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

clf_lr = LogisticRegression(max_iter=500)
clf_lr.fit(X_train, y_train)
y_pred_lr = clf_lr.predict(X_test)

clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

print('Logistic Regression:\n', classification_report(y_test, y_pred_lr))
print('Random Forest:\n', classification_report(y_test, y_pred_rf))

# Spotify lookup (optional)
spotify_template = """# To fetch Spotify URLs, install spotipy and set env vars SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET
# Uncomment and run after installing spotipy:
# import spotipy
# from spotipy.oauth2 import SpotifyClientCredentials
# client_id = os.getenv('SPOTIFY_CLIENT_ID'); client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')
# if not client_id or not client_secret: raise ValueError('Set SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET')
# sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))
# def spotify_search_track(artist, title):
#     q = f"track:{title} artist:{artist}"
#     res = sp.search(q, type='track', limit=1)
#     items = res.get('tracks', {}).get('items', [])
#     if items: return items[0]['external_urls']['spotify'], items[0]['popularity'], items[0]['id']
#     return None, None, None
# songs_df['spotify_url'], songs_df['spotify_popularity'], songs_df['spotify_id'] = zip(*songs_df.apply(lambda r: spotify_search_track(r['artist'], r['title']), axis=1))
"""
print(spotify_template)


Logistic Regression:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

Random Forest:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

# To fetch Spotify URLs, install spotipy and set env vars SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET
# Uncomment and run after installing spotipy:
# import spotipy
# from spotipy.oauth2 import SpotifyClientCredentials
# client_id = os.getenv('SPOTIFY_CLIENT_ID'); client_secret = os.getenv('SPOTIFY_CLIENT_S

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
# Social media scraping template (snscrape)
# Requires: pip install snscrape
try:
    import snscrape.modules.twitter as sntwitter
    social_samples = []
    for _, row in songs_df.iterrows():
        q = f'"{row.title}" {row.artist} -filter:retweets'
        tweets = []
        for i, t in enumerate(sntwitter.TwitterSearchScraper(q).get_items()):
            if i >= 50: break
            tweets.append({'date': t.date, 'id': t.id, 'user': t.user.username, 'content': t.content})
        if tweets:
            df_t = pd.DataFrame(tweets)
            df_t['vader'] = df_t['content'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
            social_samples.append({'artist': row.artist, 'title': row.title, 'tweets': df_t})
    print('Fetched social samples for', len(social_samples))
except Exception as e:
    print('snscrape not available or failed:', e)


snscrape not available or failed: No module named 'snscrape'


In [37]:
# Social graph (co-mention graph)
import networkx as nx
G = nx.Graph()
try:
    for sample in social_samples:
        df_t = sample['tweets']
        for _, t in df_t.iterrows():
            user = t['user']; content = t['content']
            mentions = [w[1:] for w in content.split() if w.startswith('@')]
            G.add_node(user)
            for m in mentions:
                G.add_node(m)
                if G.has_edge(user, m):
                    G[user][m]['weight'] += 1
                else:
                    G.add_edge(user, m, weight=1)
    print('Graph:', G.number_of_nodes(), 'nodes,', G.number_of_edges(), 'edges')
except Exception:
    print('No social graph built.')


No social graph built.


In [39]:
!pip install deltalake




In [41]:
from deltalake import DeltaTable, write_deltalake
import pandas as pd

# Write your dataframe to Delta format
write_deltalake("lyrics_lens_delta", songs_df)

# Read it back
dt = DeltaTable("lyrics_lens_delta")
df_reload = dt.to_pandas()

print("Rows:", len(df_reload))
df_reload.head()


Rows: 5


Unnamed: 0,artist,title,lyrics,url,clean_lyrics,vader_compound,polarity_tb,subjectivity_tb,cluster,lyric_len,popularity,popular_label
0,,Dâ€™Elmar Zaya,3 Contributors\nDâ€™Elmar Zaya Lyrics\n[Songtext...,https://genius.com/Sadiq-delmar-zaya-lyrics,contributor elmar zaya lyric songtext zu zaya ...,-0.9919,-0.041667,0.141667,0,376,3.812166,1
1,,Bis Hierhin,1 Contributor\nBis Hierhin Lyrics\n[Songtext z...,https://genius.com/Kaisa-natron-bis-hierhin-ly...,contributor bi hierhin lyric songtext zu bi hi...,-0.9953,0.2,0.6,0,327,3.216258,0
2,,Losinâ€™ Streak,14 Contributors\nTranslations\nPortuguÃªs\nItal...,https://genius.com/Blake-roman-sam-haft-and-an...,contributor translation portuguÃªs italiano los...,0.9594,0.0685,0.522,1,81,1.815955,0
3,,Love in a Bottle,15 Contributors\nTranslations\nItaliano\nLove ...,https://genius.com/Keith-david-lilli-cooper-ki...,contributor translation italiano love bottle l...,0.9941,0.196984,0.491825,1,166,2.59982,0
4,,VOX POPULI,27 Contributors\nTranslations\nItaliano\nVOX P...,https://genius.com/Jeremy-jordan-christian-bor...,contributor translation italiano vox populi ly...,0.9943,0.096463,0.584596,1,342,4.404194,1


In [43]:
# Save songs_df to Delta format (deltalake)
from deltalake import write_deltalake, read_deltalake
import json
songs_write_df = songs_df.copy()
songs_write_df['lyrics'] = songs_write_df['lyrics'].astype(str)
songs_write_df['clean_lyrics'] = songs_write_df['clean_lyrics'].astype(str)
songs_write_df['nrc_emotions'] = songs_write_df.get('nrc_emotions', pd.Series([{}]*len(songs_write_df))).astype(str)
songs_delta_path = os.path.join(DELTA_DIR, 'songs.delta')
write_deltalake(songs_delta_path, songs_write_df, mode='overwrite')
print('Wrote songs delta to', songs_delta_path)
df_read = read_deltalake(songs_delta_path)
print('Read back rows:', len(df_read))


ImportError: cannot import name 'read_deltalake' from 'deltalake' (/opt/anaconda3/lib/python3.12/site-packages/deltalake/__init__.py)

In [47]:
# Save trained models (joblib)
import joblib, os
models_dir = os.path.join(DATA_DIR, 'models')
os.makedirs(models_dir, exist_ok=True)
joblib.dump(clf_rf, os.path.join(models_dir, 'rf_popularity.pkl'))
joblib.dump(clf_lr, os.path.join(models_dir, 'lr_popularity.pkl'))
print('Saved models to', models_dir)


Saved models to ./lyrics_lens_data/models


In [49]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns


In [53]:
%whos


Variable                     Type                          Data/Info
--------------------------------------------------------------------
BeautifulSoup                type                          <class 'bs4.BeautifulSoup'>
DATA_DIR                     str                           ./lyrics_lens_data
DELTA_DIR                    str                           ./lyrics_lens_data/delta_tables
DeltaTable                   type                          <class 'deltalake.table.DeltaTable'>
G                            Graph                         Graph with 0 nodes and 0 edges
KMeans                       ABCMeta                       <class 'sklearn.cluster._kmeans.KMeans'>
LogisticRegression           type                          <class 'sklearn.linear_mo<...>stic.LogisticRegression'>
MAX_SCRAPED_RECORDS          int                           100
MultiLabelBinarizer          type                          <class 'sklearn.preproces<...>bel.MultiLabelBinarizer'>
REQUEST_SLEEP              

In [55]:
# --- TEST LOGISTIC REGRESSION & RANDOM FOREST ---

print("ðŸ“Œ TEST RESULTS: PREDICTING SONG POPULARITY")
print("\n--- Logistic Regression ---")
y_pred_lr = clf_lr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

print("\n--- Random Forest ---")
y_pred_rf = clf_rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


ðŸ“Œ TEST RESULTS: PREDICTING SONG POPULARITY

--- Logistic Regression ---
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2


--- Random Forest ---
Accuracy: 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [57]:
print("ðŸ“Œ KMEANS CLUSTERING TEST")
print("Cluster Labels:", labels)
print("Silhouette Score:", silhouette_score(X_reduced, labels))


ðŸ“Œ KMEANS CLUSTERING TEST
Cluster Labels: [1 0 0 0 1]
Silhouette Score: -0.02078736038327999


In [59]:
print("ðŸ“Œ TOPIC MODELING TEST (LDA)")

for idx, topic in lda.print_topics(num_topics=2, num_words=10):
    print(f"\nTopic {idx}: {topic}")


ðŸ“Œ TOPIC MODELING TEST (LDA)

Topic 4: 0.035*"er" + 0.020*"die" + 0.018*"bi" + 0.015*"sie" + 0.013*"da" + 0.013*"hierhin" + 0.011*"gut" + 0.011*"lief" + 0.011*"hier" + 0.011*"um"

Topic 0: 0.044*"vox" + 0.027*"spoken" + 0.023*"crowd" + 0.015*"got" + 0.013*"hell" + 0.009*"populi" + 0.009*"yeah" + 0.009*"could" + 0.009*"new" + 0.009*"let"


In [61]:
print("ðŸ“Œ SENTIMENT TEST (VADER + TextBlob)\n")

sample_text = songs_df['lyrics'].iloc[0]

vader_result = analyzer.polarity_scores(sample_text)
textblob_result = TextBlob(sample_text).sentiment

print("Sample lyric:", sample_text[:200], "...\n")
print("VADER:", vader_result)
print("TextBlob:", textblob_result)


ðŸ“Œ SENTIMENT TEST (VADER + TextBlob)

Sample lyric: 3 Contributors
Dâ€™Elmar Zaya Lyrics
[Songtext zu â€žD'Elmar Zayaâ€œ]
[Intro]
Ãœberall, wo wir hingeh'n, hinterlassen wir Spuren
Ob es deine DNA ist, deine FingerabdrÃ¼cke oder dein Geruch
Die Geruchsspur'n n ...

VADER: {'neg': 0.096, 'neu': 0.896, 'pos': 0.008, 'compound': -0.9919}
TextBlob: Sentiment(polarity=-0.041666666666666664, subjectivity=0.14166666666666666)


In [63]:
def full_pipeline_predict(lyric):
    clean = preprocess_text(lyric)
    vec = vectorizer.transform([clean])
    reduced = svd.transform(vec)
    
    pred_lr = clf_lr.predict(reduced)[0]
    pred_rf = clf_rf.predict(reduced)[0]

    return {
        "clean_lyric": clean,
        "logistic_prediction": pred_lr,
        "random_forest_prediction": pred_rf
    }

print("ðŸ“Œ FULL PIPELINE TEST")
test_output = full_pipeline_predict("I feel happy and alive, dancing in the golden light")
test_output


ðŸ“Œ FULL PIPELINE TEST


{'clean_lyric': 'feel happy alive dancing golden light',
 'logistic_prediction': 0,
 'random_forest_prediction': 0}

In [65]:
dt = DeltaTable("lyrics_lens_delta")
df_reload = dt.to_pandas()

print("Delta Table Loaded Successfully.")
print(df_reload.head())


Delta Table Loaded Successfully.
  artist             title                                             lyrics  \
0             Dâ€™Elmar Zaya  3 Contributors\nDâ€™Elmar Zaya Lyrics\n[Songtext...   
1              Bis Hierhin  1 Contributor\nBis Hierhin Lyrics\n[Songtext z...   
2            Losinâ€™ Streak  14 Contributors\nTranslations\nPortuguÃªs\nItal...   
3         Love in a Bottle  15 Contributors\nTranslations\nItaliano\nLove ...   
4               VOX POPULI  27 Contributors\nTranslations\nItaliano\nVOX P...   

                                                 url  \
0        https://genius.com/Sadiq-delmar-zaya-lyrics   
1  https://genius.com/Kaisa-natron-bis-hierhin-ly...   
2  https://genius.com/Blake-roman-sam-haft-and-an...   
3  https://genius.com/Keith-david-lilli-cooper-ki...   
4  https://genius.com/Jeremy-jordan-christian-bor...   

                                        clean_lyrics  vader_compound  \
0  contributor elmar zaya lyric songtext zu zaya ...         -0.