In [None]:
# Pip installs
!pip install git+https://github.com/johnwmillr/LyricsGenius.git
!pip install -q sentencepiece
!pip install -q transformers
!pip install -q evaluate
import evaluate
!pip install -q rouge_score
#let's make longer output readable without horizontal scrolling
from pprint import pprint

In [None]:
# imports
import requests
from tabulate import tabulate
import csv
import lyricsgenius
import pandas as pd
import re

In [None]:
# Genius setup
# The API token is a secret and must not be stored in the notebook. It is read
# from the GENIUS_API_TOKEN environment variable; if that is unset, the user is
# prompted for it (input is not echoed and is not saved in the cell output).
# NOTE(review): a token was previously committed in this cell — revoke it.
import os
from getpass import getpass

GENIUS_API_TOKEN = os.environ.get("GENIUS_API_TOKEN") or getpass("Genius API token: ")
genius = lyricsgenius.Genius(GENIUS_API_TOKEN)

# Set up the base URL and headers for the Genius API
base_url = "https://api.genius.com"
headers = {'Authorization': f'Bearer {GENIUS_API_TOKEN}'}

In [None]:
import csv
def export_song_data_to_csv(song_data_list, artist_name):
    """Write song records to ``<artist>_top_50_songs.csv`` in the working directory.

    Args:
        song_data_list: Iterable of dicts keyed by the column names listed in
            ``fieldnames`` below (a dict with any other key makes
            ``csv.DictWriter`` raise ``ValueError``).
        artist_name: Artist name used as the filename prefix. Path separators
            in the name are replaced with ``_``.
    """
    # Define all fieldnames, ensuring "Lyrics" is included
    fieldnames = ['Song ID', 'Title', 'Lyrics URL', 'Combined Annotations', 'Wikipedia Annotation', 'Lyrics']

    # A name such as "AC/DC" would otherwise be read as a directory plus a file
    # and make open() fail because the directory does not exist.
    safe_artist_name = artist_name.replace('/', '_').replace('\\', '_')
    filename = f"{safe_artist_name}_top_50_songs.csv"

    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()  # Header row
        writer.writerows(song_data_list)  # One row per song dict

In [None]:
# Step 1: Function to search for an artist and retrieve the artist ID
def get_artist_id(artist_name):
    """Look up an artist's Genius ID via the Genius search endpoint.

    Args:
        artist_name: Artist name to search for. It is compared
            case-insensitively against each hit's primary artist name.

    Returns:
        The ``id`` of the first hit whose primary artist name equals
        ``artist_name`` (ignoring case), or ``None`` when the request does not
        return HTTP 200 or no hit matches.
    """
    response = requests.get(
        f"{base_url}/search",
        headers=headers,
        params={'q': artist_name},
        timeout=10,  # without a timeout a stalled connection blocks the notebook forever
    )
    if response.status_code != 200:
        return None

    wanted = artist_name.lower()  # loop-invariant, so computed once
    for hit in response.json()['response']['hits']:
        primary_artist = hit['result']['primary_artist']
        if primary_artist['name'].lower() == wanted:
            return primary_artist['id']

    return None

In [None]:
# Step 2: Function to get all songs by artist ID, but only those with annotations
def get_songs_with_annotations_by_artist_id(artist_id, per_page=50, max_songs=200):
    """Page through an artist's songs on Genius, keeping only annotated ones.

    Args:
        artist_id: Genius artist ID, interpolated into the request URL.
        per_page: Value sent as the ``per_page`` query parameter.
        max_songs: Upper bound on the number of songs returned.

    Returns:
        A list of at most ``max_songs`` song dicts whose ``annotation_count``
        is greater than zero, in the order the API returned them. Paging stops
        at the first non-200 response or the first empty page.
    """
    songs_with_annotations = []
    songs_url = f"{base_url}/artists/{artist_id}/songs"
    page = 1

    # Keep fetching songs until we hit the limit or run out of pages
    while len(songs_with_annotations) < max_songs:
        response = requests.get(
            songs_url,
            headers=headers,
            params={'page': page, 'per_page': per_page},
            timeout=10,  # avoid hanging forever on a stalled connection
        )
        if response.status_code != 200:
            break

        new_songs = response.json()['response']['songs']
        if not new_songs:
            break  # Stop if no more songs are returned

        # Filter only songs with annotations
        songs_with_annotations.extend(
            song for song in new_songs if song['annotation_count'] > 0
        )
        page += 1

    # A whole page is added at a time, so the last page can overshoot the limit.
    return songs_with_annotations[:max_songs]

In [None]:
# Step 3: Function to get annotations for a song using the song ID and fetch song URL
def _collect_dom_text(node, fragments):
    """Append every text fragment found under ``node`` to ``fragments`` in document order."""
    if isinstance(node, str):
        fragments.append(node)
    elif isinstance(node, dict):
        for child in node.get('children', []):
            _collect_dom_text(child, fragments)


def get_song_annotations_and_url(song_id):
    """Fetch a song's URL and the text of its description from the Genius API.

    Args:
        song_id: Genius song ID, interpolated into the request URL.

    Returns:
        ``(lyrics_url, combined_annotations)``. ``lyrics_url`` is the song's
        ``url`` field, or ``'N/A'`` if absent. ``combined_annotations`` is
        every text fragment of the description DOM joined with single spaces,
        or ``''`` when there is no description. Returns ``(None, None)`` and
        prints a message when the response is not HTTP 200.
    """
    response = requests.get(
        f"{base_url}/songs/{song_id}",
        headers=headers,
        timeout=10,  # avoid hanging forever on a stalled connection
    )
    if response.status_code != 200:
        print("Failed to fetch song data.")
        return None, None

    song_data = response.json()['response']['song']
    lyrics_url = song_data.get('url', 'N/A')  # Fetch the lyrics URL

    # Walk the whole DOM tree: text nested inside inline tags (for example a
    # link within a paragraph) sits more than two levels deep and would
    # otherwise be dropped, leaving gaps in the sentences.
    fragments = []
    description = song_data.get('description') or {}
    _collect_dom_text(description.get('dom'), fragments)

    # Combine all annotations into one string
    return lyrics_url, " ".join(fragments)

In [None]:
# Define the get_lyrics function to fetch lyrics from Lyrics.ovh API
def get_lyrics(artist_name, song_title):
    """Fetch lyrics for a song from the Lyrics.ovh API (best effort).

    Args:
        artist_name: Artist name; used as a URL path segment.
        song_title: Song title; used as a URL path segment.

    Returns:
        The lyrics string, or ``None`` when the request fails, the response is
        not HTTP 200 or not valid JSON, or the lyrics are missing or blank.
    """
    from urllib.parse import quote

    # Percent-encode each segment (safe='' also encodes '/'): names like
    # "AC/DC" or titles containing '?' or '#' would otherwise change the URL's
    # path or query structure.
    url = (
        "https://api.lyrics.ovh/v1/"
        f"{quote(artist_name, safe='')}/{quote(song_title, safe='')}"
    )
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not JSON. Unlike a bare `except:`,
        # this leaves KeyboardInterrupt alone, so a long scraping loop can
        # still be interrupted.
        return None

    lyrics = data.get("lyrics") if isinstance(data, dict) else None
    if isinstance(lyrics, str) and lyrics.strip():
        return lyrics
    return None

In [None]:
import requests

# Function to fetch Wikipedia annotation for a given song title
def get_wikipedia_annotation(song_title):
    """Return the plain-text extract of the first English Wikipedia search hit.

    Two requests are made: an ``opensearch`` query for ``song_title``, then an
    ``extracts`` query for the first title returned.

    Args:
        song_title: Text to search Wikipedia for.

    Returns:
        The page's ``extract`` text, or ``"No Wikipedia annotation found"``
        when either request does not return HTTP 200, the search has no
        results, or the page has no extract.
    """
    not_found = "No Wikipedia annotation found"
    api_url = "https://en.wikipedia.org/w/api.php"

    search_response = requests.get(
        api_url,
        params={"action": "opensearch", "format": "json", "search": song_title},
        timeout=10,
    )
    # An error response (e.g. rate limiting) may not carry JSON, so do not parse it.
    if search_response.status_code != 200:
        return not_found

    search_data = search_response.json()
    # opensearch answers with a list whose second item holds the matching titles.
    has_titles = isinstance(search_data, list) and len(search_data) > 1
    titles = search_data[1] if has_titles else []
    if not titles:
        return not_found

    page_response = requests.get(
        api_url,
        params={
            "action": "query",
            "format": "json",
            "prop": "extracts",
            "titles": titles[0],  # first search result
            "explaintext": True,  # Fetch plain text for easy processing
        },
        timeout=10,
    )
    if page_response.status_code != 200:
        return not_found

    pages = page_response.json().get("query", {}).get("pages", {})
    # Only the first page entry is used.
    first_page = next(iter(pages.values()), None)
    if first_page is None:
        return not_found
    return first_page.get("extract", not_found)


In [None]:
from tabulate import tabulate
import re
import time
import pandas as pd

def get_top_50_songs_with_annotations_by_artist(artist_name):
    """Collect up to 50 songs for an artist, ranked by annotation length.

    Only songs for which ``get_lyrics`` returns lyrics are kept. Songs are
    ordered by descending length of their combined annotations, then by title.

    Args:
        artist_name: Artist to look up; also used to check Wikipedia extracts.

    Returns:
        ``None`` if the artist or their annotated songs cannot be found;
        otherwise a list (possibly empty) of at most 50 dicts with the keys
        ``'Song ID'``, ``'Title'``, ``'Lyrics URL'``, ``'Combined Annotations'``,
        ``'Wikipedia Annotation'`` and ``'Lyrics'``.
    """
    artist_id = get_artist_id(artist_name)
    if not artist_id:
        return None

    songs = get_songs_with_annotations_by_artist_id(artist_id)
    if not songs:
        return None

    song_data_list = []
    for song in songs:
        lyrics = get_lyrics(artist_name, song['title'])
        # Only process songs that have lyrics
        if not lyrics:
            continue

        lyrics_url, combined_annotations = get_song_annotations_and_url(song['id'])
        song_data_list.append({
            'Song ID': song['id'],
            'Title': song['title'],
            'Lyrics URL': lyrics_url,
            # The fetch helper returns (None, None) on failure; store '' so the
            # len()-based sort below cannot raise TypeError.
            'Combined Annotations': combined_annotations or "",
            # Placeholder keeps the column order; filled in for the top 50 below.
            'Wikipedia Annotation': None,
            'Lyrics': lyrics,
        })

    # Longest annotations first; ties broken alphabetically by title.
    song_data_list.sort(key=lambda entry: (-len(entry['Combined Annotations']), entry['Title']))
    top_50_songs = song_data_list[:50]

    # Query Wikipedia only for the songs that are kept (two HTTP requests each).
    artist_lower = artist_name.lower()
    for song_data in top_50_songs:
        wikipedia_annotation = get_wikipedia_annotation(song_data['Title'])
        # Discard extracts that never mention the artist.
        if artist_lower not in wikipedia_annotation.lower():
            wikipedia_annotation = "No Wikipedia annotation found (artist name not mentioned)"
        song_data['Wikipedia Annotation'] = wikipedia_annotation

    return top_50_songs

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Scrape each artist in `artist_names` and write one CSV per artist to Drive.
# Failures are printed and appended to error_log.txt; the loop then moves on.
from google.colab import drive
drive.mount('/content/drive')

# The folder ID "10Fevi_myWRyxiT6waMFhoPANW9r1jrCg"
# NOTE(review): folder_id is not used anywhere in this cell — confirm it is still needed.
folder_id = "10Fevi_myWRyxiT6waMFhoPANW9r1jrCg"
save_folder = "/content/drive/MyDrive/genius_lyrics_outputs"

# Create the folder if it doesn't exist (exist_ok avoids the check-then-create race)
import os
os.makedirs(save_folder, exist_ok=True)

#done with "Morgan Wallen", "SZA", "Taylor Swift", "Drake", "Luke Combs", Miley
artist_names = ["Justin Timberlake", "AC/DC", "Metallica",
    "The Beach Boys", "Seventeen", "The Red Clay Strays", "Creedence Clearwater Revival",
    "Crosby, Stills, Nash & Young", "Linkin Park", "Green Day", "Maroon 5", "Black Eyed Peas", "ABBA",
]

# Process each artist
total_artists = len(artist_names)
for index, artist_name in enumerate(artist_names, 1):
    print(f"Processing {artist_name} ({index}/{total_artists})...", end=" ", flush=True)

    try:
        songs_data = get_top_50_songs_with_annotations_by_artist(artist_name)

        if songs_data:
            # Drop everything except word characters, whitespace and '-',
            # then turn spaces into underscores, so the name is a valid filename.
            safe_filename = re.sub(r'[^\w\s-]', '', artist_name).replace(' ', '_')
            csv_filename = os.path.join(save_folder, f"{safe_filename}_top_50_songs.csv")

            df = pd.DataFrame(songs_data)
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"Done! ({len(songs_data)} songs)")
        else:
            print("No data found")

    except Exception as e:
        print(f"Error: {str(e)}")
        error_log_path = os.path.join(save_folder, 'error_log.txt')
        # Explicit encoding: artist names and error text may be non-ASCII.
        with open(error_log_path, 'a', encoding='utf-8') as f:
            f.write(f"{artist_name}: {str(e)}\n")

    # Pause between artists after failures too, so an error does not send the
    # next artist's requests immediately.
    time.sleep(2)

print("\nAll processing complete!")

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the Taylor Swift CSV from Drive into a DataFrame and preview it.
# NOTE(review): this path ('Colab Notebooks/..._with_annotations.csv') differs from
# the folder and filename pattern the scraping loop writes — confirm the file exists.
taylor = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Taylor Swift_top_50_songs_with_annotations.csv')
taylor.head()

In [None]:
!ls /content/drive/MyDrive

In [None]:
# Function to clean up the text
# Print the 'Combined Annotations' column as loaded; this runs before
# clean_text is applied to the column further down in this cell.
print(taylor['Combined Annotations'])

def clean_text(text):
    """Normalize annotation text before summarization.

    Steps, in order: collapse whitespace runs to one space; replace em-dashes
    (with surrounding whitespace) by a single space; delete whitespace that
    precedes ``,`` or ``.``; drop every character that is not a word
    character, whitespace, or one of ``. , ! ?``; collapse any double spaces
    left by those removals; strip the ends.

    Args:
        text: The text to clean. Non-string values (for example ``NaN`` from
            an empty CSV cell, or ``None``) are treated as empty.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Replace the em-dash with a space so the words on either side stay separate.
    text = re.sub(r'\s*—\s*', ' ', text)

    # Remove the space before punctuation but keep the punctuation itself.
    text = re.sub(r'\s+([,.])', r'\1', text)

    # Keep only word characters, whitespace and basic sentence punctuation
    # (this also removes straight and curly quotes and apostrophes).
    text = re.sub(r'[^\w\s.,!?]', '', text)

    # Removing characters can leave two spaces side by side; collapse them.
    text = re.sub(r'\s{2,}', ' ', text)

    # Strip leading/trailing whitespace
    return text.strip()

# Clean every annotation and store it back in the same column. The raw text is
# overwritten, so re-running this cell cleans already-cleaned text.
taylor['Combined Annotations'] = taylor['Combined Annotations'].apply(clean_text)

taylor.head()

In [None]:
# Load the pretrained "t5-base" checkpoint: the TensorFlow conditional-generation
# model and its matching tokenizer.
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [None]:
# Display the model summary for the loaded T5 model.
t5model.summary()

In [None]:
# Using one as an example, otherwise we would loop
# Row position 2 of `taylor` is the example used by the summarization cells below.
ARTICLE_TO_SUMMARIZE = taylor.iloc[2]['Combined Annotations']

In [None]:
# Let's determine the min/max length
#######
# For this very specific example
print(len(taylor.iloc[2]['Combined Annotations']))
print(taylor.iloc[2]['Combined Annotations'])
#######
# Otherwise in general
# Calculate the length of each value in the 'Combined Annotations' column
# (len() of a string counts characters, not model tokens).
# This adds an 'annotation_length' column to `taylor`.
taylor['annotation_length'] = taylor['Combined Annotations'].apply(len)

# Calculate the average length
average_length = taylor['annotation_length'].mean()

# Display the result
print(f"Average length of 'Combined Annotations': {average_length}")

In [None]:
# Summarize the example annotation with T5 using beam search.
# The 'summarize: ' prefix is prepended to the text before tokenizing.
PROMPT = 'summarize: '
T5ARTICLE_TO_SUMMARIZE = PROMPT + ARTICLE_TO_SUMMARIZE
# Input is truncated to at most 1024 tokens.
inputs = t5tokenizer(T5ARTICLE_TO_SUMMARIZE, max_length=1024, truncation=True, return_tensors="tf")
# NOTE(review): min_new_tokens=513 forces an output of at least 513 new tokens,
# which may exceed the length of the input being summarized — confirm this is intended.
summary_ids = t5model.generate(inputs["input_ids"],
                               max_new_tokens=2000,
                               min_new_tokens=513,
                               no_repeat_ngram_size=3,
                               num_beams=5,
                               early_stopping=True
)
# `annotation_summary` is read again by the combined-summary cell below.
annotation_summary = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
pprint(annotation_summary[0], compact=True)

In [None]:
# Compare to annotation
# Pretty-print the source annotation text that was summarized above.
pprint(taylor.iloc[2]['Combined Annotations'], compact=True)

In [None]:
# Let's use the lyrics
# Same example row (position 2), but summarizing the 'Lyrics' column.
input_text = "summarize: " + taylor.iloc[2]['Lyrics']

# Tokenize the input text (truncated to at most 512 tokens)
input_ids = t5tokenizer.encode(input_text, return_tensors='tf', max_length=512, truncation=True)

# Generate the summary
summary_ids = t5model.generate(input_ids, max_length=500, num_beams=2, length_penalty=2.0, early_stopping=True)

# `lyrics_summary` is read again by the combined-summary cell below.
lyrics_summary = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

pprint(lyrics_summary[0], compact=True)

In [None]:
# Combine both the lyrics summary and annotation summary and see what happens
# Requires `annotation_summary` and `lyrics_summary` from the two cells above.
combined_summary = annotation_summary[0] + " " + lyrics_summary[0]
pprint(combined_summary, compact=True)

input_text = "summarize: " + combined_summary

# Tokenize the input text (truncated to at most 512 tokens)
input_ids = t5tokenizer.encode(input_text, return_tensors='tf', max_length=512, truncation=True)

# Generate the summary (generation is capped with max_length=50)
summary_ids = t5model.generate(input_ids, max_length=50, num_beams=2, length_penalty=2.0, early_stopping=True)

lyrics_and_annotation_summary = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

pprint(lyrics_and_annotation_summary[0], compact=True)

In [None]:
# What if we combined the raw lyrics and annotations?
# Lyrics come first in the concatenation, followed by the annotations.
input_lyrics_and_annotations = taylor.iloc[2]['Lyrics'] + ' ' + taylor.iloc[2]['Combined Annotations']
input_text = "summarize: " + input_lyrics_and_annotations

# Tokenize the input text
# NOTE(review): input is truncated to 512 tokens; because the lyrics are placed
# first, the annotations may be partly or wholly cut off — confirm.
input_ids = t5tokenizer.encode(input_text, return_tensors='tf', max_length=512, truncation=True)

# Generate the summary
summary_ids = t5model.generate(input_ids, max_length=50, num_beams=2, length_penalty=2.0, early_stopping=True)

# This overwrites `lyrics_and_annotation_summary` from the previous cell.
lyrics_and_annotation_summary = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

pprint(lyrics_and_annotation_summary[0], compact=True)

In [None]:
#In order to not consume all of the memory available in Colab we'll free up the memory we're using for these large language models
# Remove the T5 model and tokenizer names from the notebook namespace; cells
# above that use them cannot be re-run after this without reloading.
del t5model
del t5tokenizer

Trying the Pegasus model that produces shorter summaries (`google/pegasus-xsum`) here.

In [None]:
# Load the "google/pegasus-xsum" checkpoint: TensorFlow model and its tokenizer.
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration

pmodel = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
ptokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

In [None]:
# Tokenize the example annotation (no task prefix is added here) and summarize
# it with generate()'s default settings.
# NOTE(review): max_length=2000 — confirm this is within the input length
# pegasus-xsum supports.
# This reassigns the notebook-level name `inputs` used by the T5 cell above.
inputs = ptokenizer(ARTICLE_TO_SUMMARIZE, max_length=2000, truncation=True, return_tensors="tf")
# Generate Summary
summary_ids = pmodel.generate(inputs["input_ids"]
)
pprint(ptokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0], compact=True)

In [None]:
# Remove the pegasus-xsum model and tokenizer names from the notebook namespace.
del pmodel
del ptokenizer

Trying the Pegasus model that produces longer summaries (`google/pegasus-cnn_dailymail`) here.

In [None]:
# Load the "google/pegasus-cnn_dailymail" checkpoint and tokenize the example
# annotation with that checkpoint's own tokenizer.
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration

# from_pt=True is passed to both loaders.
# NOTE(review): confirm from_pt is meaningful for the tokenizer as well as the model.
cnnmodel = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail", from_pt=True)
cnntokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail", from_pt=True)

# Input is truncated to at most 1024 tokens.
cnninputs = cnntokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, truncation=True, return_tensors="tf")

In [None]:
# Generate Summary
# Use `cnninputs`, the encoding produced by this model's own tokenizer in the
# previous cell. `inputs` belongs to the pegasus-xsum cell (different tokenizer
# instance and truncation length) and only exists if that cell was run.
summary_ids = cnnmodel.generate(cnninputs["input_ids"])

pprint(cnntokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0], compact=True)

In [None]:
# Character count
# Good Grammar
# References song lyrics
# Complete sentences

