<a href="https://colab.research.google.com/github/leman-cap13/my_projects/blob/main/Generate_lyrics_and_spoken_audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model

In [None]:
!pip install transformers
!pip install TTS
!pip install torchaudio

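transformers: Hugging Face models for lyrics generation (FLAN-T5 in this section, GPT-2 in the fine-tuning section)

TTS: From Coqui (text-to-speech synthesis)

torchaudio: Audio I/O library for PyTorch (playback in Colab is done below with IPython.display.Audio)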



In [None]:
# Load the pretrained model and tokenizer used for the first lyrics demo.
# NOTE: although the GPT-2 classes are imported, the active checkpoint is
# "google/flan-t5-base", loaded through the Auto* seq2seq classes.
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch


model_name = "google/flan-t5-base"
# model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.eval()  # switch to inference mode (no training-time behavior such as dropout)

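What is GPT2LMHeadModel?
(Note: the cell above loads FLAN-T5 through AutoModelForSeq2SeqLM; GPT2LMHeadModel is the class used in the fine-tuning section further down.)
It’s the GPT-2 architecture model with a language modeling (LM) head on top.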

The LM head is specifically designed for generating text by predicting the next token in a sequence.

This model is pre-trained on large amounts of text data to learn how to produce coherent, fluent language when given a prompt.

Why use GPT2LMHeadModel for your lyrics generation?
Text Generation Focus:
The LM head enables the model to predict next words, so it can generate new text (lyrics) from a starting prompt.

Pretrained, Large-Scale:
GPT-2 is trained on diverse internet text, including creative writing styles, so it can generate song-like language even without fine-tuning.

Easy to Use:
Hugging Face provides this model ready for generation tasks — no extra modification needed.

Control:
You can control generation parameters (temperature, top-k, top-p) for creative or conservative outputs.

In [None]:
# Prompt the model for lyrics, sampling again until the text has enough lines
# or the retry budget is used up.
prompt = "Write a romantic pop song about heartbreak. Make it emotional and poetic."

inputs = tokenizer(prompt, return_tensors="pt")


min_lines = 10  # desired minimum number of non-empty lines
max_tries = 5   # upper bound on sampling attempts
lyrics = ""

for attempt in range(max_tries):
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=250,
        min_length=100,  # force longer outputs
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=3,
    )

    # Decode this attempt so it can be judged; previously the loop always ran
    # max_tries times and never looked at min_lines.
    lyrics = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    line_count = sum(1 for line in lyrics.splitlines() if line.strip())
    if line_count >= min_lines:
        break  # good enough: keep `outputs` / `lyrics` from this attempt



max_length=250: generate at most 250 output tokens (min_length=100 forces at least 100).

do_sample=True: use sampling instead of greedy decoding, allowing for creative, varied outputs.

temperature=0.7: controls randomness — lower is more conservative, higher is more creative.



In [None]:
# Convert the first generated sequence (from the generation cell above) back
# into text, leaving out special tokens.
generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Extract generated lyrics
# The decoded text is used as-is (only surrounding whitespace is trimmed);
# the prompt-slicing variant below is kept for reference but disabled.
# lyrics = generated[len(prompt):].strip()
lyrics = generated.strip()
print("🎤 Generated Lyrics:\n", lyrics)

In [None]:
# Turn the generated lyrics into spoken audio with Coqui TTS.
from TTS.api import TTS

# Load Coqui TTS model (lightweight, CPU-friendly)
# gpu=False keeps synthesis on the CPU; progress_bar=False silences the progress bar.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

# Save spoken audio to file
# The relative path places "lyrics.wav" in the current working directory.
tts.tts_to_file(text=lyrics, file_path="lyrics.wav")


In [None]:
import IPython.display as ipd

# Play the audio file
# As the last expression of the cell, the Audio object is rendered by the
# notebook as an inline player for "lyrics.wav".
ipd.Audio("lyrics.wav")


# Fine-tuned model

In [None]:
# Install required packages
!pip install transformers datasets

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch

In [None]:
# Open Colab's upload dialog. The next cell copies "kaggle.json" from the
# working directory, so that is the file expected to be uploaded here.
from google.colab import files
files.upload()

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d deepshah16/song-lyrics-dataset

In [None]:
import zipfile

# Unpack the downloaded Kaggle archive into the current working directory.
# The context manager closes the archive handle even if extraction fails
# (the handle was previously left open).
with zipfile.ZipFile('/content/song-lyrics-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# Step 1: Load the lyrics CSV extracted from the Kaggle archive
df = pd.read_csv("/content/csv/BillieEilish.csv")
# dropna returns a new frame, so the result must be assigned back. The previous
# `df== ...` was a comparison whose result was discarded, leaving rows with
# missing lyrics in place.
df = df.dropna(subset=["Lyric"])  # Drop rows without lyrics
df

In [None]:
# Step 2: Convert to HuggingFace Dataset and split
dataset = Dataset.from_pandas(df)
# Hold out 10% of the rows as the "test" split; no seed is passed, so the
# split can differ between runs.
dataset = dataset.train_test_split(test_size=0.1)

In [None]:
# Show the resulting train/test split as a quick sanity check.
print(f"Dataset split: {dataset}")

In [None]:
# Step 3: Load tokenizer and set pad token
# Note: this rebinds `tokenizer`, replacing the FLAN-T5 tokenizer loaded earlier.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
# Reuse EOS as the padding token so padding="max_length" can be used below.
tokenizer.pad_token = tokenizer.eos_token  # GPT2 does not have a pad token by default

In [None]:
# Step 4: Tokenize the lyrics
def tokenize_function(examples):
    """Tokenize lyrics to fixed-length inputs and build causal-LM labels.

    Args:
        examples: Mapping with a ``"Lyric"`` entry holding either a single
            string or a list of strings (the latter when mapped with
            ``batched=True``).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry. Labels mirror
        ``input_ids``, except that padded positions are set to ``-100`` so
        the language-modeling loss skips them.
    """
    tokens = tokenizer(
        examples["Lyric"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )

    def _mask_padding(ids, mask):
        # -100 is the label value the loss ignores. Without it, every padded
        # position (pad == EOS here) would count as a training target.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens["input_ids"]
    attention = tokens["attention_mask"]
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        tokens["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)
        ]
    else:
        # Single-example call: a flat id list.
        tokens["labels"] = _mask_padding(input_ids, attention)
    return tokens

In [None]:
# Apply tokenize_function to both splits; batched=True passes lists of
# examples to the function instead of one example at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Step 5: Load GPT-2 medium model and resize embeddings
# Note: this rebinds `model`, replacing the FLAN-T5 model loaded earlier.
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
# Match the embedding matrix to the tokenizer's vocabulary size.
# NOTE(review): no new tokens were added (pad reuses EOS), so this presumably
# leaves the size unchanged — confirm.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Step 6: Define training arguments
# Checkpoints go to ./gpt2-lyrics-finetuned; evaluation and checkpointing both
# happen once per epoch.
training_args = TrainingArguments(
    output_dir="./gpt2-lyrics-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # NOTE(review): `eval_strategy` is the newer spelling of
    # `evaluation_strategy` — confirm the installed transformers accepts it.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    # NOTE(review): with a small dataset the whole run may have fewer than 500
    # optimizer steps, so warmup might never finish — check against dataset size.
    warmup_steps=500,
    save_total_limit=2,  # keep at most two checkpoints on disk
    fp16=torch.cuda.is_available(),  # Use mixed precision if possible
)

In [None]:
# Step 7: Initialize Trainer
# Wires the model, the training arguments and the tokenized train/test splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=` — confirm for the installed version.
    tokenizer=tokenizer,
)

In [None]:
# Step 8: Train the model
# Runs the fine-tuning loop configured by `training_args`; the loss-plot cell
# later reads its log from trainer.state.log_history.
trainer.train()

In [None]:
# Step 9: Save the fine-tuned model and tokenizer
# Both are written to the same directory so they can be reloaded together
# with from_pretrained("./gpt2-lyrics-finetuned").
model.save_pretrained("./gpt2-lyrics-finetuned")
tokenizer.save_pretrained("./gpt2-lyrics-finetuned")

In [None]:
# Step 10: Reload the fine-tuned model from disk and move it to the available device
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Prefer the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-lyrics-finetuned").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-lyrics-finetuned")
# Re-apply the EOS-as-pad setting on the reloaded tokenizer.
tokenizer.pad_token = tokenizer.eos_token  # necessary fix


In [None]:
# Step 11: Generate lyrics with the fine-tuned model
prompt = "A romantic song about heartbreak:\n"
inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=200,            # allow longer output
    min_length=100,            # enforce minimum length
    do_sample=True,
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    no_repeat_ngram_size=3,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id,    # pad token fix for GPT2
    eos_token_id=tokenizer.eos_token_id,    # stop generation at EOS token
    num_return_sequences=1,
)

print("Raw output token IDs:\n", output[0])

# Remove the prompt at token level: the output starts with the prompt's own
# token IDs, so skipping that many tokens isolates the continuation. Slicing
# the decoded string by len(prompt) only works if decoding reproduces the
# prompt character-for-character.
prompt_length = inputs["input_ids"].shape[1]
lyrics = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

print("\n🎤 Generated Lyrics:\n")
print(lyrics if lyrics else "[No lyrics generated]")


In [None]:
!pip install TTS
!pip install torchaudio

In [None]:
# Speak the lyrics produced by the fine-tuned model with Coqui TTS.
from TTS.api import TTS

# Load Coqui TTS model (lightweight, CPU-friendly)
# gpu=False keeps synthesis on the CPU.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

# Save spoken audio to file
# Uses the same "lyrics.wav" name as the earlier TTS cell, so that file is overwritten.
tts.tts_to_file(text=lyrics, file_path="lyrics.wav")

file_path="lyrics.wav" is just the filename (and optionally path) where your generated audio will be saved.



In [None]:
import IPython.display as ipd

# Play the audio file
# The Audio object is the cell's last expression, so the notebook renders it
# as an inline player for "lyrics.wav".
ipd.Audio("lyrics.wav")

In [None]:
import matplotlib.pyplot as plt

# Plot training and evaluation loss from the Trainer's in-memory log history.
# Training loss is logged every `logging_steps` and evaluation loss once per
# epoch, so the two series have different lengths. Each point is therefore
# plotted at the global step where it was recorded, not at its list index
# (which put the two curves on unrelated x positions).

train_steps, train_losses = [], []
eval_steps, eval_losses = [], []

for log in trainer.state.log_history:
    step = log.get("step")
    if step is None:
        # Without a step the entry cannot be placed on the shared x-axis.
        continue
    if "loss" in log:
        train_steps.append(step)
        train_losses.append(log["loss"])
    if "eval_loss" in log:
        eval_steps.append(step)
        eval_losses.append(log["eval_loss"])

plt.plot(train_steps, train_losses, label="Train Loss")
# Markers keep the eval curve visible even when it has only a few points.
plt.plot(eval_steps, eval_losses, marker="o", label="Eval Loss")
plt.xlabel("Training Step")
plt.ylabel("Loss")
plt.legend()
plt.show()


In [None]:
!pip install bertviz

In [None]:
# Visualize the attention heads of base GPT-2 on a sample sentence with bertviz.
from transformers import GPT2Tokenizer, GPT2Model
from bertviz import head_view

# Load model with output_attentions=True
# Note: this rebinds `model` and `tokenizer` to the base "gpt2" checkpoint,
# replacing the fine-tuned objects loaded earlier in the notebook.
model = GPT2Model.from_pretrained("gpt2", output_attentions=True)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

text = "Here is some sample text for attention visualization."
inputs = tokenizer(text, return_tensors="pt")

# Single forward pass; the attention weights come back on the output object.
outputs = model(**inputs)

# Get attentions from outputs
attentions = outputs.attentions  # tuple of tensors

# Visualize attention heads with bertviz
# The token strings are produced by tokenizing the same text again.
head_view(attentions, tokenizer.tokenize(text))



In [None]:
# Draw a word cloud of the generated lyrics.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

text = lyrics  # your generated lyrics string

# Build an 800x400 cloud on a white background from the lyrics text.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Show the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
from textblob import TextBlob

# Print each lyric line in a color that reflects its TextBlob sentiment
# polarity: green for positive, red for negative, gray for neutral.

# Map sentiment to color codes (these are ANSI 256-color codes)
color_map = {
    'green': 82,   # bright green
    'red': 196,    # bright red
    'gray': 244    # gray
}

lines = lyrics.split('\n')
for line in lines:
    polarity = TextBlob(line).sentiment.polarity
    # The old code looked up a non-existent 'blue' key (KeyError on any
    # positive line) and had the negative/neutral colors swapped.
    if polarity > 0.2:
        color_code = color_map['green']   # clearly positive
    elif polarity < -0.2:
        color_code = color_map['red']     # clearly negative
    else:
        color_code = color_map['gray']    # neutral / weak sentiment
    # 38;5;<n> selects a 256-color foreground; 0 resets the terminal style.
    print(f"\033[38;5;{color_code}m{line}\033[0m")


In [None]:
import matplotlib.pyplot as plt
from textblob import TextBlob

import re

# Fixed sample: one unbroken line of previously generated lyrics.
# Note that this rebinds `lyrics` for every cell that runs afterwards.
lyrics = "'Cause it's cold in my chest, I'm feeling like an animal falling through the air when you call me back home again , don't worry babe we'll be right there waiting to tell ya what to do i know that if they're gonna hang us up then maybe let a rope run them down before hanging 'em all and burning their bodies off without ever saying goodbye oh yeah love is nothing but one more minute from this moment on so stay warm baby"

# Break the text into pseudo-lines at commas and at the whole words "and" / "so".
rough_lines = re.compile(r',|\band\b|\bso\b').split(lyrics)

# Trim each fragment, skip the ones that end up empty, and capitalize the rest.
lines = []
for fragment in rough_lines:
    trimmed = fragment.strip()
    if trimmed:
        lines.append(trimmed.capitalize())

# One polarity score per pseudo-line.
sentiments = []
for entry in lines:
    sentiments.append(TextBlob(entry).sentiment.polarity)

# Line chart of polarity against pseudo-line position.
plt.figure(figsize=(10, 4))
plt.plot(sentiments, marker='o', linestyle='-', color='teal')
plt.title("Sentiment Progression Over Raw Generated Lyrics")
plt.xlabel("Line Number")
plt.ylabel("Sentiment Polarity")
plt.grid(True)
plt.show()

# Numbered listing of every pseudo-line together with its score.
for i, (line, score) in enumerate(zip(lines, sentiments)):
    print(f"{i+1:02d}. ({score:.2f}) {line}")


In [None]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Render the current `lyrics` string as a text-only matplotlib figure.
lines = lyrics.split('\n')

# Size only this figure (half an inch of height per line). Writing the size
# into the global rcParams would silently change every later plot too.
fig, ax = plt.subplots(figsize=(8, len(lines) * 0.5))

# Hide axes
ax.axis('off')

# Combine lines back into a single string for rendering
text = "\n".join(lines)

# Add text to the figure, centered in axes coordinates
ax.text(0.5, 0.5, text, ha='center', va='center', fontsize=14, wrap=True, family='monospace')

# Show the plot
plt.title("🎵 Generated Lyrics", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
from PIL import Image, ImageDraw, ImageFont
import textwrap
import IPython.display as display

# Render the lyrics as a simple white-on-black text poster, save it as a PNG
# and show it inline.

# Your generated lyrics
text = lyrics.strip()

# Settings
width = 800
padding = 50
background_color = "black"
text_color = "white"
font_size = 28
line_spacing = 10

# Load a font (fallback to default if font not found)
try:
    font = ImageFont.truetype("arial.ttf", font_size)
except OSError:
    # truetype raises OSError when the font file cannot be read. Catching only
    # that keeps unrelated errors (and KeyboardInterrupt) visible.
    font = ImageFont.load_default()

# Wrap text to at most 40 characters per line
wrapper = textwrap.TextWrapper(width=40)
lines = wrapper.wrap(text=text)

# Estimate height: top and bottom padding plus one row per wrapped line
height = padding * 2 + (font_size + line_spacing) * len(lines)

# Create image
img = Image.new("RGB", (width, height), color=background_color)
draw = ImageDraw.Draw(img)

# Draw each line, moving down one row at a time
y_text = padding
for line in lines:
    draw.text((padding, y_text), line, font=font, fill=text_color)
    y_text += font_size + line_spacing

# Save and display
img_path = "lyrics_poster.png"
img.save(img_path)

# Display in Colab
display.display(img)


In [None]:
# Second poster variant: smaller black-on-pink rendering of the same `text`.
# Relies on Image, ImageDraw, ImageFont, textwrap and `text` from the previous cell.
import IPython.display as display

# Settings
width = 600
padding = 10
background_color = "pink"
text_color = "black"
font_size = 16
line_spacing = 8

try:
    font = ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Italic.ttf", font_size)
except OSError:
    # truetype raises OSError when the font file cannot be read. Catching only
    # that keeps unrelated errors (and KeyboardInterrupt) visible.
    font = ImageFont.load_default()

# Wrap text to at most 40 characters per line
wrapper = textwrap.TextWrapper(width=40)
lines = wrapper.wrap(text=text)

# Estimate height: top and bottom padding plus one row per wrapped line
height = padding * 2 + (font_size + line_spacing) * len(lines)

img = Image.new("RGB", (width, height), color=background_color)
draw = ImageDraw.Draw(img)

# Draw each line, moving down one row at a time
y_text = padding
for line in lines:
    draw.text((padding, y_text), line, font=font, fill=text_color)
    y_text += font_size + line_spacing

# Same file name as the first poster cell, so that image is overwritten.
img_path = "lyrics_poster.png"
img.save(img_path)

display.display(img)


In [None]:
import os

# Find the first existing system font directory and list every .ttf file in it.
possible_folders = [
    "/usr/share/fonts",
    "/usr/local/share/fonts",
    os.path.expanduser("~/.fonts"),
]

found = False
for folder in possible_folders:
    if os.path.exists(folder):
        print(f"Fonts qovluğu tapıldı: {folder}")
        print("İçindəkilər:")
        # Loop names are chosen so they do not overwrite the `files` module
        # imported from google.colab earlier; the unused directory list is
        # bound to a throwaway name.
        for root, _dirs, filenames in os.walk(folder):
            for filename in filenames:
                if filename.lower().endswith(".ttf"):
                    print(os.path.join(root, filename))
        found = True
        break  # only the first existing folder is listed

if not found:
    print("Heç bir fonts qovluğu tapılmadı.")


