In [None]:
"""
This script sets up and demonstrates two AI pipelines using Hugging Face libraries:
- Text-to-speech (audio generation): uses microsoft/speecht5_tts to synthesize speech from text.
- Text-to-image (image generation): uses stabilityai/sd-turbo to generate images from text.
It is written for Google Colab (it relies on google.colab.userdata), not a plain Jupyter Notebook.
"""
!pip install diffusers transformers accelerate bitsandbytes datasets fsspec==2023.9.2 soundfile

from huggingface_hub import login
from google.colab import userdata
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
from diffusers import AutoPipelineForText2Image
from IPython.display import Audio, display
from PIL import Image

# Authenticate with the Hugging Face Hub.
# The token is read from the Colab secret named 'HF_TOKEN' so it never
# appears in the notebook itself.
hf_token = userdata.get('HF_TOKEN')
login(
    token=hf_token,
    add_to_git_credential=True,  # also store the token in the git credential helper
)

In [None]:
# Audio generation: synthesize a short utterance with SpeechT5 and play it back.

# Use the GPU when one is attached, otherwise fall back to CPU so the cell
# still runs on a CPU-only runtime instead of failing on a hard-coded 'cuda'.
device = "cuda" if torch.cuda.is_available() else "cpu"
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)

# The x-vector stored at index 7306 of the validation split is handed to the
# model as the speaker embedding; pick another index for a different entry.
# unsqueeze(0) adds a leading batch dimension to the embedding tensor.
embeddings_dataset = load_dataset("matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

speech = synthesiser("Why hello there", forward_params={"speaker_embeddings": speaker_embedding})

# Persist the waveform at the sampling rate reported by the pipeline.
sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])

# Last expression of the cell: renders an inline audio player for the file.
Audio("speech.wav")

In [None]:
# Image generation: produce one image from a text prompt with SD-Turbo.

# Use the GPU when one is attached, otherwise fall back to CPU so the cell
# still runs on a CPU-only runtime instead of failing on a hard-coded "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is used on GPU as before; on CPU the fp16 checkpoint is
# loaded and cast to float32.
dtype = torch.float16 if device == "cuda" else torch.float32

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sd-turbo",
    torch_dtype=dtype,
    variant="fp16",
).to(device)

prompt = "A hundred men fighting one silverback gorilla"
# Single denoising step with guidance disabled (guidance_scale=0.0).
image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]

display(image)