# Load Sámi Whisper model

> "Also basic pieces for scraping Sveriges Radio pages"

- toc: false
- branch: master
- badges: true
- comments: true
- categories: [sami, whisper, sverigesradio, dt2112]

In [2]:
from transformers import pipeline
import torch

In [3]:
# Hugging Face model id; passed as `model=` when the ASR pipeline is created.
MODEL = "NbAiLab/whisper-large-sme"
# Language code intended for the Whisper decoder prompt.
# NOTE(review): "fi" (Finnish) rather than a Sámi code — presumably because the
# Whisper tokenizer has no Sámi language token; confirm against the model card.
LANG = "fi"

In [4]:
# Use the first CUDA device (index 0) when one is available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

In [None]:
# Build the speech-recognition pipeline for MODEL on the selected device,
# with chunk_length_s set to 30.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL,
    chunk_length_s=30,
    device=device,
)

In [None]:
# Force the decoder prompt to the configured language and the "transcribe" task.
# The language comes from the LANG constant set in the configuration cell.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=LANG, task="transcribe")

In [2]:
# Article page used as the example input for the page-scraping helpers.
PAGE = "https://sverigesradio.se/artikel/odda-skearru-bitonsami-ludiiguin"
# Player endpoint for one publication; the helpers read the "audioUrl" key from
# the JSON it returns. The id 8580562 appears as both `id` and `publicationid`.
AUDIOJSON = "https://sverigesradio.se/playerajax/audio?id=8580562&type=publication&publicationid=8580562&quality=medium"

In [3]:
import requests
import json
def get_sverigesradio_audio(page, timeout=10.0):
    """Fetch a player JSON document and return its "audioUrl" value.

    Args:
        page: URL of the JSON endpoint to request (e.g. a ``playerajax/audio``
            URL), despite the parameter name.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error.

    Returns:
        The value stored under "audioUrl", or None when the response status
        is not 200, the body is not valid JSON, the JSON is not an object,
        or the key is absent.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # Without a timeout, requests can wait indefinitely on a stalled server.
    req = requests.get(page, timeout=timeout)
    if req.status_code != 200:
        return None
    try:
        data = json.loads(req.text)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; treat a non-JSON body
        # like any other failed lookup.
        return None
    # Only a JSON object can carry the key; lists or strings would otherwise
    # pass the membership test and then fail on indexing.
    if isinstance(data, dict):
        return data.get("audioUrl")
    return None

In [None]:
!wget {get_sverigesradio_audio(AUDIOJSON)}

In [6]:
from bs4 import BeautifulSoup

In [7]:
def get_audio_id_from_page(page, timeout=10.0):
    """Extract the "pageId" value from an article page's metadata script.

    Downloads the page, looks at every ``<script id="gtm-metadata">`` element,
    parses its text as JSON and returns the top-level "pageId" value from the
    first element that has one.

    Args:
        page: URL of the HTML page to fetch.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error.

    Returns:
        The "pageId" value, or None when the response status is not 200 or
        no matching script element yields a top-level "pageId".

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # Without a timeout, requests can wait indefinitely on a stalled server.
    req = requests.get(page, timeout=timeout)
    if req.status_code != 200:
        return None
    soup = BeautifulSoup(req.text, 'html.parser')
    # find_all is the current name; findAll is a deprecated alias.
    for elem in soup.find_all("script", {"id": "gtm-metadata"}):
        # Cheap substring pre-check before attempting to parse.
        if "pageId" not in elem.text:
            continue
        try:
            data = json.loads(elem.text)
        except ValueError:
            # Not valid JSON; try the next candidate element.
            continue
        # The substring check can match a nested key or a value, so confirm
        # the key exists at the top level before indexing.
        if isinstance(data, dict) and "pageId" in data:
            return data["pageId"]
    return None

In [8]:
# Look up the page id for the example article; the recorded output matches the
# id used in AUDIOJSON.
get_audio_id_from_page(PAGE)

'8580562'

In [9]:
def get_audio_from_page(page):
    """Resolve an article page URL to its audio URL.

    Looks up the page id with ``get_audio_id_from_page``, builds the
    ``playerajax/audio`` URL for that id (medium quality) and passes it to
    ``get_sverigesradio_audio``.

    Returns:
        Whatever ``get_sverigesradio_audio`` returns for the built URL, or
        None when no page id could be found.
    """
    publication_id = get_audio_id_from_page(page)
    if publication_id is not None:
        # The same id fills both the `id` and `publicationid` query fields.
        ajax_url = f"https://sverigesradio.se/playerajax/audio?id={publication_id}&type=publication&publicationid={publication_id}&quality=medium"
        return get_sverigesradio_audio(ajax_url)
    return None

In [None]:
# End-to-end check: article page -> page id -> player JSON -> audio URL.
get_audio_from_page(PAGE)