# [SquidTV](https://squidtv.net) scraper

This notebook semi-automates the preparation of AmahTV channel lists, by scraping the squidtv.net web site.

In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import parse_qs, urljoin, urlparse

def fetch_page(url):
  response = requests.get(url)
  response.encoding = response.apparent_encoding
  return BeautifulSoup(response.text, 'html.parser')

def video_id_from_livestream_url(livestream_url):
  livestream_soup = fetch_page(livestream_url)
  link = livestream_soup.find('link', {"rel":"canonical"})
  if not link:
    return None
  video_url = link['href']
  if not video_url:
    return None
  video_id = parse_qs(urlparse(video_url).query)['v'][0]
  return video_id

def video_id_from_channel_page_url(channel_page_url):
  channel_page = fetch_page(channel_page_url)
  div = channel_page.find('div', {"class":"yt01"})
  if not div:
    return None
  iframe = div.find('iframe')
  if not iframe:
    return None
  livestream_url = iframe['src']
  if not livestream_url:
    return None
  return video_id_from_livestream_url(livestream_url)

def unwanted_generas_in_genres(genres, unwanted_genres):
  found_unwanted_genres = []
  for g in genres:
    g = g.lower()
    for u in unwanted_genres:
      u = u.lower()
      if u in g:
        found_unwanted_genres.append(g)
  return found_unwanted_genres

def channels_from_country_page_url(country_page_url, unwanted_channels = [], unwanted_genres = []):
  # Use a dict rather than a set because dicts preserve insertion order
  channels = dict()
  for channel_a in fetch_page(country_page_url).find_all('a',{"class": "chlink01"}):
      channel_name = channel_a.text
      if channel_name in unwanted_channels:
        print(f'rejecting {channel_name} due to unwanted channel name')
        continue
      genre_text = channel_a.parent.parent.find('td',{'class':'hpb-cnt-tb-sqtv04-genre'}).text
      genres = tuple([genre.strip() for genre in genre_text.split(',')])
      channel_unwanted_genres = unwanted_generas_in_genres(genres, unwanted_genres)
      if channel_unwanted_genres:
        print(f'rejecting {channel_name} due to unwanted genres {channel_unwanted_genres}')
        continue
      video_id = video_id_from_channel_page_url(urljoin(country_page_url, channel_a['href']))
      if not video_id:
        print(f'rejecting {channel_name} due to not having a video_id.')
        continue
      channels[video_id] = (channel_name, genres)
  return [(channel_name, video_id, genres) for video_id, (channel_name, genres) in channels.items()]

def print_swift_source_for_channels(channels):
  for name, id, genres in channels:
    print(f"Video(name: \"{name}\", id: \"{id}\"), // {','.join(genres)}")


In [6]:
# NTDAPTV is a religious channel.
# 'CTi Asia' seems to show the same content as 'CTi News'
# 'SJTV' is low quality (lots of shopping)
# 'SDTV 1' is low quality (lots of shopping)
print_swift_source_for_channels(channels_from_country_page_url('https://www.squidtv.net/asia/taiwan/',
  unwanted_channels = ['NTDAPTV','CTi Asia', 'SDTV 1', 'SJTV'],
  unwanted_genres = ['Finance', 'Shopping', 'Legislature', 'Religion']))


rejecting CTS News due to not having a video_id.
rejecting TTV News Channel due to not having a video_id.
rejecting CTI News due to not having a video_id.
rejecting CTI Asia due to not having a video_id.
rejecting Taiwan Indigenous TV due to not having a video_id.
rejecting TVBS News due to not having a video_id.
rejecting mnews due to not having a video_id.
rejecting EBC News due to not having a video_id.
rejecting SET News due to not having a video_id.
rejecting Hakka TV due to not having a video_id.
rejecting Global News due to not having a video_id.
rejecting NTDAPTV due to unwanted channel name
rejecting EBC Financial News due to unwanted genres ['news \n              (finance']
rejecting SET iNEWS due to unwanted genres ['finance)']
rejecting EFTV due to unwanted genres ['news \n              (finance)']
rejecting SBN due to unwanted genres ['news \n              (finance)']
rejecting Sinda TV due to not having a video_id.
rejecting SDTV 1 due to unwanted channel name
rejecting E

In [7]:
print_swift_source_for_channels(channels_from_country_page_url('https://www.squidtv.net/asia/japan/',
  unwanted_channels = ["Weathernews LiVE"],
  unwanted_genres = ['Finance', 'Shopping', 'Legislature', 'Religion']))
japan_channels

rejecting NHK WORLD-JAPAN due to not having a video_id.
rejecting TBS NEWS DIG due to not having a video_id.
rejecting Nittele News 24 due to not having a video_id.
rejecting Nippon TV News 24 Japan due to not having a video_id.
rejecting Weathernews LiVE due to unwanted channel name
rejecting ABEMA due to not having a video_id.
rejecting Asahi Shimbun Digital due to not having a video_id.
rejecting HTB Hokkaido News 24 due to not having a video_id.
rejecting STV News Hokkaido due to not having a video_id.
rejecting HBC Hokkaido News 24 due to not having a video_id.
rejecting Niigata News NST due to not having a video_id.
rejecting UTY News 24h due to not having a video_id.
rejecting TV Shizuoka News due to not having a video_id.
rejecting SATV News 24H due to not having a video_id.
rejecting SBS NEWS due to not having a video_id.
rejecting TV Kanazawa due to not having a video_id.
rejecting Kansai News 24 (ABC TV) due to not having a video_id.
rejecting Hiroshima News 24 due to not ha

NameError: name 'japan_channels' is not defined