# [SquidTV](https://squidtv.net) scraper

This notebook semi-automates the preparation of AmahTV channel lists, by scraping the squidtv.net web site.

In [121]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import parse_qs, urljoin, urlparse

def fetch_page(url):
  response = requests.get(url)
  response.encoding = response.apparent_encoding
  return BeautifulSoup(response.text, 'html.parser')

def video_id_from_livestream_url(livestream_url):
  livestream_soup = fetch_page(livestream_url)
  link = livestream_soup.find('link', {"rel":"canonical"})
  if not link:
    return None
  video_url = link['href']
  if not video_url:
    return None
  video_id = parse_qs(urlparse(video_url).query)['v'][0]
  return video_id

def video_id_from_channel_page_url(channel_page_url):
  channel_page = fetch_page(channel_page_url)
  div = channel_page.find('div', {"class":"yt01"})
  if not div:
    return None
  iframe = div.find('iframe')
  if not iframe:
    return None
  livestream_url = iframe['src']
  if not livestream_url:
    return None
  return video_id_from_livestream_url(livestream_url)

def unwanted_generas_in_genres(genres, unwanted_genres):
  found_unwanted_genres = []
  for g in genres:
    g = g.lower()
    for u in unwanted_genres:
      u = u.lower()
      if u in g:
        found_unwanted_genres.append(g)
  return found_unwanted_genres

def channels_from_country_page_url(country_page_url, unwanted_channels = [], unwanted_genres = []):
  # Use a dict rather than a set because dicts preserve insertion order
  channels = dict()
  for channel_a in fetch_page(country_page_url).find_all('a',{"class": "chlink01"}):
      channel_name = channel_a.text
      if channel_name in unwanted_channels:
        print(f'rejecting {channel_name} due to unwanted channel name')
        continue
      genre_text = channel_a.parent.parent.find('td',{'class':'hpb-cnt-tb-sqtv04-genre'}).text
      genres = tuple([genre.strip() for genre in genre_text.split(',')])
      channel_unwanted_genres = unwanted_generas_in_genres(genres, unwanted_genres)
      if channel_unwanted_genres:
        print(f'rejecting {channel_name} due to unwanted genres {channel_unwanted_genres}')
        continue
      video_id = video_id_from_channel_page_url(urljoin(country_page_url, channel_a['href']))
      if not video_id:
        print(f'rejecting {channel_name} due to not having a video_id.')
        continue
      channels[video_id] = (channel_name, genres)
  return [(channel_name, video_id, genres) for video_id, (channel_name, genres) in channels.items()]


In [125]:
# NTDAPTV is a religious channel.
# 'CTi Asia' seems to show the same content as 'CTi News'
taiwan_channels = channels_from_country_page_url('https://www.squidtv.net/asia/taiwan/',
  unwanted_channels = ['NTDAPTV','CTi Asia'],
  unwanted_genres = ['Finance', 'Shopping', 'Legislature', 'Religion'])


rejecting TTV News Channel due to not having a video_id.
rejecting CTi Asia due to unwanted channel name
rejecting Taiwan Indigenous TV due to not having a video_id.
rejecting TVBS News due to not having a video_id.
rejecting Hakka TV due to not having a video_id.
rejecting EBC News due to not having a video_id.
rejecting NTDAPTV due to unwanted channel name
rejecting UDN TV due to not having a video_id.
rejecting EBC Financial News due to unwanted genres ['news \n              (finance']
rejecting SET iNEWS due to unwanted genres ['finance)']
rejecting EFTV due to unwanted genres ['news \n              (finance)']
rejecting SBN due to unwanted genres ['news \n              (finance)']
rejecting TLTV due to unwanted genres ['shopping']
rejecting Sinda TV due to not having a video_id.
rejecting EBC YOYO TV due to not having a video_id.
rejecting Parliamentary TV due to unwanted genres ['legislature']
rejecting MOMO TV due to unwanted genres ['shopping']
rejecting ET Mall due to unwanted

In [126]:
for name, id, _ in taiwan_channels:
  print(f"Video(name: \"{name}\", id: \"{id}\"),")


Video(name: "FTV", id: "XGEmg3vhrzU"),
Video(name: "CTS News", id: "wM0g8EoUZ_E"),
Video(name: "SET News", id: "FoBfXvlOR6I"),
Video(name: "CTV News Channel", id: "TCnaIE_SAtM"),
Video(name: "CTi News", id: "lu_BJKxqGnk"),
Video(name: "Global News", id: "Fpsi2cmXGMs"),
Video(name: "SJTV", id: "VD8dVG1DF1s"),
Video(name: "SDTV 1", id: "bmtU_o1Mf9E"),
