# Web Scraping Notes
1. Scraping headlines about the coronavirus in ASEAN nations
2. How to display thumbnails and links on the web dashboard
3. URLs of Southeast Asian news YouTube channels
 - Channel News Asia: https://www.youtube.com/user/channelnewsasia/videos
 - The Straits Times: https://www.youtube.com/user/StraitsTimesOnline/videos
 - Today Online: https://www.youtube.com/user/TODAYdigital/videos
 - The Star Online: https://www.youtube.com/user/thestaronline/videos
 - The Malaysian Insight: https://www.youtube.com/user/incitemytv/videos
 - Bangkok Post: https://www.youtube.com/user/bangkokpostvideos/videos
 - Rappler: https://www.youtube.com/user/rapplerdotcom/videos
 - The Jakarta Post: https://www.youtube.com/channel/UC2zhLSPeHaH7fFBsRLf2Z0w/videos
 - Jakarta Globe: https://www.youtube.com/user/beritasatuenglish/videos
 - ANC: https://www.youtube.com/user/ANCalerts/videos
 - NBT World: https://www.youtube.com/channel/UCprW3qy6P2AU_nG-EwS4aRg/videos
 - Vietnam News Agency: https://www.youtube.com/channel/UCN9em1FY03nO7tMV4D2lb-g/videos
 - The Thaiger: https://www.youtube.com/user/PGTVPhuket/videos

In [None]:
# Create a dictionary of urls
_news_channels_ = [{'base': 'Singapore',
                  'channel':'Channel News Asia',
                  'yturl': 'https://www.youtube.com/user/channelnewsasia/videos',
                  'img-src': 'https://yt3.ggpht.com/a/AATXAJyNO1OasteohNJvuwpHJEV6-wN9nt1rofR0Rg=s100-c-k-c0xffffffff-no-rj-mo'
                 }, 
                 {'base': 'Singapore',
                  'channel':'The Strait Times',
                  'yturl': 'https://www.youtube.com/user/StraitsTimesOnline/videos'},
                 {'base': 'Singapore',
                  'channel':'Today Online',
                  'yturl': 'https://www.youtube.com/user/TODAYdigital/videos'},
                 {'base': 'Malaysia',
                  'channel':'The Star Online',
                  'yturl': 'https://www.youtube.com/user/thestaronline/videos'},
                 {'base': 'Malaysia',
                  'channel':'The Malaysian Insight',
                  'yturl': 'https://www.youtube.com/user/incitemytv/videos'},
                 {'base': 'Thailand',
                  'channel':'Bangkok Post',
                  'yturl': 'https://www.youtube.com/user/bangkokpostvideos/videos'},
                 {'base': 'Philippines',
                  'channel':'Rappler',
                  'yturl': 'https://www.youtube.com/user/rapplerdotcom/videos'},
                 {'base': 'Indonesia',
                  'channel':'The Jakarta Post',
                  'yturl': 'https://www.youtube.com/channel/UC2zhLSPeHaH7fFBsRLf2Z0w/videos'},
                 {'base': 'Philippines',
                  'channel':'ANC',
                  'yturl':'https://www.youtube.com/user/ANCalerts/videos'},
                 {'base': 'Thailand',
                  'channel': 'NBT World',
                  'yturl': 'https://www.youtube.com/channel/UCprW3qy6P2AU_nG-EwS4aRg/videos'},
                 {'base': 'Vietnam',
                  'channel': 'Vietnam News Agency',
                  'yturl': 'https://www.youtube.com/channel/UCN9em1FY03nO7tMV4D2lb-g/videos'},
                 {'base': 'Thailand',
                  'channel': 'The Thaiger',
                  'yturl': 'https://www.youtube.com/user/PGTVPhuket/videos'   
                 }
                ]

In [None]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

## Build Scraper Function: `get_latest_ytnewslinks`

In [None]:
def _fetch_soup(url, timeout):
    '''Download *url* and return its HTML parsed into a BeautifulSoup tree.'''
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return BeautifulSoup(res.text, 'lxml')


def _video_details(vid_soup):
    '''
    Extract (publish date, channel logo URL) from a parsed video page.
    Either value is None when the corresponding tag is absent from the page.
    '''
    published = None
    body = vid_soup.body
    if body is not None:
        date_meta = body.find('meta', attrs={'itemprop': 'datePublished'})
        if date_meta is not None:
            published = date_meta.get('content')

    logo = None
    owner_link = vid_soup.find('a', class_="yt-user-photo yt-uix-sessionlink spf-link")
    if owner_link is not None:
        img = owner_link.find('img')
        if img is not None:
            logo = img.get('data-thumb')

    return published, logo


def get_latest_ytnewslinks(channel_url, max_items=5, timeout=10):
    '''
    Scrapes YouTube for the URLs of the latest video uploads (news) from a news channel.

    Parameters:
        channel_url: URL of the channel's "videos" page.
        max_items:   maximum number of video entries to take from the page (default 5).
        timeout:     seconds to wait for each HTTP request before giving up (default 10).

    Returns a pandas DataFrame with one row per video and the columns
    'channel', 'title', 'url', 'date', 'img-src' and 'sortkey' (date + ':' + title).
    'date' and 'img-src' are None for a video whose page does not expose them.

    Raises a requests exception when a page cannot be downloaded or answers
    with an HTTP error status.
    '''
    soup = _fetch_soup(channel_url, timeout)

    # The channel name is taken from the first <meta> tag of the channel page.
    # It is the same for every video, so look it up once.
    first_meta = soup.find('meta')
    channel_name = first_meta.get('content') if first_meta is not None else None

    # Setup dictionary
    news_dict = {'channel': [],
                 'title': [],
                 'url': [],
                 'date': [],
                 'img-src': []
                }

    # Populate dictionary with the most recent entries listed on the page
    for heading in soup.find_all('h3', class_="yt-lockup-title")[:max_items]:
        link = heading.find('a')
        # Skip headings without a usable link instead of crashing on them.
        if link is None or not link.get('href'):
            continue
        title = link.get('title')
        news_dict['channel'].append(channel_name)
        news_dict['title'].append(title)
        news_dict['url'].append('https://www.youtube.com' + link.get('href'))
        print('Added on news_dict:')
        print(channel_name, '|', title, '\n')

    # Visit every video page for its publish date and the channel logo.
    # Exactly one value is appended to each list per video so that all
    # columns keep the same length even when a tag is missing.
    for video_url in news_dict['url']:
        published, logo = _video_details(_fetch_soup(video_url, timeout))
        news_dict['date'].append(published)
        news_dict['img-src'].append(logo)

    news_df = pd.DataFrame(news_dict)
    news_df['sortkey'] = news_df['date'] + ':' + news_df['title']
    print('DataFrame Completed!')

    return news_df

## Scrape Multiple News Channels

In [None]:
def get_all_latestnews():
    '''
    Uses the get_latest_ytnewslinks function to iterate throughout _news_channels_.

    Channels that cannot be scraped are reported on stdout and skipped.

    Returns a pandas DataFrame containing all the news headlines from the
    chosen news channels listed within _news_channels_, sorted by 'sortkey'
    in descending order. The same data is written to 'news.csv' and to a
    copy named 'news<UTC date> <HHMM>.csv' in the current working directory.

    Raises ValueError when no channel at all could be scraped; no file is
    written in that case.
    '''
    # Local import: the file-level import only brings in the datetime class.
    from datetime import timezone

    df_list = []

    for channel in _news_channels_:
        try:
            df_list.append(get_latest_ytnewslinks(channel_url=channel['yturl']))
        except Exception as err:
            # Best effort: one failing channel must not stop the others.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print("\nUnable to scrape from", channel['channel'], '...\n')
            print('Reason:', repr(err))

    if not df_list:
        raise ValueError('No news could be scraped from any channel in _news_channels_')

    df = pd.concat(df_list, axis=0, ignore_index=True)
    df = df.sort_values(by='sortkey', ascending=False).reset_index(drop=True)

    # Timestamp such as '2020-04-20 1030' (UTC, seconds dropped).
    stamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H%M')
    df.to_csv('news' + stamp + '.csv')
    df.to_csv('news.csv')

    return df

In [None]:
# Run the scraper over every channel in _news_channels_. This performs
# network requests and writes 'news.csv' plus a timestamped copy.
get_all_latestnews()

## Featured News: `The Jakarta Post`

In [None]:
# Hand-picked featured videos from The Jakarta Post. One dict per video:
#   'title'   - headline of the video
#   'date'    - publish date as an MM/DD/YYYY string
#   'yturl'   - URL of the video on YouTube
#   'channel' - name of the publishing channel
#   'desc'    - description text, stored as one line with single spaces
# Descriptions are built from adjacent string literals rather than backslash
# continuations, so source indentation does not leak into the text.
# TODO: add further featured videos here, each with all five keys filled in.
jakarta_post = [
    {
        'title': 'Jakarta buries more than 1,000 bodies according to COVID-19 protocol',
        'date': '04/20/2020',
        'yturl': 'https://www.youtube.com/watch?v=wrn914Nkwg8',
        'channel': 'The Jakarta Post',
        'desc': (
            "In Jakarta, 1,117 bodies were buried according to COVID-19 protocol between March "
            "7 and April 17, the latest data from the city administration shows. "
            "Jakarta is the epicenter of COVID-19 in Indonesia with almost half of the "
            "6,575 COVID-19 cases in the country having occurred in the capital as of April 19. "
            "Burials under COVID-19 protocol are given to confirmed and probable COVID-19 victims."
        ),
    },
    {
        'title': 'Fighting COVID-19: Is Southeast Asia doing enough?',
        'date': '04/12/2020',
        'yturl': 'https://www.youtube.com/watch?v=LHVKXa2OPo4',
        'channel': 'The Jakarta Post',
        'desc': (
            "As the epicenter of the COVID-19 pandemic moves to the west, Southeast Asia’s "
            "vulnerabilities are beginning to show, particularly in Indonesia. In this episode "
            "by the Asia News Network, editors from Malaysia, Indonesia, Thailand and China tell "
            "us what to expect. (ANN)"
        ),
    },
    {
        'title': 'Jakarta’s social restrictions simply explained',
        'date': '04/14/2020',
        'yturl': 'https://www.youtube.com/watch?v=CGl_giD_HGM',
        'channel': 'The Jakarta Post',
        'desc': (
            "Jakarta Governor Anies Baswedan has issued a gubernatorial decree on large-scale "
            "social restrictions (PSBB) to further limit people’s movement in the capital for "
            "14 days, starting Friday. "
            "Here’s your guideline for this unprecedented period. "
            "(JP/ Yuliasri Perdani, Vela Andapita and Sausan Atika) "
            "Animation by Sandy Riady and Rian Irawan"
        ),
    },
]