# Crawler

In [None]:
! pip install requests
! pip install BeautifulSoup4
! pip install nltk
import requests as r
import json
import time
import re
import nltk
import datetime
import re
import os
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from bs4 import BeautifulSoup
from collections import Counter
from concurrent.futures import ThreadPoolExecutor

def fetch_bbc():
    """Crawl BBC articles day by day through the dracos.co.uk archive index.

    Walks every calendar day from 2015-07-01 through 2023-11-09 (inclusive).
    For each day the archive index page is downloaded with ``retry``; every
    link whose ``href`` contains eight consecutive digits is de-duplicated and
    submitted to ``crawler`` on a 15-worker thread pool. Days whose index
    could not be fetched are skipped.

    Returns:
        None. Output is whatever ``crawler`` writes for source ``'bbc'``.
    """
    one_day = datetime.timedelta(days=1)
    end_date = datetime.date(2023, 11, 9)
    current_date = datetime.date(2015, 7, 1)
    while current_date <= end_date:
        year, month, day = current_date.year, current_date.month, current_date.day
        print(f"Year: {year}, Month: {month}, Day: {day}")
        url = f"https://dracos.co.uk/made/bbc-news-archive/{year}/{month:0>2}/{day:0>2}/"
        res, error = retry(url)
        if not error:
            bf = BeautifulSoup(res.text)
            # A set drops links that appear more than once on the index page.
            article_urls = {
                anchor['href']
                for anchor in bf.find_all('a', href=True)
                if re.search(r"\d{8}", anchor['href'])
            }
            day_stamp = f"{year}{month:0>2}{day:0>2}"
            # Leaving the ``with`` block waits for every submitted job, so
            # one day is fully processed before the next one starts.
            with ThreadPoolExecutor(max_workers=15) as executor:
                for idx, article_url in enumerate(article_urls):
                    executor.submit(crawler, article_url, 'bbc', idx, day_stamp)
        current_date += one_day

def fetch_nbc():
    """Crawl the NBC News monthly archive listings for 2003 through 2023.

    For each year/month the first listing page is fetched with ``retry``; the
    text of its enabled pagination links is remembered and used to decide
    which page number to request next and when to stop. Every link found
    inside the page's ``<main>`` element is de-duplicated and submitted to
    ``crawler`` on a 15-worker thread pool.

    Returns:
        None. Output is whatever ``crawler`` writes for source ``'nbc'``.
    """
    months = [
    'january', 'february', 'march', 'april', 'may', 'june', 
    'july', 'august', 'september', 'october', 'november', 'december'
    ]
    for year in range(2003, 2024):
        for month_no, month in enumerate(months, start=1):
            print(f"Year: {year}, Month: {month}")
            pageNo = 1
            page_list = []
            while True:
                # The first page lives at the bare month URL; later pages
                # append their number as an extra path segment.
                if pageNo == 1:
                    url = f"https://www.nbcnews.com/archive/articles/{year}/{month}/"
                else:
                    url = f"https://www.nbcnews.com/archive/articles/{year}/{month}/{pageNo}/"
                res, error = retry(url)
                if not error:
                    bf = BeautifulSoup(res.text)
                    if pageNo == 1:
                        # Pagination labels are only read from the first page.
                        page_list = [
                            link.text
                            for link in bf.find_all('a', class_='Pagination__num Pagination__enable')
                        ]
                    links = {anchor['href'] for anchor in bf.main.find_all('a', href=True)}
                    stamp = f"{year}{month_no:0>2}{pageNo}"
                    with ThreadPoolExecutor(max_workers=15) as executor:
                        for idx, link in enumerate(links):
                            executor.submit(crawler, link, 'nbc', idx, stamp)
                # Stop when there is no pagination, the last listed page has
                # been reached, or the current page could not be fetched.
                if not page_list or pageNo == int(page_list[-1]) or error:
                    break
                # NOTE(review): presumably page_list holds the page numbers
                # that follow page 1 in order, so this moves to the next
                # page - confirm against the live markup.
                pageNo = int(page_list[pageNo - 1])
                    
def fetch_cnn():
    """Crawl CNN articles through the yearly article sitemaps (2011-2023).

    Each yearly sitemap page is fetched with ``retry``; the links inside its
    first ``<section>`` are followed one by one. On every linked page the
    first ``div.sitemap-entries`` container supplies the article links and
    the ``span.date`` texts, and each article is submitted to ``crawler`` on
    a 15-worker thread pool together with the date at the same position.

    Returns:
        None. Output is whatever ``crawler`` writes for source ``'cnn'``.
    """
    for year in range(2011, 2024):
        res, error = retry(f"https://edition.cnn.com/article/sitemap-{year}.html")
        if error:
            continue
        bf = BeautifulSoup(res.text)
        first_section = bf.body.find_all('section')[0]
        url_set = {anchor['href'] for anchor in first_section.find_all('a', href=True)}
        print(url_set)
        for page_path in url_set:
            res, error = retry(f"https://edition.cnn.com{page_path}")
            if error:
                continue
            bf = BeautifulSoup(res.text)
            entries = bf.body.find_all('div', class_='sitemap-entries')[0]
            url_list = [anchor['href'] for anchor in entries.find_all('a', href=True)]
            # The first date span is dropped.
            # NOTE(review): presumably it is a column header - confirm, since
            # date_list must stay aligned with url_list for the lookup below.
            date_list = [span.text for span in entries.find_all('span', class_='date')][1:]
            with ThreadPoolExecutor(max_workers=15) as executor:
                for idx, article_url in enumerate(url_list):
                    executor.submit(crawler, article_url, 'cnn', idx, date_list[idx])

def fetch_dailymail():
    """Crawl Daily Mail articles through the yearly sitemap archive (1994-2023).

    Each yearly archive page is fetched with ``retry``; the links inside its
    first ``ul.archive-index home link-box`` list are followed one by one.
    On every linked page the first ``ul.archive-articles debate link-box``
    list supplies site-relative article links, which are prefixed with the
    site root and submitted to ``crawler`` on a 15-worker thread pool.

    Returns:
        None. Output is whatever ``crawler`` writes for source ``'dailymail'``.
    """
    for year in range(1994, 2024):
        res, error = retry(f"https://www.dailymail.co.uk/home/sitemaparchive/year_{year}.html")
        if error:
            continue
        bf = BeautifulSoup(res.text)
        index_list = bf.body.find_all('ul', class_='archive-index home link-box')[0]
        url_set = {anchor['href'] for anchor in index_list.find_all('a', href=True)}
        for url_ in url_set:
            res, error = retry(f"https://www.dailymail.co.uk{url_}")
            if error:
                continue
            bf = BeautifulSoup(res.text)
            article_list = bf.body.find_all('ul', class_='archive-articles debate link-box')[0]
            url_list = [anchor['href'] for anchor in article_list.find_all('a', href=True)]
            # Last eight characters before the first "." of the index link.
            # NOTE(review): presumably a YYYYMMDD stamp (crawler slices it
            # as such) - confirm against the archive's link format.
            date = url_.split(".")[0][-8:]
            with ThreadPoolExecutor(max_workers=15) as executor:
                for idx, article_path in enumerate(url_list):
                    executor.submit(
                        crawler,
                        f"https://www.dailymail.co.uk{article_path}",
                        'dailymail',
                        idx,
                        date,
                    )

                
def _page_keywords(node):
    """Return keywords for a parsed node's text, lowercased with punctuation removed."""
    text = " ".join(node.text.split()).lower()
    return keywords_generator(re.sub(r"[^\w\s]+", "", text))


def _bbc_category(url):
    """Derive the BBC category from the URL path (a list when a news/sport pattern matches)."""
    # url[8:] skips the "https://" prefix; element 1 is the first path segment.
    section = url[8:].split("/")[1]
    if section == "news":
        match = re.search(r"news/(.*?\d{8})", url)
        if match:
            parts = match.group(0).split('/')
            # "topic-12345678" -> "topic", "12345678" so the id can be dropped.
            if re.match(r".*-\d{8}$", parts[-1]):
                parts[-1:] = parts[-1].rsplit('-', 1)
            return parts[:-1]
    elif section == "sport":
        match = re.search(r"sport/(.*?)/\d{8}", url)
        if match:
            return match.group(0).split('/')[:-1]
    return section


def _path_category(segments, first):
    """Return [segments[first]], plus the next segment when it contains no hyphen."""
    category = [segments[first]]
    if "-" not in segments[first + 1]:
        category.append(segments[first + 1])
    return category


def crawler(url, source, idx, date, timeout=30):
    """Download one article and store its metadata as ``crawl/<source>/<name>.json``.

    The JSON object holds ``title``, ``url``, ``date``, ``keywords`` and
    ``category``. Nothing is written when the response status is not 200 or
    when ``source`` is not one of the four known sites. Any exception
    (network failure, unexpected page layout, ...) is reported with ``print``
    and swallowed so one bad article cannot take down the worker pool.

    Args:
        url: Absolute article URL.
        source: One of ``"bbc"``, ``"nbc"``, ``"cnn"``, ``"dailymail"``.
        idx: Position of the article in its listing; used in the file name.
        date: Listing date string; used in the file name and, for
            ``"dailymail"``, sliced as ``YYYYMMDD`` for the ``date`` field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        res = r.get(url, headers=headers, timeout=timeout)
        # Throttle each worker between requests.
        time.sleep(3)
        if res.status_code != 200 or source not in ("bbc", "nbc", "cnn", "dailymail"):
            return
        bf = BeautifulSoup(res.text)
        out_dir = f"crawl/{source}"
        os.makedirs(out_dir, exist_ok=True)
        data = {
            'title': " ".join(bf.h1.text.split()),
            'url': url,
        }
        if source == "bbc":
            file_name = f"{date}{idx:0>3}"
            data['date'] = bf.find('time')['datetime'][:10]
            data['keywords'] = _page_keywords(bf.find('article'))
            data['category'] = _bbc_category(url)
        elif source == "nbc":
            file_name = f"{date}{idx:0>4}"
            data['date'] = bf.find('time')['datetime'][:10]
            data['keywords'] = _page_keywords(bf.find('article'))
            data['category'] = [bf.find_all('span', attrs={'data-testid': 'unibrow-text'})[0].text.lower()]
        elif source == "cnn":
            file_name = date.replace("-", "") + f"{idx:0>4}"
            # Captures month name, day and year from the timestamp text.
            pattern = r'[A-Za-z]+\s([A-Za-z]+)\s(\d+),\s(\d{4})'
            match = re.search(pattern, " ".join(bf.find('div', class_='timestamp').text.split()))
            data['date'] = f"{int(match.group(3))}-{time.strptime(match.group(1), '%B').tm_mon:02d}-{int(match.group(2)):02d}"
            data['keywords'] = _page_keywords(bf.find('div', class_='article__content-container'))
            data['category'] = _path_category(url.split("/"), 6)
        else:  # dailymail
            file_name = date.replace("-", "") + f"{idx:0>4}"
            data['date'] = f"{date[:4]}-{date[4:6]}-{date[6:]}"
            data['keywords'] = _page_keywords(bf.find_all(attrs={"itemprop": "articleBody"})[0])
            data['category'] = _path_category(url.split("/"), 3)
        with open(f"{out_dir}/{file_name}.json", "w") as outfile:
            outfile.write(json.dumps(data, indent=4))
    except Exception as e:
        # Deliberately broad: this runs inside a thread pool whose futures
        # are never inspected, so failures are reported here and skipped.
        print(f'Error occurred while fetching {url}: {e}')
        
def keywords_generator(content):
    """Return up to 100 of the most frequent singular nouns in ``content``.

    The text is tokenized and part-of-speech tagged with NLTK; only tokens
    tagged ``"NN"`` that are not English stop words are counted. The result
    is ordered from most to least frequent.
    """
    tagged = pos_tag(word_tokenize(content))
    ignored = set(stopwords.words('english'))
    frequency = Counter(
        token
        for token, label in tagged
        if label == "NN" and token not in ignored
    )
    return [token for token, _count in frequency.most_common(100)]

def retry(url, attempts=5, delay=2, timeout=30):
    """GET ``url`` with a browser User-Agent, retrying on request errors.

    Only exceptions raised by ``requests`` trigger a retry; the HTTP status
    code of a received response is not inspected here.

    Args:
        url: Address to download.
        attempts: Maximum number of tries before giving up.
        delay: Seconds to sleep between two tries.
        timeout: Seconds to wait for the server on each try.

    Returns:
        ``(response, False)`` as soon as a response is received, or
        ``(None, True)`` once every attempt has failed.
    """
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    print(url)
    for attempt in range(attempts):
        try:
            # ``headers`` must be passed by keyword: the second positional
            # parameter of ``requests.get`` is ``params`` (the query string).
            return r.get(url, headers=headers, timeout=timeout), False
        except r.exceptions.RequestException as e:
            print(f'Error occurred while fetching {url}: {e}')
            if attempt < attempts - 1:
                print(f'Retrying in {delay} seconds...attempt: {attempt}')
                time.sleep(delay)
    print('Max retries exceeded. Giving up.')
    return None, True
# Run the site crawlers one after another, in this fixed order.
for run_crawl in (fetch_bbc, fetch_nbc, fetch_cnn, fetch_dailymail):
    run_crawl()