### Importing Libraries

In [1]:
!pip install -qq newspaper3k
import requests
import bs4
import urllib3
import time
from datetime import datetime, timedelta
import re
import pandas as pd
import numpy as np
from pandas import json_normalize
import multiprocessing as mp
import traceback
import pickle
import psutil
import time
from pathlib import Path
from newspaper import Article
from newspaper import Config
# newspaper3k configuration object; it is passed to Article(...) when an article is downloaded
config = Config()
# presumably the HTTP request timeout in seconds - TODO confirm against the newspaper3k docs
config.request_timeout = 60

import os
import sys
# raise the interpreter's recursion limit to 10000
# NOTE(review): presumably needed for parsing/pickling deeply nested pages - confirm
sys.setrecursionlimit(10000)


#import helper as TOI

# Starting date of the TOI articles and the archive's day id for that date.
# The day id of any other date is benchmark_day_id plus the number of days
# elapsed since benchmark_date; the id is needed to build an archive url.
benchmark_date = datetime(2001, 1, 1)
benchmark_day_id = 36892

# Output folders. They are kept as strings with a trailing slash because the
# rest of the file appends file names to them by plain concatenation.
ArticleData = "ArticlesData/TOI/"
Airquality = f"{ArticleData}Airquality/"
ContentNotFound = f"{ArticleData}ContentNotFound/"
Exceptions = f"{ArticleData}Exceptions/"
HtmlResponses = f"{ArticleData}HtmlResponses/"

# Create every output folder up front; folders that already exist are left alone.
for _output_dir in (ArticleData, Airquality, ContentNotFound, Exceptions, HtmlResponses):
    Path(_output_dir).mkdir(parents=True, exist_ok=True)


import logging
import sys

LOGGING_ENABLED = False


def log_print(*args, sep=" ", **kwargs):
    """print()-compatible replacement that writes the message to the root logger.

    Binding ``print`` directly to ``logger.info`` breaks every call that passes
    more than one positional argument: logging treats the extra arguments as
    %-format arguments of the first one and reports a formatting error instead
    of the message. This wrapper joins the arguments the way print() does.

    Args:
        *args: values to log; each one is converted with str().
        sep: separator placed between the values (None means a single space,
            as in print()).
        **kwargs: other print() keywords (end, file, flush) are accepted and
            ignored so existing calls keep working.
    """
    if sep is None:
        sep = " "
    logging.getLogger().info("%s", sep.join(str(arg) for arg in args))


if LOGGING_ENABLED:
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
    logger = logging.getLogger()
    # additionally append every message to a log file
    file_handler = logging.FileHandler('TOI_2.log', 'a')
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    logger.addHandler(file_handler)
    # from here on, print(...) calls in this file go through the logger
    print = log_print

print("test")

test


### Scraping Helper Functions

In [2]:
def get_calender_day_response(datetime_obj, timeout=60):
    """Download the TOI archive page that lists the articles of one day.

    Args:
        datetime_obj: datetime of the day to fetch.
        timeout: seconds to wait for the server before requests gives up and
            raises. Optional; without a timeout the request could block
            forever on an unresponsive connection.

    Returns:
        list: [datetime_obj, html_text_of_the_archive_page]. The first element
        is the datetime object that was passed in, not a string.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    # the archive identifies a day by a running id: the id of benchmark_date
    # plus the number of days elapsed since then
    day_id = benchmark_day_id + (datetime_obj - benchmark_date).days

    year, month, day = datetime_obj.year, datetime_obj.month, datetime_obj.day
    link = f"https://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{day_id}.cms"

    # only the body text is kept; the HTTP status code is not inspected here
    resp = requests.get(link, timeout=timeout).text
    return [datetime_obj, resp]

def get_articles_urls(day_page_resp):
    """Extract the article links from one archive page.

    Args:
        day_page_resp: [date, html_page_response] as returned by
            get_calender_day_response().

    Returns:
        list of dicts, one per link, with the keys
        "url" (absolute url), "title" (link text), "date" (the date passed in)
        and "media" (always "TOI").

    Raises:
        IndexError: if the page contains fewer than three <table> elements.
    """
    date, page_response = day_page_resp[0], day_page_resp[1]

    scrapper = bs4.BeautifulSoup(page_response, "html.parser")

    # the third table of the archive page is the one that holds the article links
    article_table = scrapper.find_all('table')[2]

    home_url = "https://timesofindia.indiatimes.com"
    complete_urls = []
    # search the table tag directly instead of serialising and re-parsing it
    for anchor in article_table.find_all("a"):
        href = anchor.get('href')
        if href is None:
            # an anchor without href has no article to point to; skipping it
            # avoids an AttributeError that would lose the whole day
            continue
        url = href.strip()
        # relative links are completed with the site root
        if not url.startswith("http"):
            url = home_url + url

        # anchor.text is the heading of the article
        complete_urls.append({"url": url, "title": anchor.text, "date": date, "media": "TOI"})

    return complete_urls


def TOI_get_author_and_city(resp):
    """Pull the byline text and the breadcrumb text out of a TOI article page.

    Args:
        resp: html of the article page.

    Returns:
        list: [author_str, category_str]. author_str is the concatenated text
        of all byline <div>s; category_str is the concatenated text of the
        breadcrumb <div>s (the caller stores this second value as "city").
        Either string is empty when no matching tag exists.
    """
    scrapper = bs4.BeautifulSoup(resp, "html.parser")

    # byline containers
    author_tags = scrapper.find_all("div", class_=["yYIu- byline", "as_byline", "byline-content"])

    # breadcrumb containers; fall back to the id-based lookup when the
    # class-based one finds nothing
    category_tags = scrapper.find_all("div", class_=["YhAlT", "navbdcrumb"])
    if not category_tags:
        category_tags = scrapper.find_all("div", attrs={"id": "breadcrumb"})

    author_str = "".join(tag.text for tag in author_tags)
    category_str = "".join(tag.text for tag in category_tags)
    return [author_str, category_str]
    


### Common Functions between Hindu & TOI

In [3]:
def get_regex_string_from_single_word(keyword,space=False,dostrip=True):
    """Turn a keyword (one or more words) into a regex fragment.

    The keyword is lower-cased and every single space between its words is
    replaced by ``\\s`` so that any whitespace character is accepted there.

    Args:
        keyword: keyword or phrase, e.g. "Air Pollution".
        space: when true, wrap the fragment in ``\\b`` word-boundary anchors.
            (Previously this flag was overwritten inside the function and had
            no effect.)
        dostrip: when true, strip leading/trailing whitespace first.

    Returns:
        str: the regex fragment, e.g. "air\\spollution".

    Note:
        The characters of the keyword are inserted verbatim, so regex
        metacharacters in it (such as the "." in "pm2.5") keep their regex
        meaning.
    """
    boundary = "\\b" if space else ""

    keyword = keyword.lower()
    if dostrip:
        keyword = keyword.strip()

    # accept any white space character between two words
    return boundary + "\\s".join(keyword.split(" ")) + boundary


# Load the keyword list and derive one regex fragment per keyword.
keyword_df = pd.read_csv("airpollution_keywords.csv")
keyword_df["regex_string"] = keyword_df["keywords"].apply(get_regex_string_from_single_word)
keyword_df["keywords"] = keyword_df["keywords"].map(str.lower)

# every (lower-cased) keyword starts with a count of zero
keywords_count = dict.fromkeys(keyword_df.keywords, 0)

# One alternation over all fragments. The blanks around "|" are harmless
# because the pattern is compiled in verbose mode (re.X) by
# get_air_quality_keywords().
keywords_match_string = " | ".join(keyword_df["regex_string"])
print("regex matching string:\n",keywords_match_string)


def get_air_quality_keywords(text,match_string):
    """Count how often each keyword of match_string occurs in text.

    Args:
        text: text to search.
        match_string: regex alternation of keywords. It is compiled
            case-insensitively and in verbose mode, so literal whitespace in
            the pattern is ignored.

    Returns:
        dict: matched text (stripped and lower-cased) -> number of occurrences.
        Empty when nothing matched.
    """
    matcher = re.compile(match_string, flags=re.I | re.X)

    count = {}
    for each in matcher.findall(text):
        key = each.strip().lower()
        # dict.get replaces the former bare "except:", which would also have
        # hidden unrelated errors
        count[key] = count.get(key, 0) + 1
    return count

def scrape_airquality_article(url_data):
    """Download one article and keep it if it mentions an air-quality keyword.

    Args:
        url_data: dict with the keys "url", "title" and "date"; "category" is
            optional and "media" ("hindu" or "TOI") is read once a keyword
            has matched.

    Returns:
        A two-element list whose content depends on the outcome:
        * ["aid", "no macthes"]   - the url contains "//ads."; nothing is downloaded
        * ["notfound", url_data]  - fewer than 10 characters of article text
        * [{}, "No macthes"]      - text found but no keyword matched
        * [article_dict, html]    - at least one keyword matched
        * ["exception", url_data] - any error; url_data additionally gets the
          keys "exception_title" and "exception_info"
    """
    # advertisement links are never scraped
    if "//ads." in url_data["url"]:
        return ["aid","no macthes"]

    try:
        url = url_data["url"].strip()
        title = url_data["title"]
        datetime_obj = url_data["date"]
        category = url_data["category"] if "category" in url_data else ""

        # download and parse with the newspaper3k library
        article = Article(url,config = config)
        article.download()
        article.parse()

        # article text not found
        if len(article.text.strip()) < 10:
            return ["notfound",url_data]

        # air-quality related keywords
        matches = get_air_quality_keywords(article.text,keywords_match_string)
        if not matches:
            return [{},"No macthes"]

        # at least one keyword found
        print(f"Found in {category}\n{title}")

        # author & city come from a media-specific parser
        author = ""
        city = ""
        media = url_data["media"]
        if media == "hindu":
            author, city = hindu_get_author_and_city(article.html)
        if media == "TOI":
            author, city = TOI_get_author_and_city(article.html)

        published = datetime_obj.date()
        other = {"author":author,"city":city,"top_image":article.top_image,
                 "category":category,"keywords":article.meta_keywords,
                 "tags":list(article.tags)}
        articles_json = {"date":published,
                         "url":url,
                         "heading":title,
                         "content":article.text,
                         "other":other,
                         "matches":matches}
        return [articles_json,article.html]

    except Exception as e:
        # remember what went wrong so that the url can be retried later
        trace = traceback.format_exc()
        url_data["exception_title"] = str(type(e))
        url_data["exception_info"] = trace
        print("****************************************Exception***********************************************")
        print(traceback.format_exc())
        print("***************************************************************************************")
        return ["exception",url_data]
    
def _dump_pickle(obj, path):
    """Pickle obj to path and close the file even if pickling fails."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def store_article_data(articles,fname):
    """Sort scraping results by outcome and pickle each group.

    Args:
        articles: list of two-element results as returned by
            scrape_airquality_article().
        fname: file name used inside each of the four output folders
            (HtmlResponses, Airquality, ContentNotFound, Exceptions).

    Results that are neither a non-empty article dict, an "exception" nor a
    "notfound" entry are dropped. All four files are written, even when their
    list is empty.
    """
    content_not_found_urls = []
    air_quality_articles = []
    html_resp = []
    exceptions = []

    for each in articles:
        outcome, payload = each[0], each[1]

        # air quality related article: non-empty structured dict + page html
        if isinstance(outcome, dict) and outcome != {}:
            air_quality_articles.append(outcome)
            html_resp.append({"heading":outcome["heading"],"html":payload})

        # exception occurred while scraping; payload is the url record
        elif "exception" in outcome:
            exceptions.append(payload)

        # content not found; payload is the url record
        elif "notfound" in outcome:
            content_not_found_urls.append(payload)

    # each file is opened in a with-block so the handle is closed right away
    _dump_pickle(html_resp, HtmlResponses+fname)
    _dump_pickle(air_quality_articles, Airquality+fname)
    _dump_pickle(content_not_found_urls, ContentNotFound+fname)
    _dump_pickle(exceptions, Exceptions+fname)

regex matching string:
 air\spollution | air\squality | aqi | carbon\smonoxide | pm2.5 | pm10 | ozone | pm\s2.5 | pm\s10 | stubble\sburning | smog | sulphur\sdioxide | nitrogen\sdioxide | so2 | air\spollutants | acid\srain | crop\sburning | pm2·5 | odd\seven | airpollution | car\semissions | oddeven | airquality


### Scraping of Articles from start date to end date

In [4]:
#one worker process per cpu
n_cpu = mp.cpu_count()

#pool for multiprocessing
pool = mp.Pool(processes=n_cpu)

#enter start date and end date (both inclusive)
start_date = datetime(2021,1,1)
end_date = datetime(2021,1,3)

#one entry [date, exception type, traceback] per day whose archive page could not be fetched
home_page = []

#step between two dates
delta = timedelta(days=1)
start = time.time()

while start_date <= end_date:
    start1 = time.time()

    #get articles' page response
    try:
        resp = get_calender_day_response(start_date)
    except Exception as e:
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ Home Page Exception @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        print(traceback.format_exc())
        home_page.append([start_date,str(type(e)),traceback.format_exc()])
        #without a page there is nothing to parse; falling through would use an
        #undefined resp (first day) or the previous day's page, so skip this day
        start_date += delta
        continue

    #extract only article's urls from article page
    urls =  get_articles_urls(resp)
    total_urls = len(urls)
    print(f"Date = {str(start_date)}, total urls = {total_urls}")

    fname = str(start_date.date())

    #run statistics are kept in one pickle file that is updated after every day
    info_path = ArticleData+"additional_info"
    try:
        with open(info_path,"rb") as info_file:
            additional_info = pickle.load(info_file)
    except FileNotFoundError:
        additional_info = {}
        with open(info_path,"wb") as info_file:
            pickle.dump(additional_info,info_file)

    additional_info[fname] = {}
    additional_info[fname]["start_timestamp"] = str(datetime.today())
    additional_info[fname]["total"] = total_urls

    #scrape content of each article using multiprocessing
    #NOTE(review): only the first 10 urls of the day are scraped although
    #"total" above records all of them - looks like a debugging limit; confirm
    articles = pool.map(scrape_airquality_article,urls[:10])

    #additional info
    additional_info[fname]["end_timestamp"] = str(datetime.today())
    with open(info_path,"wb") as info_file:
        pickle.dump(additional_info,info_file)

    store_article_data(articles,fname)

    end1 = time.time()
    print(f"-------------------------{end1 - start1} seconds - {str(start_date)}----------------------------")
    start_date += delta
    #time.sleep(10)


#total run time in hours
end = time.time()
print((end - start)/3600)

Date = 2021-01-01 00:00:00, total urls = 501
-------------------------2.339319944381714 seconds - 2021-01-01 00:00:00----------------------------
Date = 2021-01-02 00:00:00, total urls = 501
-------------------------2.237348794937134 seconds - 2021-01-02 00:00:00----------------------------
Date = 2021-01-03 00:00:00, total urls = 501
-------------------------2.0526089668273926 seconds - 2021-01-03 00:00:00----------------------------
1626143035.603887


In [5]:
#show the days whose archive page could not be fetched; each entry is [date, exception type, traceback]
print(home_page)

[]


### Scraping for Exceptions
* Sometimes a URL is valid but its content cannot be scraped because of a networking issue, so scraping those URLs again recovers the content for most of them.


In [1]:
#one worker process per cpu
n_cpu = mp.cpu_count()

#pool for multiprocessing
pool = mp.Pool(processes=n_cpu)

for year in range(2021,2022):
    delta = timedelta(days=1)
    start, end = datetime(year,1,1).date(), datetime(year,12,31).date()

    #collect the url records stored in the Exceptions folder for every day of
    #the year; days without a file are skipped
    json_resp = []
    while start<=end:
        day_file = Exceptions+str(start)
        start+=delta
        try:
            day_records = pd.read_pickle(day_file)
        except FileNotFoundError:
            continue
        json_resp.extend(day_records)

    print(f"{len(json_resp)} urls in year-{year}")

    #re scrape all articles
    articles = pool.map(scrape_airquality_article,json_resp)

    #store articles if found
    #NOTE(review): start has already advanced one day past end here, so the
    #name reads re_<day after end>_<day before end> - confirm this is intended
    store_article_data(articles,f"re_{str(start)}_{str(end-delta)}")
    