In [12]:
# !pip install newspaper3k

In [1]:
import requests
import bs4
import urllib3
import time
from datetime import datetime, timedelta
import re
import pandas as pd
import numpy as np
from pandas import json_normalize
import multiprocessing as mp
import traceback
import pickle
import psutil
import time
from newspaper import Article
import os
from newspaper import Config
# newspaper configuration object; it is passed to every Article(...) created in this notebook
config = Config()
# request timeout handed to newspaper (presumably seconds - confirm against the newspaper3k docs)
config.request_timeout = 60

import sys
# raise the interpreter recursion limit to 10000
# NOTE(review): presumably needed for deeply nested HTML while parsing/pickling - confirm
sys.setrecursionlimit(10000)


from pathlib import Path
#root folder for everything this notebook writes
ArticleData = "ArticlesData/Hindu/"

#one sub-folder per kind of output
Airquality = ArticleData+"Airquality/"            #structured data of matching articles
ContentNotFound = ArticleData+"ContentNotFound/"  #urls whose article text was missing
Exceptions = ArticleData+"Exceptions/"            #urls that raised while being scraped
HtmlResponses = ArticleData + "HtmlResponses/"    #raw html of matching articles
HomeExceptions = ArticleData + "HomeExceptions/"  #failures while fetching the daily archive page

#create the folders up front (no error if they already exist)
for _output_dir in (ArticleData, Airquality, ContentNotFound, Exceptions, HtmlResponses, HomeExceptions):
    Path(_output_dir).mkdir(parents=True, exist_ok=True)

import logging
import sys

#set to True to route every print() in this notebook through the root logger
#(console via basicConfig plus the 'hindu_2.log' file) instead of plain stdout
LOGGING_ENABLED = False
if LOGGING_ENABLED:
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
    logger = logging.getLogger()
    file_handler = logging.FileHandler('hindu_2.log', 'a')
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    logger.addHandler(file_handler)

    def print(*args, sep=" ", **kwargs):
        """Stand-in for the builtin print that writes one INFO log record.

        All positional arguments are converted with str() and joined with
        `sep`, mirroring print(). Binding `logger.info` directly would treat
        every argument after the first as a %-format argument and fail on
        calls such as print("label:", value). Other print keywords
        (end, file, flush) are accepted and ignored.
        """
        logger.info(sep.join(str(arg) for arg in args))

### Hindu Helper Functions

In [2]:
#html response of the archive page that lists the article urls of a specific date
def hindu_get_article_list_html_resp(datetime_obj, timeout=60):
    """Download The Hindu web-archive listing page for one date.

    Parameters
    ----------
    datetime_obj : object with ``year``, ``month`` and ``day`` attributes
        Date whose archive page is requested.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout. Without a timeout a
        stalled connection would block the caller indefinitely.

    Returns
    -------
    list
        ``[html_text, datetime_obj]`` on success, or
        ``["exception", exception_type_as_str, traceback_text]`` when the
        request raises.
    """
    #make a dynamic link
    link = f"https://www.thehindu.com/archive/web/{datetime_obj.year}/{datetime_obj.month}/{datetime_obj.day}/"

    #get html response
    try:
        resp = requests.get(link, timeout=timeout)
        return [resp.text,datetime_obj]

    except Exception as e:
        trace = traceback.format_exc()
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ Exception in Home Page @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        print(trace)
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        return ["exception",str(type(e)),trace]
    
    
#scrape urls of articles from the archive listing page
def hindu_get_article_urls(html_resp,datetime_obj):
    """Extract one record per article link from an archive listing page.

    Parameters
    ----------
    html_resp : str
        HTML text of the archive page.
    datetime_obj : object
        Stored unchanged under the ``"date"`` key of every record.

    Returns
    -------
    list of dict
        Each dict has the keys ``url``, ``title``, ``date``, ``category`` and
        ``media`` (always ``"hindu"``). Sections without an ``archive-list``
        and anchors without an ``href`` are skipped; a section without a
        heading link gets an empty category.
    """
    scrapper = bs4.BeautifulSoup(html_resp,"html.parser")

    #matcher of sections which contain urls of articles
    section_matcher = re.compile("section_[0-9]+",flags = re.I)

    url_data = []
    for section in scrapper.find_all("section",attrs={"id":section_matcher}):
        #category of article i.e. sports, pollution
        heading = section.find("a",attrs={"class":"section-list-heading"})
        category = heading.text.strip() if heading is not None else ""

        #a section without a link list has nothing to scrape
        archive_list = section.find("ul",class_ = "archive-list")
        if archive_list is None:
            continue

        #article urls
        for anchor in archive_list.find_all("a"):
            href = anchor.get("href")
            #an anchor without a target would hand url=None to the scraper
            if not href:
                continue
            url_data.append({"url":href,"title":anchor.text,"date":datetime_obj,"category":category,"media":"hindu"})

    return url_data
    
#scrape author & city from article
def hindu_get_author_and_city(html_resp):
    """Read the author name and city from an article page.

    Parameters
    ----------
    html_resp : str
        HTML text of the article.

    Returns
    -------
    list
        ``[author, city]``. Either entry is ``None`` when the matching
        element is not present in the page, so a page without the author
        block yields ``[None, None]`` instead of raising.
    """
    soap = bs4.BeautifulSoup(html_resp,"html.parser")
    author_section = soap.find("div",class_="author-container hidden-xs")
    if author_section is None:
        return [None,None]

    author = None
    author_tag = author_section.find("span",class_="author-img-name 1")
    if author_tag is not None:
        #the character class drops commas, asterisks and every whitespace character
        author = re.sub(r"[,*\s*\n*]","",author_tag.text.strip())

    city = None
    ut_container = author_section.find("div",class_ = "ut-container")
    place_time = ut_container.find_all("span") if ut_container is not None else []
    #NOTE(review): presumably the first span is the city only when at least three spans exist - verify against the page layout
    if len(place_time) >= 3:
        #same clean-up as the author, additionally dropping colons
        city = re.sub(r"[,*\s*\n*:*]","",place_time[0].text.strip())
    return [author,city]
    


### Common Functions between TOI & Hindu

In [4]:
def get_regex_string_from_single_word(keyword,space=False,dostrip=True):
    """Turn a keyword phrase into a regex fragment.

    The keyword is lower-cased, optionally stripped, and split on single
    spaces; the pieces are joined with ``\\s`` so one whitespace character is
    accepted between words.

    Parameters
    ----------
    keyword : str
        Keyword or phrase, e.g. ``"Air Pollution"``.
    space : bool, optional
        When true, the fragment is wrapped in ``\\b`` word boundaries.
    dostrip : bool, optional
        When true (default), leading/trailing whitespace is removed first.

    Returns
    -------
    str
        The regex fragment. The keyword text is inserted verbatim, so regex
        metacharacters in it (such as ``.``) are not escaped.
    """
    #keep the boundary marker in its own variable so the `space` flag is not overwritten
    boundary = "\\b" if space else ""

    keyword = keyword.lower()
    if dostrip:
        keyword = keyword.strip()

    #"\\s" accepts one white space character between two words
    return boundary + "\\s".join(keyword.split(" ")) + boundary


#keyword list; the "keywords" column holds one keyword or phrase per row
keyword_df = pd.read_csv("airpollution_keywords.csv")

#regex fragment for every keyword, then the lower-cased keyword itself
keyword_df["regex_string"] = keyword_df["keywords"].map(get_regex_string_from_single_word)
keyword_df["keywords"] = keyword_df["keywords"].map(lambda kw: kw.lower())

#zero-initialised counter per keyword
keywords_count = dict.fromkeys(keyword_df.keywords, 0)

#regex matching string: all fragments as alternatives separated by " | "
keywords_match_string = " | ".join(keyword_df["regex_string"])
print("regex matching string:\n",keywords_match_string)


def get_air_quality_keywords(text,match_string):
    """Count how often each keyword alternative occurs in a text.

    Parameters
    ----------
    text : str
        Text to search.
    match_string : str
        Regex pattern; compiled with ``re.I`` (case-insensitive) and ``re.X``
        (whitespace inside the pattern is ignored).

    Returns
    -------
    dict
        Maps each matched string (stripped and lower-cased) to its number of
        occurrences. Empty when nothing matches.
    """
    matcher = re.compile(match_string,flags=re.I | re.X)

    #tally with dict.get instead of a bare except, so unrelated errors are not swallowed
    count = {}
    for each in matcher.findall(text):
        key = each.strip().lower()
        count[key] = count.get(key, 0) + 1
    return count

def scrape_airquality_article(url_data):
    """Download one article and keep it if it mentions an air-quality keyword.

    Parameters
    ----------
    url_data : dict
        Needs the keys ``url``, ``title``, ``date`` and ``media``;
        ``category`` is optional. On failure the keys ``exception_title`` and
        ``exception_info`` are added to this same dict.

    Returns
    -------
    list
        A two-element list whose first element tells the outcome:

        * ``["aid", "no macthes"]`` - the url is an ad link and is skipped
        * ``["notfound", url_data]`` - fewer than 10 characters of article text
        * ``[article_dict, html]`` - at least one keyword matched
        * ``[{}, "No macthes"]`` - article parsed, no keyword matched
        * ``["exception", url_data]`` - anything raised while scraping
    """
    author = ""
    category = ""
    city = ""
    try:
        #the ad check sits inside the try so that a missing or non-string url
        #is reported as an "exception" result instead of raising out of the worker
        if "//ads." in url_data["url"]:
            return ["aid","no macthes"]

        url,title,datetime_obj = url_data["url"].strip(), url_data["title"], url_data["date"]
        if "category" in url_data:
            category = url_data["category"]

        #newspaper3k library
        article = Article(url,config = config)
        article.download()
        article.parse()

        #article text not found
        if len(article.text.strip()) < 10:
            return ["notfound",url_data]

        #air quality related keywords
        matches = get_air_quality_keywords(article.text,keywords_match_string)

        #no keyword found
        if len(matches) == 0:
            return [{},"No macthes"]

        print(f"Found in {category}\n{title}")

        #get author & city
        if url_data["media"] == "hindu":
            author, city = hindu_get_author_and_city(article.html)
        elif url_data["media"] == "TOI":
            #NOTE(review): TOI_get_author_and_city is not defined in this part of the notebook - confirm it exists before scraping TOI urls
            author, city = TOI_get_author_and_city(article.html)

        articles_json = {"date":datetime_obj.date(),
                    "url":url,
                    "heading":title,
                    "content":article.text,
                    "other":{"author":author,"city":city,"top_image":article.top_image,
                             "category":category,"keywords":article.meta_keywords,
                             "tags":list(article.tags)},
                    "matches":matches
                    }
        return [articles_json,article.html]

    except Exception as e:
        trace = traceback.format_exc()
        url_data["exception_title"] = str(type(e))
        url_data["exception_info"] = trace
        print("****************************************Exception***********************************************")
        print(trace)
        print("***************************************************************************************")
        return ["exception",url_data]
    
def store_article_data(articles,fname):
    """Sort scrape results by outcome and pickle each group under `fname`.

    Parameters
    ----------
    articles : iterable of list
        Two-element results as returned by ``scrape_airquality_article``.
    fname : str
        File name used inside each of the four output folders
        (``HtmlResponses``, ``Airquality``, ``ContentNotFound``,
        ``Exceptions``). Existing files with that name are overwritten.

    Results that are neither a non-empty dict, ``"exception"`` nor
    ``"notfound"`` (ad links, articles without a match) are not stored.
    """
    content_not_found_urls = []   #url records whose article text was missing
    air_quality_articles = []     #structured data of matching articles
    html_resp = []                #raw html of matching articles
    exceptions = []               #url records that raised while being scraped

    for each in articles:
        outcome, payload = each[0], each[1]

        #air quality related article
        if isinstance(outcome, dict) and outcome != {}:
            air_quality_articles.append(outcome)
            html_resp.append({"heading":outcome["heading"],"html":payload})

        #exception occurred while scraping
        elif outcome == "exception":
            print("stored Exception")
            exceptions.append(payload)

        #content not found
        elif outcome == "notfound":
            content_not_found_urls.append(payload)

    #store all the data in the respective pickle files
    _dump_pickle(html_resp,HtmlResponses+fname)
    _dump_pickle(air_quality_articles,Airquality+fname)
    _dump_pickle(content_not_found_urls,ContentNotFound+fname)
    _dump_pickle(exceptions,Exceptions+fname)


def _dump_pickle(obj,path):
    """Pickle `obj` to `path`, closing the file even if pickling fails."""
    with open(path,"wb") as out_file:
        pickle.dump(obj,out_file)

regex matching string:
 air\spollution | air\squality | aqi | carbon\smonoxide | pm2.5 | pm10 | ozone | pm\s2.5 | pm\s10 | stubble\sburning | smog | sulphur\sdioxide | nitrogen\sdioxide | so2 | air\spollutants | acid\srain | crop\sburning | pm2·5 | odd\seven | airpollution | car\semissions | oddeven | airquality


### Scraping

In [5]:
#number of cpus
n_cpu = mp.cpu_count()

start_date = datetime(2021,1,1)
end_date = datetime(2021,1,3)

#distance between two dates
delta = timedelta(days=1)

#cap on the number of article urls scraped per day; set to None to scrape every listed url
MAX_URLS_PER_DAY = 10

start = time.time()

#pool for multiprocessing; the with-block shuts the workers down when the run ends or fails
with mp.Pool(processes=n_cpu) as pool:
    while start_date <= end_date:
        start1 = time.time()

        #additional info (per-date bookkeeping kept in one pickle written by this notebook)
        fname = str(start_date.date())
        try:
            with open(ArticleData+"additional_info","rb") as info_file:
                additional_info = pickle.load(info_file)
        except FileNotFoundError:
            additional_info = {}

        additional_info[fname] = {}
        additional_info[fname]["start_timestamp"] = str(datetime.today())

        #get articles' page response
        status = hindu_get_article_list_html_resp(start_date)
        if status[0] == "exception":
            #record the failure (type and full traceback) and move on to the next day;
            #without a listing page there is nothing to scrape for this date
            _,exception_title,exception_info = status
            with open(HomeExceptions+fname,"wb") as exception_file:
                pickle.dump([start_date,exception_title,exception_info],exception_file)
            start_date += delta
            continue
        resp, _ = status

        #extract only article's urls from article page
        urls =  hindu_get_article_urls(resp,start_date)
        total_urls = len(urls)
        print(f"Date = {str(start_date)}, total urls = {total_urls}")

        #scrape content of each article using multiprocessing
        articles = pool.map(scrape_airquality_article,urls[:MAX_URLS_PER_DAY])

        #additional info ("total" counts every listed url, not only the scraped ones)
        additional_info[fname]["total"] = total_urls
        additional_info[fname]["end_timestamp"] = str(datetime.today())
        with open(ArticleData+"additional_info","wb") as info_file:
            pickle.dump(additional_info,info_file)

        #store necessary data
        store_article_data(articles,fname)

        end1 = time.time()
        print(f"-------------------------{end1 - start1} seconds - {str(start_date)}----------------------------")
        start_date += delta
        #time.sleep(10)


end = time.time()
print(end - start)

Date = 2021-01-01 00:00:00, total urls = 431
-------------------------9.862453699111938 seconds - 2021-01-01 00:00:00----------------------------
Date = 2021-01-02 00:00:00, total urls = 386
-------------------------9.360830068588257 seconds - 2021-01-02 00:00:00----------------------------
Date = 2021-01-03 00:00:00, total urls = 323
-------------------------8.510723352432251 seconds - 2021-01-03 00:00:00----------------------------
27.735363245010376


### Re-scraping Exceptions
* Sometimes a URL is valid but its content cannot be scraped because of a transient network issue. Scraping those URLs a second time recovers the content for most of them.


In [17]:
#number of cpus
n_cpu = mp.cpu_count()

#distance between two dates
delta = timedelta(days=1)

#pool for multiprocessing; the with-block shuts the workers down when the run ends or fails
with mp.Pool(processes=n_cpu) as pool:
    for year in range(2011,2020):
        year_start = datetime(year,1,1).date()
        end = datetime(year,12,31).date()

        #collect every url record stored in this year's per-day exception files
        json_resp = []
        start = year_start
        while start<=end:
            try:
                #these pickles are written by store_article_data; do not point this at untrusted files
                day_exceptions = pd.read_pickle(Exceptions+str(start))
            except FileNotFoundError:
                #no exception file was written for this day
                day_exceptions = []
            json_resp.extend(day_exceptions)
            start+=delta

        print(f"{len(json_resp)} urls in year-{year}")
        articles = pool.map(scrape_airquality_article,json_resp)

        #name the output after the year's first and last day; the loop cursor
        #`start` has already advanced one day past `end` at this point
        store_article_data(articles,f"re_{str(year_start)}_{str(end)}")

2021-05-31 00:07:48,446 - 0 urls in year-2011
2021-05-31 00:07:48,477 - 0 urls in year-2012
2021-05-31 00:07:48,502 - 0 urls in year-2013
2021-05-31 00:07:48,528 - 2 urls in year-2014
2021-05-31 00:07:48,990 - 0 urls in year-2015
2021-05-31 00:07:49,018 - 0 urls in year-2016
2021-05-31 00:07:49,053 - 0 urls in year-2017
2021-05-31 00:07:49,080 - 66 urls in year-2018
2021-05-31 00:07:49,908 - Found in delhi
Delhi’s air quality continues to be in ‘poor’ category 
2021-05-31 00:07:51,359 - 0 urls in year-2019


In [18]:
# 10226.08913898468/60/60