<h1>Collecting NEWS from <a href = 'https://www.himalkhabar.com/'>HimalKhabar.com</a> :: A Complete Nepali Political News Portal</h1>
<img src = 'https://user-images.githubusercontent.com/83589431/261517022-c6063311-2b22-469e-8fde-9aee8bfe818f.png'>

## Importing Libraries

In [1]:
import os
import re
from glob import glob
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
from tqdm import tqdm

In [2]:
# Defining Constants

# Base URL of the news listing pages; the page number is appended to it when requesting a page.
himalkhabar_main_portel = 'https://www.himalkhabar.com/category/samachar?page='
# Number of seconds to sleep between consecutive requests in the scraping loops.
WAIT = 1

## Defining Helper Functions

In [3]:
def get_request_object(page_url, timeout = 30):
    """Fetch ``page_url`` with an HTTP GET request and return the response.

    Args:
        page_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would freeze a long-running scraping loop.

    Returns:
        The ``requests.Response`` object for the request.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    return requests.get(page_url, timeout = timeout)

def get_soup_object(req_object):
    """Parse the body of an HTTP response into a BeautifulSoup tree.

    Args:
        req_object: Response object whose ``content`` holds the page markup.

    Returns:
        The BeautifulSoup document built with the ``lxml`` parser.
    """
    raw_markup = req_object.content
    parsed_page = soup(raw_markup, 'lxml')
    return parsed_page

## Scraping: Generating news links

In [4]:
# Maps each listing-page number to the de-duplicated article URLs found on that page.
news_link_collection = {}

# Article URLs have the form https://www.himalkhabar.com/news/<numeric id>.
# A raw string keeps `\d` a regex token instead of an invalid string escape,
# and the dots are escaped so they only match a literal '.'.
news_url_pattern = re.compile(r'https://www\.himalkhabar\.com/news/\d+')

# The checkpoint file is written into this directory; create it up front so the
# first write does not fail on a fresh checkout.
os.makedirs('scrapped_data', exist_ok = True)

# The listing had 727 pages when this cell was run (see the progress bar output).
for page_no in tqdm(range(1, 728)):

    news_links_list = []

    request_object = get_request_object(himalkhabar_main_portel + str(page_no))

    if request_object.status_code != 200:
        raise requests.exceptions.ConnectionError(f'Expects response code 200, but recieved {request_object}')

    soup_obj = get_soup_object(request_object)
    content_div = soup_obj.find('div', {'id' : 'content'})
    news_container = content_div.find('div', {'class' : 'container'}) if content_div is not None else None

    # Fail with a readable message instead of "'NoneType' object has no attribute ...".
    if news_container is None:
        raise ValueError(f'No news container found on listing page {page_no}')

    for links in news_container.find_all('a', href = True):
        # findall returns an empty list for non-article links, so no separate pre-check is needed.
        news_links_list.extend(news_url_pattern.findall(links['href']))

    # sorted() keeps the stored order reproducible between runs (set order is arbitrary).
    news_link_collection[page_no] = sorted(set(news_links_list))

    # Checkpoint after every page so an interrupted run keeps what was collected so far.
    with open('scrapped_data/link_collection.txt', 'w', encoding = 'utf-8') as f:
        f.write(str(news_link_collection))

    sleep(WAIT)

100%|██████████| 727/727 [51:37<00:00,  4.26s/it]


## Scraping: Collecting news texts

In [5]:
# Positions (within the sorted listing-page numbers) handled by this run.
# Change this to scrape a different chunk of pages.
PAGE_SLICE = slice(0, 100)


def _extract_or_nan(extract):
    """Call ``extract`` and return its result, or NaN if the lookup fails.

    Used for optional page elements: a missing tag makes the chained
    ``find(...).text`` lookups raise, and such a field is recorded as NaN.
    """
    try:
        return extract()
    except Exception:
        return np.nan


# Checkpoint files are written into this directory on every iteration.
os.makedirs('scrapped_data', exist_ok = True)

news_collection = pd.DataFrame({'Title' : [], 'Category' : [], 'Auther' : [], 'Date' : [], 'Content' : [], 'URL' : []})
failed_links = []

for key in tqdm(sorted(news_link_collection.keys())[PAGE_SLICE]):
    for news_link in news_link_collection[key]:

        try:
            request_object = get_request_object(news_link)
            # Treat HTTP error pages as failures instead of parsing them as articles.
            request_object.raise_for_status()

            soup_obj = get_soup_object(request_object)

            article_soup = soup_obj.find('aside')

            news_category = _extract_or_nan(lambda: article_soup.find('span', {'class' : 'cat-name'}).text.strip())
            news_title = _extract_or_nan(lambda: article_soup.find('span', {'class' : 'news-big-title'}).text.strip())

            # Author and date both live inside the 'media-body' block; if the block
            # is missing, both lookups below fall back to NaN.
            media_body = _extract_or_nan(lambda: article_soup.find('div', {'class' : 'media-body'}))
            auther = _extract_or_nan(lambda: media_body.find('a').text.strip())
            date_ = _extract_or_nan(lambda: media_body.find_all('span')[-1].text.strip())

            # Join the cleaned paragraphs of the article body into a single string.
            news_content = _extract_or_nan(lambda: '. '.join([
                re.sub(r'\xa0|\n|\t', '', para.text).strip()
                for para in article_soup.find('div', {'class' : 'detail-box'}).find_all('p')
            ]).strip())

            news_row = [news_title, news_category, auther, date_, news_content, news_link]

        except Exception:
            # Remember the URL itself (append, not extend) so it can be re-scraped later,
            # and keep an all-NaN placeholder row for the failed article.
            failed_links.append(news_link)
            news_row = [np.nan] * len(news_collection.columns)

        news_collection.loc[len(news_collection)] = news_row

        # Checkpoint after every article so an interrupted run loses nothing.
        with open('scrapped_data/failed_links.txt', 'w', encoding = 'utf-8') as f:
            f.write(str(failed_links))

        news_collection.to_csv('scrapped_data/news_collections.csv', index = False, encoding = 'utf-8')
        news_collection.to_json('scrapped_data/news_collections.json')

        sleep(WAIT)

100%|██████████| 100/100 [3:27:33<00:00, 124.54s/it]


<div class="alert alert-info" role="alert">
    <p><strong><u>NOTE :</u></strong></p>
    <ul>
        <li>We had about <strong>520+</strong> pages; we split them into chunks of <strong>100</strong> and ran the same code above for each chunk, changing the <strong>limit</strong> parameters, to speed up the scraping process.</li>
        <li>Once all the news from the various runs had been collected, we combined the files as shown below to get the desired output.</li>
    </ul>
</div>

## Combining collected data files

In [6]:
# Show every JSON file produced by the scraping runs, in sorted order

collected_json_files = glob('scrapped_data/*.json')
collected_json_files.sort()
collected_json_files

['scrapped_data/news_collections-0-100.json',
 'scrapped_data/news_collections-100-200.json',
 'scrapped_data/news_collections-200-300.json',
 'scrapped_data/news_collections-300-400.json',
 'scrapped_data/news_collections-400-500.json',
 'scrapped_data/news_collections-500-600.json',
 'scrapped_data/news_collections-600-700.json',
 'scrapped_data/news_collections-700+.json']

In [7]:
def combine_json_files(json_paths):
    """Load each JSON file into a DataFrame and stack them into one frame.

    Args:
        json_paths: Iterable of paths to JSON files readable by ``pd.read_json``.

    Returns:
        A DataFrame with the rows of every file, in the order given, and a
        fresh 0..n-1 index. An empty DataFrame when ``json_paths`` is empty.
    """
    frames = [pd.read_json(path) for path in json_paths]
    if not frames:
        # pd.concat raises on an empty list; keep the "nothing collected yet" case harmless.
        return pd.DataFrame()
    # A single concat avoids re-copying the growing frame for every file, and
    # ignore_index renumbers the rows so index labels stay unique across files.
    return pd.concat(frames, ignore_index = True)


combined_df = combine_json_files(sorted(glob('scrapped_data/*.json')))

print(f'Combined data collection size : {combined_df.shape}')

Combined data collection size : (21805, 6)


In [8]:
# Count the missing values in each column of the combined data
combined_df.isna().sum()

Title        0
Category     0
Auther      21
Date         0
Content     79
URL          0
dtype: int64

In [9]:
# Total number of rows in the combined data
orig_size = len(combined_df)

# NOTE: despite its name, `missing_size` holds the number of complete rows,
# i.e. the rows left after dropping every record with at least one missing field.
complete_rows = combined_df.dropna(how = 'any')
missing_size = len(complete_rows)

# Share of rows that contain at least one missing value, in percent
percent_missing = (orig_size - missing_size) / orig_size * 100

print(f'Number of totoal missing rows and % of missing : {orig_size-missing_size} & {percent_missing:.2f}%')

Number of totoal missing rows and % of missing : 100 & 0.46%


Since only **0.46%** of the rows have missing values, we remove those rows instead of re-scraping them and proceed to the next step.

## Saving the combined data collection to JSON

In [10]:
# Drop every row with a missing field, renumber the rows, and write the result to disk
cleaned_df = combined_df.dropna(how = 'any')
cleaned_df = cleaned_df.reset_index(drop = True)
cleaned_df.to_json('HimalKhabar_news_collection.json')