<h1><strong>Collecting data from <a href="https://eprateekdaily.com/">Prateek Daily Newspaper</a></strong></h1>

## Importing Libraries

In [1]:
import re
import json

import numpy as np
import pandas as pd

from tqdm import tqdm
from time import sleep

import requests
from bs4 import BeautifulSoup as soup

## Defining Helper Functions-1

In [2]:
def get_request_object(page_url, timeout = 30):
    """Download ``page_url`` with an HTTP GET and return the response.

    Parameters
    ----------
    page_url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server (connect / read) before giving up.
        ``requests`` waits indefinitely when no timeout is given, which
        would stall a crawl on a single unresponsive page.

    Returns
    -------
    requests.Response
        The response as returned by ``requests.get``; the status code is
        not checked here.

    Raises
    ------
    requests.exceptions.RequestException
        On network failures, including ``requests.exceptions.Timeout`` when
        the server does not answer within ``timeout`` seconds.
    """
    return requests.get(page_url, timeout = timeout)

def get_soup_object(req_object):
    """Parse the raw body of an HTTP response into a BeautifulSoup tree.

    The bytes in ``req_object.content`` are handed to the ``lxml`` parser.
    """
    raw_markup = req_object.content
    parsed_page = soup(raw_markup, features = 'lxml')
    return parsed_page

In [3]:
def extract_news_link(a_soup):
    """Collect the unique article URLs linked from a parsed HTML fragment.

    Parameters
    ----------
    a_soup : bs4 element
        Parsed fragment whose ``<a>`` tags are scanned.

    Returns
    -------
    list of str
        Every distinct substring of an ``href`` that matches
        ``NEWS_LINK_REGEX_FORMAT``. The order is unspecified because the
        values are de-duplicated through a set.
    """
    link_pattern = re.compile(NEWS_LINK_REGEX_FORMAT)

    news_links_ = set()
    # href = True restricts the search to anchors that actually carry an href;
    # indexing link['href'] on a bare <a> would raise KeyError.
    for link in a_soup.find_all('a', href = True):
        news_links_.update(link_pattern.findall(link['href']))

    return list(news_links_)

## Defining constants

In [4]:
# Seconds to sleep after each HTTP request (listing pages and articles alike).
WAIT = 1
# Pattern an href must contain to be treated as an article link: the site root
# followed by four numeric path segments.
# NOTE(review): the dots in the host name are unescaped, so they match any character.
NEWS_LINK_REGEX_FORMAT = r'https://eprateekdaily.com/\d+/\d+/\d+/\d+'
# Category listing URLs (percent-encoded paths); 'page/<n>/' is appended to
# them to reach the individual listing pages.
MAJOR_NEWS_URL = 'https://eprateekdaily.com/category/%E0%A4%B8%E0%A4%AE%E0%A4%BE%E0%A4%9A%E0%A4%BE%E0%A4%B0/%E0%A4%AA%E0%A5%8D%E0%A4%B0%E0%A4%AE%E0%A5%81%E0%A4%96-%E0%A4%B8%E0%A4%AE%E0%A4%BE%E0%A4%9A%E0%A4%BE%E0%A4%B0/'
LOCAL_NEWS_URL = 'https://eprateekdaily.com/category/%e0%a4%b8%e0%a4%ae%e0%a4%be%e0%a4%9a%e0%a4%be%e0%a4%b0/%e0%a4%b8%e0%a5%8d%e0%a4%a5%e0%a4%be%e0%a4%a8%e0%a5%80%e0%a4%af/'

## Defining Helper Functions-2

In [5]:
def get_news_category_text(PAGE_URL, MAX, file_name):
    """Walk the paginated listing of one category and collect its article links.

    Parameters
    ----------
    PAGE_URL : str
        Base URL of the category listing; ``page/<n>/`` is appended to it.
    MAX : int
        Number of listing pages to visit (pages ``1`` to ``MAX`` inclusive).
    file_name : str
        Prefix of the checkpoint file ``<file_name>_news_links.txt``, which is
        rewritten after every page with the links gathered so far.

    Returns
    -------
    tuple
        ``(category_text, news_links_dict)``. ``category_text`` is the text of
        the first ``<h1>`` found on a listing page (``None`` if no page had
        one). ``news_links_dict`` maps the page number to the list of article
        links found on it; key ``0`` holds the links of the featured box that
        is looked up on page 1 only.

    Raises
    ------
    requests.exceptions.ConnectionError
        If a listing page answers with a status code other than 200.
    """
    news_links_dict = {}
    category_text = None

    for page_no in tqdm(range(1, MAX+1)):

        page_URL = PAGE_URL + f'page/{page_no}/'

        request_object = get_request_object(page_URL)

        if request_object.status_code != 200:
            raise requests.exceptions.ConnectionError(f'Expects response code 200, but recieved {request_object}')

        soup_obj = get_soup_object(request_object)

        if category_text is None:
            heading = soup_obj.find('h1')
            if heading is not None:
                category_text = heading.text.strip()

        # The featured box and the regular list are handled separately so a
        # missing featured box no longer discards the regular links of page 1.
        # `except Exception` keeps the crawl best-effort but, unlike a bare
        # `except`, still lets Ctrl-C (KeyboardInterrupt) stop a long run.
        if page_no == 1:
            try:
                top_news_box = soup_obj.find('div', {'id' : 'tdi_60'}).find('div', {'class' : 'wpb_wrapper'})
                news_links_dict[0] = extract_news_link(top_news_box)
            except Exception as error:
                tqdm.write(f'page {page_no}: featured box skipped ({error!r})')

        try:
            news_links_dict[page_no] = extract_news_link(soup_obj.find('div', {'id' : 'tdi_72'}))
        except Exception as error:
            tqdm.write(f'page {page_no}: link list skipped ({error!r})')

        # Checkpoint after every page so an interrupted crawl keeps its progress.
        with open(f'{file_name}_news_links.txt', 'w', encoding = 'utf-8') as f:
            f.write(str(news_links_dict))

        sleep(WAIT)

    return category_text, news_links_dict

## Collecting links for the 2 news categories

In [6]:
# Crawl 88 listing pages of the major-news category.
# Despite its name, major_news_list is the {page number: [links]} dict.
category_text_major, major_news_list = get_news_category_text(PAGE_URL = MAJOR_NEWS_URL, MAX = 88, file_name = 'major')

100%|██████████| 88/88 [03:46<00:00,  2.58s/it]


In [7]:
# Crawl 666 listing pages of the local-news category.
# Despite its name, local_news_list is the {page number: [links]} dict.
category_text_local, local_news_list = get_news_category_text(PAGE_URL = LOCAL_NEWS_URL, MAX = 666, file_name = 'local')

100%|██████████| 666/666 [47:36<00:00,  4.29s/it]


## Defining Helper Functions-3

In [8]:
def _parse_news_page(soup_obj):
    """Extract (title, author, date, location, content) from one parsed article page.

    Each field is looked up on its own; a field whose markup is missing or
    malformed comes back as ``np.nan`` instead of failing the whole article.
    """
    try: news_title = soup_obj.find('h1').text.strip()
    except Exception: news_title = np.nan

    try: auther = soup_obj.find('div', {'class' : 'td-post-author-name'}).find('a').text.strip()
    except Exception: auther = np.nan

    try: news_date = soup_obj.find('span', {'class' : 'td-post-date'}).text.strip()
    except Exception: news_date = np.nan

    # A missing content box leaves content_box as None, so both lookups below
    # fall through to np.nan.
    try: content_box = soup_obj.find('div', {'class' : 'td-post-content tagdiv-type'})
    except Exception: content_box = None

    # The location is taken from the text after the first comma of the first paragraph.
    try: reported_location = content_box.find('p').text.split(',')[1].strip()
    except Exception: reported_location = np.nan

    # The body is every paragraph except the first one, joined with '. '.
    try: content = '. '.join([re.sub(r'\xa0|\n|\t', '', para.text).strip() for para in content_box.find_all('p')[1:]]).strip()
    except Exception: content = np.nan

    return news_title, auther, news_date, reported_location, content


def news_scraper(news_dict, news_category, new_file_name):
    """Download every article in ``news_dict`` and tabulate its fields.

    Parameters
    ----------
    news_dict : dict
        Maps a sortable key to a list of article URLs; keys are processed in
        sorted order.
    news_category : object
        Value stored in the ``Category`` column of every row.
    new_file_name : str
        Prefix of the checkpoint files ``<prefix>_failed_links.txt``,
        ``<prefix>_collections.csv`` and ``<prefix>_collections.json``, all of
        which are rewritten after every article.

    Returns
    -------
    pandas.DataFrame
        One row per processed link with the columns ``Title``, ``Category``,
        ``Auther``, ``Date``, ``Location``, ``Content`` and ``URL``. Fields
        that could not be extracted are ``np.nan``. A link whose download or
        parsing failed still gets a row (all article fields ``np.nan``, the
        URL kept) and is also listed in the failed-links file.
    """
    news_collection = pd.DataFrame({'Title' : [], 'Category' : [], 'Auther' : [], 'Date' : [], 'Location' : [], 'Content' : [], 'URL' : []})
    failed_links = []

    for key in tqdm(sorted(news_dict.keys())):
        for news_link in news_dict[key]:

            try:
                request_object = get_request_object(news_link)
                soup_obj = get_soup_object(request_object)
                news_title, auther, news_date, reported_location, content = _parse_news_page(soup_obj)
            except Exception:
                # Record the URL itself. The previous code overwrote news_link
                # with NaN first and then called extend() on it, which raised
                # TypeError inside the handler and aborted the whole scrape.
                failed_links.append(news_link)
                news_title = auther = news_date = reported_location = content = np.nan

            news_collection.loc[len(news_collection)] = [news_title, news_category, auther, news_date, reported_location, content, news_link]

            # Checkpoint after every article so an interrupted run keeps its progress.
            with open(f'{new_file_name}_failed_links.txt', 'w', encoding = 'utf-8') as f:
                f.write(str(failed_links))

            news_collection.to_csv(f'{new_file_name}_collections.csv', index = False, encoding = 'utf-8')
            news_collection.to_json(f'{new_file_name}_collections.json')

            sleep(WAIT)
        print(key)
    return news_collection

## Scraping and processing collected NEWS

In [9]:
def content_get(value):
    """Return ``np.nan`` for an empty string; hand every other value back unchanged."""
    is_empty_text = isinstance(value, str) and len(value) == 0
    return np.nan if is_empty_text else value

In [10]:
# Download and tabulate every collected major-news article.
major_news_df = news_scraper(news_dict = major_news_list, news_category = category_text_major, new_file_name = 'major_news')

In [11]:
# Treat empty article bodies as missing, then keep a single row per URL.
major_news_df['Content'] = major_news_df['Content'].apply(content_get)
major_news_df = major_news_df.drop_duplicates(subset = 'URL')

# Rows without content go to their own file; only rows with content are kept.
major_content_missing = major_news_df['Content'].isnull()
major_news_df[major_content_missing].reset_index(drop = True).to_json('eprateekdaily_major_news_conetent_missing.json')
major_news_df = major_news_df[~major_content_missing].reset_index(drop = True)
major_news_df.to_json('eprateekdaily_major_news_data.json')

print(f'Data collection size : {major_news_df.shape}\n')
major_news_df.head()

Data collection size : (806, 7)



Unnamed: 0,Title,Category,Auther,Date,Location,Content,URL
0,"पर्साका शिक्षकहरू वीरगंजमा ओइरिए, -यालीपछि धर्ना",प्रमुख समाचार,प्रतीक दैनिक,"साउन ३१, २०८०",वीरगंज,नेपाल शिक्षक महासङ्घ केन्द्रले मस्यौदाको विरोध...,https://eprateekdaily.com/2023/08/16/55307
1,मिटरब्याजीमा राजनीतिक हस्तक्षेप बढ्दो,प्रमुख समाचार,प्रतीक दैनिक,"साउन ३०, २०८०",वीरगंज,राजनीतिका कारण मुख्य दोषीहरू विभिन्न बहाना बना...,https://eprateekdaily.com/2023/08/15/55253
2,शिक्षा ऐनको विरोधमा शिक्षकद्वारा पालिका कार्या...,प्रमुख समाचार,प्रतीक दैनिक,"साउन २९, २०८०",वीरगंज,"नेपाल शिक्षक महासङ्घ, पर्साका अध्यक्ष प्रदीप ज...",https://eprateekdaily.com/2023/08/14/55191
3,निर्माणाधीन दुधौरा पुलको डाइभर्सनमा विभाग र ठे...,प्रमुख समाचार,प्रतीक दैनिक,"साउन ३२, २०८०",निजगढ,पूर्व–पश्चिम महेन्द्र राजमार्ग अन्तर्गतका सडकख...,https://eprateekdaily.com/2023/08/17/55355
4,बाढीले फास्ट ट्र्याकको पुल निर्माण गर्ने मेशिन...,प्रमुख समाचार,प्रतीक दैनिक,"भदौ १, २०८०",निजगढ,दु्रतमार्गको जसपालस्थित ८ नम्बर पुलको पाइलिङ म...,https://eprateekdaily.com/2023/08/18/55399


In [12]:
# Download and tabulate every collected local-news article.
local_news_df = news_scraper(news_dict = local_news_list, news_category = category_text_local, new_file_name = 'local_news')

In [13]:
# Treat empty article bodies as missing, then keep a single row per URL.
local_news_df['Content'] = local_news_df['Content'].apply(content_get)
local_news_df = local_news_df.drop_duplicates(subset = 'URL')

# Rows without content go to their own file; only rows with content are kept.
local_content_missing = local_news_df['Content'].isnull()
local_news_df[local_content_missing].reset_index(drop = True).to_json('eprateekdaily_local_news_conetent_missing.json')
local_news_df = local_news_df[~local_content_missing].reset_index(drop = True)
local_news_df.to_json('eprateekdaily_local_news_data.json')

print(f'Data collection size : {local_news_df.shape}\n')
local_news_df.head()

Data collection size : (5463, 7)



Unnamed: 0,Title,Category,Auther,Date,Location,Content,URL
0,परीक्षा पुस्तिका साट्ने दुई परीक्षा प्रमुख पक्राउ,स्थानीय,प्रतीक दैनिक,"साउन ३२, २०८०",वीरगंज,"वप्रका, बिर्ताका प्रहरी निरीक्ष्Fक विदुर शिवाक...",https://eprateekdaily.com/2023/08/17/55353
1,सिसी क्यामरा निगरानी कक्ष उद्घाटन,स्थानीय,प्रतीक दैनिक,"साउन ३२, २०८०",परवानीपुर,"जिल्ला प्रहरी कार्यालय, पर्सामा सिसी क्यामरा न...",https://eprateekdaily.com/2023/08/17/55350
2,अन्तर्घात हुँदा गठबन्धन –सहमहामन्त्री यादव,स्थानीय,प्रतीक दैनिक,"भदौ १, २०८०",वीरगंज,नेकाको १४औं महाधिवेशनपश्चात् नेका पर्सा क्षेत्...,https://eprateekdaily.com/2023/08/18/55396
3,पारिश्रमिक वृद्धिको माग गर्दै युनियनद्वारा आन्...,स्थानीय,प्रतीक दैनिक,"भदौ १, २०८०",वीरगंज,अध्यक्ष हेमराज न्यौपाने र सचिव प्रेम चौधरीले ज...,https://eprateekdaily.com/2023/08/18/55394
4,मोटरसाइकल दुर्घटनामा बालकको मृत्यु,स्थानीय,प्रतीक दैनिक,"भदौ १, २०८०",पर्सागढी,वीरगंज महानगरपालिका–१ छपकैया निवासी जयनाथ सिंह...,https://eprateekdaily.com/2023/08/18/55392


## Combining and saving final file

In [14]:
# Stack both categories into one frame with a fresh 0..n-1 index and save it.
eprateekdaily_final_df = pd.concat([major_news_df, local_news_df], ignore_index = True)
eprateekdaily_final_df.to_json('eprateekdaily_news_data.json')

print(f'eprateekdaily final data collection size : {eprateekdaily_final_df.shape}\n')
eprateekdaily_final_df.head()

eprateekdaily final data collection size : (6269, 7)



Unnamed: 0,Title,Category,Auther,Date,Location,Content,URL
0,"पर्साका शिक्षकहरू वीरगंजमा ओइरिए, -यालीपछि धर्ना",प्रमुख समाचार,प्रतीक दैनिक,"साउन ३१, २०८०",वीरगंज,नेपाल शिक्षक महासङ्घ केन्द्रले मस्यौदाको विरोध...,https://eprateekdaily.com/2023/08/16/55307
1,मिटरब्याजीमा राजनीतिक हस्तक्षेप बढ्दो,प्रमुख समाचार,प्रतीक दैनिक,"साउन ३०, २०८०",वीरगंज,राजनीतिका कारण मुख्य दोषीहरू विभिन्न बहाना बना...,https://eprateekdaily.com/2023/08/15/55253
2,शिक्षा ऐनको विरोधमा शिक्षकद्वारा पालिका कार्या...,प्रमुख समाचार,प्रतीक दैनिक,"साउन २९, २०८०",वीरगंज,"नेपाल शिक्षक महासङ्घ, पर्साका अध्यक्ष प्रदीप ज...",https://eprateekdaily.com/2023/08/14/55191
3,निर्माणाधीन दुधौरा पुलको डाइभर्सनमा विभाग र ठे...,प्रमुख समाचार,प्रतीक दैनिक,"साउन ३२, २०८०",निजगढ,पूर्व–पश्चिम महेन्द्र राजमार्ग अन्तर्गतका सडकख...,https://eprateekdaily.com/2023/08/17/55355
4,बाढीले फास्ट ट्र्याकको पुल निर्माण गर्ने मेशिन...,प्रमुख समाचार,प्रतीक दैनिक,"भदौ १, २०८०",निजगढ,दु्रतमार्गको जसपालस्थित ८ नम्बर पुलको पाइलिङ म...,https://eprateekdaily.com/2023/08/18/55399
