In [220]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import re

In [210]:
def moneyScrapper(url, timeout=30, skip_paragraphs=67):
    '''
    moneyScrapper -> function scrapes one article from the money control website

    Arguments
        url - input url to the webpage to be scraped
        timeout - seconds to wait for the server before giving up (default 30)
        skip_paragraphs - number of leading <p> tags dropped before the article
                          text is assembled (default 67, the offset this
                          function has always used)

    Variables
        page -> GET request response
        soup -> BeautifulSoup object to get the HTML structure of the webpage

        heading -> heading of the extracted article
        time_stamp -> date and time of the posting of the article
        image_source -> url of the image posted along the article,
                        '_' when the page has no article image
        texts -> the <p> tags left after the first skip_paragraphs are dropped
        article_content -> string of all text from those <p> tags added together

    Return
        a list containing heading, time_stamp, image_source, article_content, url of webpage

    Raises
        requests.exceptions.HTTPError -> the server answered with a 4xx/5xx status
        requests.exceptions.Timeout -> no answer arrived within timeout seconds
        AttributeError -> the page has no <h1> or no 'article_schedule' <div>
    '''


    print('[NOTE] Scraping data from Money Control........')

    # requests has no default timeout; without one a stalled server blocks the run forever
    page = requests.get(url, timeout=timeout)
    # stop on an error status instead of trying to parse an error page as an article
    page.raise_for_status()
    soup = bs(page.content, 'html.parser')

    #article heading
    heading = soup.h1.getText()

    #article posting date and time, with '/' separators turned into ','
    time_stamp = soup.find('div', {'class':'article_schedule'}).get_text().replace('/', ',')

    #article image url; '_' is the same "no image" placeholder economicScrapper returns
    image_source = '_'
    image_block = soup.find('div', {'class':'article_image'})
    image_tag = image_block.find('img') if image_block is not None else None
    if image_tag is not None:
        image_source = image_tag.get('data-src')

    #article content and text
    texts = soup.find_all('p')[skip_paragraphs:]
    article_content = ''.join(text.getText() for text in texts)

    # list of extracted information
    return [heading, time_stamp, image_source, article_content, url]

In [211]:
# Month abbreviations mapped to full names ('May' is already a full name).
_MONTH_FULL_NAMES = {
    'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
    'Jun': 'June', 'Jul': 'July', 'Aug': 'August', 'Sep': 'September',
    'Sept': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December',
}
# \b on both sides so a month that is already spelled out (e.g. 'October') is left alone
_MONTH_ABBREVIATION = re.compile(r'\b(' + '|'.join(_MONTH_FULL_NAMES) + r')\b')


def _expand_month_abbreviations(text):
    '''Replace every standalone month abbreviation in text with the full month name.'''
    return _MONTH_ABBREVIATION.sub(lambda match: _MONTH_FULL_NAMES[match.group(1)], text)


def economicScrapper(url, timeout=30):
    '''
    economicScrapper -> function scrapes one article from the economic times website

    Arguments
        url - input url to the webpage to be scraped
        timeout - seconds to wait for the server before giving up (default 30)

    Variables
        page -> GET request response
        soup -> BeautifulSoup object to get the HTML structure of the webpage

        heading -> heading of the extracted article
        time_stamp -> text of the <time> tag after its first ':' (the whole text
                      when there is no ':'), with month abbreviations spelled out
        image_source -> always '_', no image is extracted for these articles
        article_content -> text of the <div> with class 'artText'

    Return
        a list containing heading, time_stamp, image_source, article_content, url of webpage

    Raises
        requests.exceptions.HTTPError -> the server answered with a 4xx/5xx status
        requests.exceptions.Timeout -> no answer arrived within timeout seconds
        AttributeError -> the page has no <h1>, <time> or 'artText' <div>
    '''

    print('[NOTE] Scrapping data from Economic times......')

    # requests has no default timeout; without one a stalled server blocks the run forever
    page = requests.get(url, timeout=timeout)
    # stop on an error status instead of trying to parse an error page as an article
    page.raise_for_status()
    soup = bs(page.content, 'html.parser')

    #article heading
    heading = soup.h1.getText()

    #article posting date and time: drop the label in front of the first ':'
    #([-1] keeps the whole text when no ':' is present instead of raising IndexError)
    time_stamp = soup.find('time').get_text().split(':', 1)[-1]
    #time_stamp modified in accordance with money control website time_stamp format
    time_stamp = _expand_month_abbreviations(time_stamp)

    #article image
    image_source = '_'                  #economics times articles had no images associated with them

    #article content and text
    article_content = soup.find('div', {'class':'artText'}).get_text()

    #list of extracted data
    return [heading, time_stamp, image_source, article_content, url]

In [214]:
def scraper(urls_df):
    '''
    scraper -> function scrapes the data from a given list of urls

    Arguments
        urls_df -> iterable of url strings of the webpages to be extracted

    Variables
        money_control -> regular expression object for the sequence moneycontrol
        economic_times -> regular expression object for the sequence economictimes

        jobs -> (scraping function, url) pairs, in input order
        output_list -> output list of extracted data, one entry per url

    Return
        returns the output_list

    Raises
        ValueError -> a url matches neither moneycontrol nor economictimes;
                      raised before any webpage is requested
    '''

    money_control = re.compile(r'moneycontrol')
    economic_times = re.compile(r'economictimes')

    # Pick a scraper for every url first, so an unsupported url is reported
    # before any network request is spent on the others.
    jobs = []
    for url in urls_df:

        #check if the given url is from money control
        if money_control.search(url):
            jobs.append((moneyScrapper, url))

        #check if the given url is from economic times
        elif economic_times.search(url):
            jobs.append((economicScrapper, url))

        #anything else has no matching scraper
        else:
            raise ValueError('unsupported url (expected moneycontrol or economictimes): %r' % (url,))

    output_list = [scrape(url) for scrape, url in jobs]

    return output_list


In [218]:
if __name__ == '__main__':

    # Load the url spreadsheet and keep its 'web_urls' column as a plain list
    urls_df = list(
        pd.read_csv('/home/karandeep/Downloads/web_scraping/urls.csv')['web_urls']
    )

    # Scrape every url; each entry of output_list holds the fields of one article
    output_list = scraper(urls_df)

    # Tabulate the scraped fields, one named column per field
    output_data = pd.DataFrame(
        output_list,
        columns=['heading', 'time_stamp', 'image_urls', 'article_content', 'source_url'],
    )

    # Write the table to scraped_data.csv (relative path)
    output_data.to_csv('scraped_data.csv')

[NOTE] Scraping data from Money Control........
[NOTE] Scraping data from Money Control........
[NOTE] Scraping data from Money Control........
[NOTE] Scraping data from Money Control........
[NOTE] Scraping data from Money Control........
[NOTE] Scrapping data from Economic times......
[NOTE] Scrapping data from Economic times......
[NOTE] Scrapping data from Economic times......
[NOTE] Scrapping data from Economic times......
[NOTE] Scrapping data from Economic times......
