In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
session = requests.Session()

session.headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/png,*/*;q=0.8",
    "Accept-Language": "en"}

In [25]:
def get_articles_html(term: str = "cattle", pages=1000):
    query_post_data: dict = {
        'query':
        f'{{"sector":"{term}","error":"","m":"","p":0,"post_parent":"","subpost":"","subpost_id":"","attachment":"","attachment_id":0,"name":"","pagename":"","page_id":0,"second":"","minute":"","hour":"","day":0,"monthnum":0,"year":0,"w":0,"category_name":"","tag":"","cat":"","tag_id":"","author":"","author_name":"","feed":"","tb":"","paged":0,"meta_key":"","meta_value":"","preview":"","s":"","sentence":"","title":"","fields":"","menu_order":"","embed":"","category__in":[],"category__not_in":[],"category__and":[],"post__in":[],"post__not_in":[],"post_name__in":[],"tag__in":[],"tag__not_in":[],"tag__and":[],"tag_slug__in":[],"tag_slug__and":[],"post_parent__in":[],"post_parent__not_in":[],"author__in":[],"author__not_in":[],"ignore_sticky_posts":false,"suppress_filters":false,"cache_results":true,"update_post_term_cache":true,"lazy_load_term_meta":true,"update_post_meta_cache":true,"post_type":"","posts_per_page":130,"nopaging":false,"comments_per_page":"50","no_found_rows":false,"taxonomy":"sector","term":"{term}","order":"DESC"}}',
        'action': 'loadmore',
        'page': '',}
    url: str = "https://www.ifa.ie/wp-admin/admin-ajax.php/"

    page_number: int = 0
    html: str = ""
    receiving_data: bool = True
    while receiving_data and page_number < pages:
        query_post_data['page'] = page_number
        text: str = session.post(url, query_post_data).text
        if not text: receiving_data = False
        else: html += text
        page_number += 1

    print(f"Downloaded {page_number} pages.")
    print(html)
    return html

In [17]:
def get_articles_links(html: str):
    beautiful_soup = BeautifulSoup(html, 'html.parser')
    links = beautiful_soup.find_all("a", {"class": ""}, href=True)
    print("Article Links found:", len(links))
    return links

In [18]:
def download_articles_from_links(links, term: str):
    pageList: list = []

    for link in links:
        url: str = link['href']
        response: str = session.get(url)
        beautiful_soup = BeautifulSoup(response.content, 'html.parser')
        heading = beautiful_soup.find('h1').text
        date = beautiful_soup.find('time').text
        html_content = beautiful_soup.find("div", {"class": "single-content"})
        pageList.append([url, heading, date, term, html_content.text, html_content])
    return pageList

In [19]:
def append_articles_to_csv(pageList, filename):
    
    columns = ["URL", "Heading", "Date", "HTML Content", "Text", "Trend"]
    
    df = pd.DataFrame(pageList, columns=columns)
    df.to_csv(filename, index=False, header=False, mode='a')
    print("Data saved {}".format(filename))

In [20]:
def create_articles_csv_file(filename: str):
    dataframe_columns = [
        "URL", "Heading", "Date", "Trend", "Text", "HTML Content"]
    dataframe = pd.DataFrame(columns=dataframe_columns)
    dataframe.to_csv(f'{filename}', index=False)
    print(f"Created assets/{filename}")

In [21]:
def download_articles(term: str, filename: str) -> None:
    html = get_articles_html(term)
    links = get_articles_links(html)
    page_list = download_articles_from_links(links, term)
    append_articles_to_csv(page_list, filename)

In [26]:
filename = "ifa-ie-articles.csv"
create_articles_csv_file(filename)
terms = ["sheep", "pig","cattle"]
for term in terms:
    download_articles(term, filename)

Created assets/ifa-ie-articles.csv
Downloaded 42 pages.

<div class="col-6 col-lg-3">
    <div class="card card-half">
        <a href="https://www.ifa.ie/markets-and-prices/beef-sheep-update-6th-may/">
            <img width="370" height="210" src="https://www.ifa.ie/wp-content/uploads/2019/04/Cattle1232-370x210.jpg" class="attachment-card size-card wp-post-image" alt="Cattle and Sheep in Wickow Field" loading="lazy" sizes="100vw" />
            <h4 class="post-title">Beef &#038; Sheep Update 6th May</h4>
        </a>
        <a href="https://www.ifa.ie/market-reports/beef-sheep-update/" class="main-term archive-link">Beef &amp; Sheep Update</a>    </div>
</div>
<div class="col-6 col-lg-3">
    <div class="card card-half">
        <a href="https://www.ifa.ie/farm-sectors/action-needed-as-sheep-sector-exposed-on-inputs-crisis/">
            <img width="370" height="210" src="https://www.ifa.ie/wp-content/uploads/2019/04/Sheep-0987-370x210.jpg" class="attachment-card size-card wp-post-i

KeyboardInterrupt: 

In [49]:
#url = 'https://www.farmersjournal.ie/myjournal/newsfeed'
#url= 'https://www.ifa.ie/?s=beef+or+cattle+prices'
#url = 'https://www.rte.ie/search/query/farmers'
url = 'https://www.google.com/search?q=irish+farmers+journal+cattle+prices&sxsrf=ALiCzsarCyNKwWDjhjLhSAnbtxGxB-Irhw%3A1652107936757&source=hp&ei=oCp5YujMLIKT8gKKiKGABA&iflsig=AJiK0e8AAAAAYnk4sM8Q4-TzGs0cKDGH3Bpx7UVVk3-t&oq=irish+farmers+beef+prices&gs_lcp=Cgdnd3Mtd2l6EAMYADIGCAAQFhAeOgQIIxAnOgsIABCABBCxAxCDAToRCC4QgAQQsQMQgwEQxwEQ0QM6CAgAELEDEIMBOhEILhCABBCxAxCDARDHARCjAjoFCAAQgAQ6CAgAEIAEELEDOg4ILhCABBCxAxDHARCvAToLCC4QgAQQxwEQrwFQAFjeHmChOWgAcAB4AIABggGIAd4NkgEEMjMuMpgBAKABAQ&sclient=gws-wiz'

In [50]:
r = requests.get(url)
beautiful_soup = BeautifulSoup(r.text, 'html.parser')
links = beautiful_soup.find_all("a", {"class": ""}, href=True)
print(r.text)

<!DOCTYPE html><html lang="en" dir="ltr"><head><style nonce="3pJs0HCSqdLCDa6+liatSw">
a, a:link, a:visited, a:active, a:hover {
  color: #1a73e8;
  text-decoration: none;
}
body {
  font-family: Roboto,RobotoDraft,Helvetica,Arial,sans-serif;
  text-align: center;
  -ms-text-size-adjust: 100%;
  -moz-text-size-adjust: 100%;
  -webkit-text-size-adjust: 100%;
}
.box {
  border: 1px solid #dadce0;
  box-sizing: border-box;
  border-radius: 8px;
  margin: 24px auto 5px auto;
  max-width: 520px;
  padding: 24px;
}
h1 {
  color: #2c2c2c;
  font-size: 24px;
  hyphens: auto;
  margin: 24px 0;
}
p, .sub, .contentText {
  color: #5f6368;;
  font-size: 14px;
  line-height: 20px;
  letter-spacing: 0.2px;
  text-align: left;
}
.signin {
  text-align: right;
}
.image {
  display: block;
  margin: 14px auto;
}
.basebutton {
  border-radius: 4px;
  cursor: pointer;
  font-family: Roboto,RobotoDraft,Helvetica,Arial,sans-serif;
  font-size: 14px;
  font-weight: 500;
  height: 36px;
  margin: 12px 4px 0;


In [51]:
links

[<a href="https://policies.google.com/technologies/cookies?hl=en&amp;utm_source=ucb" target="_blank">cookies</a>,
 <a href="https://policies.google.com/privacy?hl=en&amp;utm_source=ucb">Privacy Policy</a>,
 <a href="https://policies.google.com/terms?hl=en&amp;utm_source=ucb">Terms of Service</a>]