# Web scraping article texts

In this notebook, I fetch a list of article URLs from the lrt.lt search API and then iterate over it to download over 6,000 article texts. Each text is then stored as a separate row in a CSV file.

In [6]:
import math
import re
from time import sleep
from urllib.parse import urlencode
from urllib.request import urlopen, Request

import pandas as pd
from bs4 import BeautifulSoup
from numpy import random

# Browser-like User-Agent string; the urllib-based helpers pass `headers` to Request.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
# Header mapping handed to urllib.request.Request.
headers = {"User-Agent": user_agent}

## Functions for scraping

In [4]:
def get_soup_from_url(url):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` mapping.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``"html.parser"`` backend.
    """
    request = Request(url, None, headers)
    # Parse while the response is still open: the body is read from it, and the
    # context manager then closes the underlying connection.
    with urlopen(request) as response:
        return BeautifulSoup(response, "html.parser")

def get_links_from_soup(soup, key_phrase):
    """Return the unique ``href`` values of all ``<a>`` tags that contain ``key_phrase``.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).
        key_phrase: Substring an href must contain to be kept.

    Returns:
        List of matching hrefs without duplicates, in order of first appearance.
    """
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    # Anchors without an href attribute yield None and are skipped.
    matching = (href for href in hrefs if href is not None and key_phrase in href)
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is reproducible (a set would return the links in arbitrary order).
    return list(dict.fromkeys(matching))

def get_links_from_url(url, key_phrase):
    """Fetch the page at ``url`` and return its links that contain ``key_phrase``."""
    page_soup = get_soup_from_url(url)
    return get_links_from_soup(page_soup, key_phrase)

def get_links_from_url_and_sleep(url, key_phrase):
    """Collect the matching links from ``url``, then pause for a random 0-2 seconds.

    The pause happens after the page has been fetched and before returning.
    """
    found_links = get_links_from_url(url, key_phrase)
    delay_seconds = random.uniform(0, 2)
    sleep(delay_seconds)
    return found_links

def get_all_links(url_beginning, max_page = 150, key_phrase = "/news/"):
    """Gather matching links from pages ``1..max_page`` of a paginated listing.

    Each page URL is ``url_beginning`` followed by the page number. Links are
    returned in page order; duplicates across pages are kept.
    """
    return [
        link
        for page_nr in range(1, max_page + 1)
        for link in get_links_from_url_and_sleep(url_beginning + str(page_nr), key_phrase)
    ]

## Using LRT's API

In [13]:
import requests

def construct_url(page_nr, start_date, end_date, category_id):
    """Build the lrt.lt search-API URL for one page of results.

    Args:
        page_nr: Page number to request.
        start_date: Lower date bound (``dfrom``), e.g. ``"2020-01-01"``.
        end_date: Upper date bound (``dto``), e.g. ``"2024-04-20"``.
        category_id: Value sent as ``category_id``.

    Returns:
        The full request URL as a string, asking for 100 items per page in
        descending order.
    """
    # urlencode converts every value with str() and percent-escapes it, so
    # non-string arguments (ints, date objects) and special characters are safe.
    query = urlencode(
        {
            "page": page_nr,
            "count": 100,
            "dfrom": start_date,
            "dto": end_date,
            "order": "desc",
            "type": 1,
            "category_id": category_id,
        }
    )
    return "https://www.lrt.lt/api/search?" + query

def get_df_from_url(url, timeout=30):
    """Request ``url`` and return the ``"items"`` of its JSON payload as a DataFrame.

    Args:
        url: Search-API URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        ``pandas.DataFrame`` built from the ``"items"`` entry of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON payload has no ``"items"`` entry.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to decode an error page as JSON.
    response.raise_for_status()
    data = response.json()
    return pd.DataFrame(data["items"])

def get_df_from_page(page_nr, start_date, end_date, category_id):
    """Return the search results of a single page as a DataFrame."""
    page_url = construct_url(page_nr, start_date, end_date, category_id)
    return get_df_from_url(page_url)

def get_df_from_page_and_sleep(page_nr, start_date, end_date, category_id):
    """Fetch one results page as a DataFrame, then pause for a random 0-2 seconds.

    The pause happens after the page has been fetched and before returning.
    """
    page_df = get_df_from_page(page_nr, start_date, end_date, category_id)
    delay_seconds = random.uniform(0, 2)
    sleep(delay_seconds)
    return page_df

def get_all_results(start_date, end_date, category_id):
    """Download every page of search results for a date range and category.

    The first page is requested to learn ``total_found``; the remaining pages
    are then fetched one by one with a short random pause between requests.

    Args:
        start_date: Lower date bound passed to ``construct_url``.
        end_date: Upper date bound passed to ``construct_url``.
        category_id: Category passed to ``construct_url``.

    Returns:
        A single DataFrame with the items of all pages and a fresh 0..n-1 index.
        An empty DataFrame is returned when the search finds nothing.

    Raises:
        requests.HTTPError: If the first request returns an error status.
    """
    first_page_url = construct_url(page_nr=1, start_date=start_date, end_date=end_date, category_id=category_id)
    response = requests.get(first_page_url, timeout=30)
    response.raise_for_status()
    data = response.json()
    total_found = int(data["total_found"])
    # 100 matches the ``count=100`` page size requested by construct_url.
    max_page = math.ceil(total_found/100)

    print(f"Total entries available: {total_found}\nLargest page number: {max_page}")

    if max_page == 0:
        # pd.concat raises on an empty list, so handle "no results" explicitly.
        return pd.DataFrame()

    # Page 1 is already in hand; reuse it instead of downloading it a second time.
    dfs = [pd.DataFrame(data["items"])]

    if max_page > 1:
        # Same pause the per-page helper applies after each of its requests.
        sleep(random.uniform(0, 2))

    for page_nr in range(2, max_page+1):
        dfs.append(get_df_from_page_and_sleep(page_nr, start_date, end_date, category_id))

    result_df = pd.concat(dfs).reset_index(drop=True)

    return result_df

def get_article_text(url, separator=""):
    """Download an article page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the article page.
        separator: String inserted between text fragments, both inside a
            paragraph and between paragraphs. The default ``""`` keeps the
            previous output, in which stripped fragments are joined with
            nothing between them; pass ``" "`` to keep them apart.

    Returns:
        One string containing the text of every paragraph on the page.
    """
    # Reuse the shared fetch-and-parse helper instead of repeating its body here.
    soup = get_soup_from_url(url)
    paragraphs = (elem.get_text(separator, strip=True) for elem in soup.find_all("p"))
    return separator.join(paragraphs)

### Getting the article urls

In [17]:
# Search window and category sent to the search API.
start_date = "2020-01-01"
end_date = "2024-04-20"
category_id = 19

# Fetch every search hit for that window.
result_df = get_all_results(
    start_date=start_date,
    end_date=end_date,
    category_id=category_id,
)
# Prefix each entry of the "url" column with the site host to get a full address.
result_df = result_df.assign(full_url="https://www.lrt.lt" + result_df["url"])

#result_df.to_csv("C:/Users/Ugne/Documents/studies/Python/DL-task1/deep-learning-task-2/lrt_articles.csv", index=False)

Total entries available: 11660
Largest page number: 117


### Getting the text for every article

In [None]:
# Download the text of every article listed in result_df["full_url"].
texts = []
failed_urls = []  # URLs whose download failed, kept so they can be retried later
for url in result_df["full_url"]:
    try:
        texts.append(get_article_text(url))
    except OSError as error:
        # urllib's URLError/HTTPError and socket timeouts derive from OSError;
        # report the failure and carry on so one bad page does not end the run.
        print(f"Failed to fetch {url}: {error}")
        failed_urls.append(url)
    # Same random 0-2 s pause the listing helpers use between requests.
    sleep(random.uniform(0, 2))

In [32]:
# Number of article texts currently held in `texts`.
len(texts)

6160

In [41]:
# Drop duplicate article texts while keeping their first-seen order. Going
# through set() would give a row order that is not reproducible between runs.
text_df = pd.DataFrame(
    {"text": list(dict.fromkeys(texts))}
)

In [42]:
# Write one article text per CSV row, without a header row or index column.
# NOTE: the destination is an absolute, machine-specific path.
text_df.to_csv(
    "C:/Users/Ugne/Documents/studies/Python/DL-task1/deep-learning-task-2/lrt_article_texts.csv",
    index=False,
    header=False,
    encoding="utf-8",
    errors="replace",
)