In [1]:
import os
import re
import glob
import fitz
import pdfkit
import requests
import hashlib
import multiprocessing
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from deep_translator import GoogleTranslator
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
def generate_hash(url):
    """
    Return the hexadecimal MD5 digest of a URL.

    The URL string is encoded with the default codec (UTF-8) before
    hashing, so equal URLs always map to the same 32-character token,
    which is used to build unique file names.
    """
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()

In [3]:
# Page that is requested and scanned for 'Les saken' article links.
url = "https://www.ntnu.no/cerg/media"

os.makedirs('../../data/articles', exist_ok=True)

# Location of the wkhtmltopdf binary handed to pdfkit. The WKHTMLTOPDF_PATH
# environment variable overrides the default Windows install location so the
# notebook can run on other machines without editing this cell.
path_wkhtmltopdf = os.environ.get(
    'WKHTMLTOPDF_PATH',
    'C:/Program Files/wkhtmltopdf/bin/wkhtmltopdf.exe',
)
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

In [4]:
# Collect the target of every 'Les saken' link on the listing page.
articles = []

# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

def is_readable_article(tag):
    """Return True for <a> tags whose stripped text is exactly 'Les saken'."""
    return tag.name == 'a' and tag.get_text(strip=True) == 'Les saken'

a_tags = soup.find_all(is_readable_article)

for a_tag in a_tags:
    # An anchor without an href has no article URL; skip it instead of
    # failing the whole cell with a KeyError.
    link = a_tag.get('href')
    if not link:
        continue

    articles.append({'link': link})

print(f"A total of {len(articles)} articles found.")

A total of 192 articles found.


### Filter out bad domains

In [5]:
paywalled_domains = ["dagbladet.no", "bodymag.no", "aftenposten.no", "adressa.no", "vg.no", "tronderbladet.no", "kk.no", "reuters.com", "msn.com", "vi.no", "tv2.no", "fosna-folket.no", "wsj.com", "https://www.steinkjer-avisa.no/", "https://www.innherred.no/", "https://www.diabetesqld.org.au/"] # paywall


def _normalize_host(entry):
    """Reduce a bare domain or a full URL to a lowercase host without 'www.'."""
    # urlparse only recognises the host when a '//' authority marker is present.
    parsed = urlparse(entry if '//' in entry else f'//{entry}')
    host = parsed.hostname or ''
    return host[4:] if host.startswith('www.') else host


# The list mixes bare domains and full URLs; bring both to the same form.
_paywalled_hosts = {_normalize_host(entry) for entry in paywalled_domains}


def _is_paywalled(link):
    """Return True when the link's host is a listed domain or a subdomain of one."""
    # Compare hosts rather than searching the whole URL: a substring test
    # would also hit unrelated hosts or paths that merely contain e.g. "vg.no".
    host = _normalize_host(link)
    return any(host == domain or host.endswith('.' + domain) for domain in _paywalled_hosts)


articles = [article for article in articles if not _is_paywalled(article['link'])]

print(f"{len(articles)} articles left after removing paid domains.")
articles[0]

167 articles left after removing paid domains.


{'link': 'https://www.ringsaker-blad.no/steins-ukentlige-trim-skal-gi-legene-svar/s/5-79-715948'}

In [6]:
import os
from urllib.parse import urlparse
from trafilatura import fetch_url, extract

# Download every article in `articles` (a list of dicts with a "link" key)
# and store the extracted text as <domain>_<md5 of url>.txt.
media_text_dir = '../../data/media_text'

# Create the output directory once instead of on every loop iteration.
os.makedirs(media_text_dir, exist_ok=True)

failed_links = []

for article in articles:
    # Use a dedicated name so the listing-page `url` defined earlier in the
    # notebook is not overwritten by the last article link.
    article_url = article["link"]

    # Norwegian (.no) sites are extracted with Norwegian as target language
    domain = urlparse(article_url).netloc
    target_language = "no" if domain.endswith(".no") else None

    # Fetch the page; a failed download yields None and is skipped
    downloaded = fetch_url(article_url)
    if downloaded is None:
        failed_links.append(article_url)
        continue

    result = extract(downloaded, article_url, target_language=target_language)
    if not result:
        # Nothing usable was extracted, so no file is written for this link.
        failed_links.append(article_url)
        continue

    # Generate the filename
    hash_part = generate_hash(article_url)
    filename = f'{media_text_dir}/{domain}_{hash_part}.txt'

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(result)

print(f"Saved {len(articles) - len(failed_links)} of {len(articles)} articles; {len(failed_links)} could not be downloaded or extracted.")

### Translation

In [7]:
import os
import glob
from deep_translator import GoogleTranslator

class FileTranslator:
    """Translate the ``*.txt`` files of one directory into another directory.

    Files whose name identifies a ``.no`` domain are translated with
    ``GoogleTranslator`` from ``source_language`` to ``target_language``;
    all other files are copied to the output directory unchanged.
    """

    # Maximum number of characters sent per translation request.
    # NOTE(review): presumably chosen to stay below the translator's
    # 5000-character request limit - confirm against deep_translator.
    MAX_CHUNK_CHARS = 4900

    def __init__(self, input_directory, output_directory, source_language='no', target_language='en'):
        """Store the configuration and create ``output_directory`` if needed."""
        self.input_directory = input_directory
        self.output_directory = output_directory
        self.source_language = source_language
        self.target_language = target_language

        os.makedirs(output_directory, exist_ok=True)

    @staticmethod
    def _split_text(text, max_chars):
        """Split ``text`` into pieces of at most ``max_chars`` characters.

        A piece ends right after the last newline inside its window, or
        after the last space/tab when the window has no newline. Only a
        window without any such whitespace is cut at a fixed offset.
        Concatenating the pieces gives back ``text`` exactly.

        Raises:
            ValueError: if ``max_chars`` is smaller than 1.
        """
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")

        pieces = []
        start = 0
        total = len(text)
        while start < total:
            end = min(start + max_chars, total)
            if end < total:
                window = text[start:end]
                cut = window.rfind('\n')
                if cut == -1:
                    cut = max(window.rfind(' '), window.rfind('\t'))
                if cut != -1:
                    # Keep the whitespace character at the end of this piece.
                    end = start + cut + 1
            pieces.append(text[start:end])
            start = end
        return pieces

    @staticmethod
    def _is_norwegian_source(file_name):
        """Return True when the domain part of ``file_name`` ends in ``.no``.

        The name is read as ``<domain>_<hash>.txt``; a name without an
        underscore is treated as consisting of the domain only.
        """
        stem = file_name[:-4] if file_name.lower().endswith('.txt') else file_name
        domain = stem.rsplit('_', 1)[0]
        return domain.lower().endswith('.no')

    def translate_text(self, text):
        """Translate ``text`` piece by piece and return the joined result.

        Whitespace surrounding each piece is re-attached around its
        translation, so joining the pieces does not glue words together or
        insert spaces inside a word. Blank input is returned unchanged
        without contacting the translator.
        """
        if not text.strip():
            return text

        # One translator instance is enough for all pieces.
        translator = GoogleTranslator(source=self.source_language, target=self.target_language)
        translated_pieces = []

        for piece in self._split_text(text, self.MAX_CHUNK_CHARS):
            core = piece.strip()
            if not core:
                # Whitespace-only piece: nothing to translate, keep the layout.
                translated_pieces.append(piece)
                continue

            leading = piece[:len(piece) - len(piece.lstrip())]
            trailing = piece[len(piece.rstrip()):]

            translated = translator.translate(core)
            if translated is None:
                # Defensive: keep the source text rather than failing on join
                # should the translator hand back no result for this piece.
                translated = core

            translated_pieces.append(leading + translated + trailing)

        return ''.join(translated_pieces)

    def translate_file(self, input_file_path):
        """Write the (possibly translated) content of one file to the output directory.

        The output file keeps the base name of ``input_file_path``.
        """
        output_file_name = os.path.basename(input_file_path)
        output_file_path = os.path.join(self.output_directory, output_file_name)

        with open(input_file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Decide on the domain part of the name, not on ".no" appearing
        # anywhere in it (which would also match e.g. "www.nordicfit.com").
        if self._is_norwegian_source(output_file_name):
            translated_text = self.translate_text(text)
        else:
            translated_text = text  # No translation, just use the original text

        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write(translated_text)

    def process_files(self):
        """Run ``translate_file`` on every ``*.txt`` file in the input directory."""
        # Sorted so that the processing order is the same on every run.
        input_files = sorted(glob.glob(os.path.join(self.input_directory, '*.txt')))

        for input_file in input_files:
            self.translate_file(input_file)


In [8]:
# Run the translator over the extracted article texts, using its default
# language pair (source 'no', target 'en').
input_directory = '../../data/media_text/'
output_directory = '../../data/media_texts_translated/'
file_translator = FileTranslator(
    input_directory=input_directory,
    output_directory=output_directory,
)
file_translator.process_files()