In [59]:
import json
import os
import requests
import time
from bs4 import BeautifulSoup
from tqdm import tqdm
import sys

In [60]:
def collect_urls_and_summaries(CONSTANTS):
    """Collect article URLs and listing metadata from the site's index pages.

    Requests ``https://scienceblog.com/page/{n}/`` for every page from the
    first one not yet recorded in the progress file up to (but excluding)
    ``CONSTANTS['end_page']``, and stops early at the first page that has no
    ``div.inside-article`` blocks. For each block the category, title, author
    line and article URL are appended to four parallel lists, which are
    checkpointed as JSON to ``CONSTANTS['SAVE_PROGRESS']``.

    Args:
        CONSTANTS: dict providing 'SAVE_PROGRESS' (checkpoint path),
            'SAVE_EVERY' (checkpoint interval in pages), 'end_page'
            (exclusive upper page bound), 'session' (object with a
            ``get(url, headers=...)`` method) and 'headers' (request headers).

    Returns:
        True, both when the checkpoint already covers every page and after a
        scraping run.
    """
    progress_path = CONSTANTS['SAVE_PROGRESS']
    end_page = CONSTANTS['end_page']

    if os.path.isfile(progress_path):
        # Resume from the checkpoint left by a previous run.
        with open(progress_path) as f:
            saved_data = json.load(f)
        if saved_data['page'] >= end_page - 1:
            # Every page below end_page is already recorded: nothing to do.
            return True
        start_page = saved_data['page'] + 1
        urls = saved_data['urls']
        category = saved_data['category']
        headers_list = saved_data['headers']
        author_list = saved_data['authors']
    else:
        # No checkpoint: start from the first listing page.
        start_page = 1
        urls = []
        category = []
        headers_list = []
        author_list = []

    save_every = CONSTANTS['SAVE_EVERY']

    def save_progress(page):
        """Write the checkpoint for everything scraped up to ``page``."""
        data = {
            'page': page,
            'index': len(urls),
            'urls': urls,
            'category': category,
            'headers': headers_list,
            'authors': author_list,
        }
        # Write to a sibling file and swap it in, so an interrupt during the
        # dump cannot leave a truncated checkpoint behind.
        tmp_path = progress_path + '.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(data, f)
        os.replace(tmp_path, progress_path)

    last_scraped_page = start_page - 1
    last_saved_page = None

    for page in tqdm(range(start_page, end_page)):
        page_url = f'https://scienceblog.com/page/{page}/'
        retry = True
        while retry:
            try:
                resp = CONSTANTS['session'].get(page_url, headers=CONSTANTS['headers'])
                retry = False
            except requests.exceptions.RequestException:
                # Wait for 5 seconds and try again
                time.sleep(5)
        soup = BeautifulSoup(resp.content, 'html.parser')
        blocks = soup.find_all('div', class_='inside-article')
        if not blocks:
            # A page without article blocks is treated as the end of the listing.
            break
        for block in blocks:
            category.append(block.find('footer', class_="entry-meta").a.get_text(strip=True))
            headers_list.append(block.find('h2', class_="entry-title").get_text(strip=True))
            author_list.append(block.find('div', class_="entry-meta").get_text(strip=True))
            urls.append(block.find('header', class_="entry-header").a.get('href'))
        last_scraped_page = page
        time.sleep(1)

        # Save progress every SAVE_EVERY pages, and on each of the final pages.
        if page % save_every == 0 or page >= end_page - save_every:
            save_progress(page)
            last_saved_page = page

    # The loop can end (empty page, or nothing left to fetch) with pages that
    # were scraped after the last checkpoint, or with no checkpoint file at
    # all. Persist them so no work is lost and the file always exists.
    if last_saved_page != last_scraped_page:
        save_progress(last_scraped_page)

    return True

In [61]:
def collect_text(CONSTANTS, article_urls, category, headers_list, author_list, start_index, articles):
    """Download each article page and append its text and metadata to ``articles``.

    Processes ``article_urls[start_index:]`` in order. For every URL the page
    is fetched (retrying every 5 seconds on ``requests`` exceptions), parsed,
    and turned into a dict with the keys 'url', 'category', 'header', 'author',
    'article_text' and 'date'. 'article_text' and 'date' are None when the
    page has no ``div.entry-content`` / ``time.entry-date.published`` element.

    Args:
        CONSTANTS: dict providing 'SAVE_FILE' (output path), 'SAVE_EVERY'
            (checkpoint interval), 'session' (object with a
            ``get(url, headers=...)`` method) and 'headers' (request headers).
        article_urls: list of article URLs.
        category, headers_list, author_list: lists indexed like ``article_urls``.
        start_index: position in ``article_urls`` at which to start.
        articles: list of already collected article dicts; extended in place.

    Returns:
        The string 'Done'.

    The checkpoint written to 'SAVE_FILE' is ``{'index': ..., 'articles': ...}``
    where 'index' is the position of the most recently scraped article.
    """
    save_path = CONSTANTS['SAVE_FILE']
    save_every = CONSTANTS['SAVE_EVERY']
    # Slice once: re-slicing inside the loop copies the list on every iteration.
    pending_urls = article_urls[start_index:]
    total = len(pending_urls)

    for i, url in tqdm(enumerate(pending_urls), total=total):
        index = start_index + i
        retry = True

        while retry:
            try:
                resp = CONSTANTS['session'].get(url, headers=CONSTANTS['headers'])
                retry = False
            except requests.exceptions.RequestException:
                # Wait for 5 seconds and try again
                time.sleep(5)
        soup = BeautifulSoup(resp.content, 'lxml')

        # Either element may be absent. Record None instead of raising, so a
        # single unusual page cannot block every later resume at this index.
        body = soup.find('div', class_='entry-content')
        published = soup.find('time', class_='entry-date published')

        article = {
            "url": url,
            "category": category[index],
            "header": headers_list[index],
            "author": author_list[index],
            "article_text": body.get_text(strip=True) if body is not None else None,
            "date": published.get_text(strip=True) if published is not None else None,
        }

        articles.append(article)
        time.sleep(1)

        # Save progress every SAVE_EVERY steps, and on each of the final steps.
        if i % save_every == 0 or i >= total - save_every:
            data = {
                'index': index,
                'articles': articles
            }
            # Write to a sibling file and swap it in, so an interrupt during
            # the dump cannot leave a truncated output file behind.
            tmp_path = save_path + '.tmp'
            with open(tmp_path, 'w') as f:
                json.dump(data, f, indent=4)
            os.replace(tmp_path, save_path)

    return 'Done'

In [62]:
def collect_the_articles(CONSTANTS):
    """Run the full scrape: listing pages first, then every article page.

    Calls ``collect_urls_and_summaries`` to create or complete the progress
    file, loads the URL/category/header/author lists from it, and hands them
    to ``collect_text``. If ``CONSTANTS['SAVE_FILE']`` already exists, scraping
    resumes after the last article recorded there.

    Args:
        CONSTANTS: configuration dict; this function reads 'SAVE_PROGRESS' and
            'SAVE_FILE' and passes the dict on to both helpers.

    Returns:
        'Already finished' when the save file already covers every URL,
        otherwise whatever ``collect_text`` returns.
    """
    collect_urls_and_summaries(CONSTANTS)  # Collect urls and summaries

    with open(CONSTANTS['SAVE_PROGRESS']) as f:  # Read urls and summaries
        saved_progress = json.load(f)
    urls = saved_progress['urls']
    category = saved_progress['category']
    headers_list = saved_progress['headers']
    author_list = saved_progress['authors']

    if os.path.isfile(CONSTANTS['SAVE_FILE']):  # If save file exists, read progress from file
        with open(CONSTANTS['SAVE_FILE']) as f:
            saved_data = json.load(f)
        # The saved 'index' is the position of the last article that was
        # already scraped and stored, so resume at the following one.
        # Restarting at 'index' itself would scrape and append it twice.
        start_index = saved_data['index'] + 1
        if start_index >= len(urls):  # If we have already finished, return
            return 'Already finished'
        articles = saved_data['articles']
    else:  # Otherwise, start from the beginning
        start_index = 0
        articles = []

    return collect_text(CONSTANTS, urls, category, headers_list,
                        author_list, start_index, articles)



In [63]:
# Scraper configuration, passed as one dict to every collection function.
CONSTANTS = dict(
    # Checkpoint interval used by both collection loops.
    SAVE_EVERY=10,
    # Checkpoint of the listing pass (urls, categories, headers, authors).
    SAVE_PROGRESS='progress.json',
    # Output of the article pass (scraped article dicts).
    SAVE_FILE='articles.json',
    # Request headers sent with every HTTP request.
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"},
    # One session object reused for all requests.
    session=requests.Session(),
    # Exclusive upper bound of the listing pages to visit.
    end_page=500,
)

In [64]:
# Run the whole pipeline: the listing pass first, then the per-article pass.
# Both passes read and write their checkpoint files, so re-running this cell
# after an interruption continues from the saved state.
collect_the_articles(CONSTANTS)

  3%|▎         | 11/429 [00:18<11:55,  1.71s/it]


KeyboardInterrupt: 

In [None]:
# # Define constants
# SAVE_EVERY = 10
# SAVE_PROGRESS = 'progress.json'
# SAVE_FILE = 'articles.json'
# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
# session = requests.Session()
# end_page = 20


# # Step 1: Collect article urls and summaries
# if os.path.isfile(SAVE_PROGRESS):
#     # If save file exists, read progress from file
#     with open(SAVE_PROGRESS) as f:
#         saved_progress = json.load(f)
#         if saved_progress['page'] >= end_page:
#             print('Already finished!')
#             exit()
#         start_page = saved_progress['page']
#         start_index = saved_progress['index']
#         urls = saved_progress['urls']
#         category = saved_progress['category']
#         headers_list = saved_progress['headers']
#         author_list = saved_progress['authors']
# else:
#     # Otherwise, start from the beginning
#     start_page = 1
#     start_index = 0
#     urls = []
#     category = []
#     headers_list = []
#     author_list = []

# for page in tqdm(range(start_page, end_page)):
#     page_url = f'https://scienceblog.com/page/{page}/'
#     retry = True
#     while retry:
#         try:
#             resp = session.get(page_url, headers=headers)
#             retry = False
#         except requests.exceptions.RequestException:
#             # Wait for 5 seconds and try again
#             time.sleep(5)
#     soup = BeautifulSoup(resp.content, 'html.parser')
#     if not soup.find_all('div', class_='inside-article'):
#         break
#     blocks = soup.find_all('div', class_='inside-article')
#     for block in blocks:
#         category.append(block.find('footer', class_="entry-meta").a.get_text(strip=True))
#         headers_list.append(block.find('h2', class_="entry-title").get_text(strip=True))
#         author_list.append(block.find('div', class_="entry-meta").get_text(strip=True))
#         urls.append(block.find('header', class_="entry-header").a.get('href'))
#     time.sleep(1)
    
#     # Save progress every SAVE_EVERY steps
#     if page % SAVE_EVERY == 0 or page >= end_page - SAVE_EVERY:
#         with open(SAVE_PROGRESS, 'w') as f:
#             data = {
#                 'page': page,
#                 'index': len(urls),
#                 'urls': urls,
#                 'category': category,
#                 'headers': headers_list,
#                 'authors': author_list,
#             }
#             json.dump(data, f)

# # Step 2: Loop over article urls and scrape information
# if os.path.isfile(SAVE_FILE):
#     # If save file exists, read progress from file
#     with open(SAVE_FILE) as f:
#         saved_progress = json.load(f)
#         start_index = saved_progress['index']
#         articles = saved_progress['articles']
# else:
#     start_index = 0
#     articles = []
    

# for i, url in tqdm(enumerate(urls[start_index:]), total=len(urls[start_index:])):
#     index = start_index + i
#     retry = True
#     while retry:
#         try:
#             resp = session.get(url, headers=headers)
#             retry = False
#         except requests.exceptions.RequestException:
#             # Wait for 5 seconds and try again
#             time.sleep(5)
#     soup = BeautifulSoup(resp.content, 'lxml')
    
#     article = {
#         "url": url,
#         "category": category[index],
#         "header": headers_list[index],
#         "author": author_list[index],
        
#         "article_text": soup.find('div', class_='entry-content').get_text(strip=True),
#     }
    
#     try:
#         article["date"] = soup.find('time', class_='entry-date published').get_text(strip=True)
#     except:
#         article["date"] = None
    
    
    
#     articles.append(article)
#     time.sleep(1)
    
#     # Save progress every SAVE_EVERY steps
#     if i % SAVE_EVERY == 0 or i >= len(urls[start_index:]) - SAVE_EVERY:
#         with open(SAVE_FILE, 'w') as f:
#             data = {
#                 'index': index,
#                 'articles': articles
#             }
#             json.dump(data, f, indent=4)

100%|██████████| 1/1 [00:01<00:00,  1.38s/it]
  2%|▏         | 6/276 [00:07<05:58,  1.33s/it]


KeyboardInterrupt: 

In [65]:
import pandas as pd

# Raise the display limits (column width and row count) for inspecting the data.
pd.set_option('display.max_colwidth', 200, 'display.max_rows', 200)

# Load the listing checkpoint into a DataFrame.
df = pd.read_json('progress.json')


In [66]:
# Count the rows whose URL repeats one that appeared in an earlier row.
df['urls'].duplicated().sum()

45

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Question/answer generation model; the same checkpoint name is used for the
# tokenizer and the model.
QA_MODEL_NAME = "potsawee/t5-large-generation-race-QuestionAnswer"
tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_NAME)

# `df` was loaded from progress.json, which only holds the listing data
# (urls/category/headers/authors) and has no 'article_text' column. The
# article bodies are stored under 'articles' in the file written by
# collect_text, so load them from there.
with open(CONSTANTS['SAVE_FILE']) as f:
    articles_df = pd.DataFrame(json.load(f)['articles'])

# Use the first 100 characters of the second article as the context.
context = articles_df['article_text'][1][0:100]
print(context)
inputs = tokenizer(context, return_tensors="pt")
outputs = model.generate(**inputs, max_length=100)
question_answer = tokenizer.decode(outputs[0], skip_special_tokens=False)
question_answer = question_answer.replace(tokenizer.pad_token, "").replace(tokenizer.eos_token, "")
# partition() never raises: if the separator is missing the answer is empty,
# and if it occurs more than once everything after the first one is the answer.
question, _sep, answer = question_answer.partition(tokenizer.sep_token)

print("question:", question)
print("answer:", answer)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ImportError: 
AutoModelForSeq2SeqLM requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
