In [1]:
import pandas as pd
import requests
from datetime import date, datetime, timedelta
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
"""Шаг 1: получаем историческую БД"""
start_date = '2017-01-01'
end_date = '2017-01-01'
current_date = '2023-09-01'

with open('api_key.txt', 'r') as api_key_file:
    api_key = api_key_file.read().strip()


with open('tags.txt', 'r') as file:
    tags = [line.strip() for line in file]

result_data = []
while end_date <= current_date:
    # Объединяем теги в одну строку через '|'. В документации означет OR
    tags_query = '|'.join(tags)
    
    params = {
        'api-key': api_key,
        'q': tags_query,
        'from-date': start_date,
        'to-date': end_date,
        'page-size': 100,
        'show-fields': ['body', 'bodyText']
    }

    url = 'https://content.guardianapis.com/search'

    response = requests.get(url, params=params)

    data = response.json()
    for article in data['response']['results']:
        # Определяем список тегов, которые были найдены в тексте статьи или в заголовке
        found_tags = [tag for tag in tags if (tag.lower() in article['webTitle'].lower()) or (tag.lower() in article['fields']['bodyText'].lower())]
        if found_tags:
            result_data.append({
                'type': article['type'],
                'sectionId': article['sectionId'],
                'sectionName': article['sectionName'],
                'webPublicationDate': article['webPublicationDate'],
                'webTitle': article['webTitle'],
                'text': article['fields']['bodyText'],
                'webUrl': article['webUrl'],
                'apiUrl': article['apiUrl'],
                'searchedTags': found_tags  
            })

    start_date = (date.fromisoformat(start_date) + timedelta(days=1)).isoformat()
    end_date = (date.fromisoformat(end_date) + timedelta(days=1)).isoformat()

df = pd.DataFrame(result_data)

df.to_csv('guardian_archive.csv', index=False)

df


In [7]:
"""Шаг 2. Импортируем csv файл в бд"""
import sqlite3
from sqlite3 import Error

def create_connection(path):
    connection = None
    try:
        connection = sqlite3.connect(path)
        print("Connection to SQLite DB successful")
    except Error as e:
        print(f"The error '{e}' occurred")

    return connection


In [8]:
conn = create_connection("news_database.db")

df = pd.read_csv('guardian_archive.csv')

df.to_sql(name="news_guardian", con=conn, index=False, if_exists="replace")

conn.close()


Connection to SQLite DB successful


In [12]:
"""Шаг 3. Обновляем историческую базу. Изначально я выгрузил все новости в период с 2017-01-01 по 2023-09-01. 
Затем обновил бд с 2023-09-02 по 2023-09-22. Затем попробовал обновить бд за сегодняший день. Отрабатывает корректно
"""

import requests
from datetime import datetime, timedelta, date
import sqlite3
import logging

# Настройка логирования
logging.basicConfig(filename='update_database.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def update_database(start_date, end_date):
    success = False  # Флаг успешного обновления, изначально считаем, что обновление неуспешно
    try:
        # Чтение API-ключа из файла
        with open('api_key.txt', 'r') as api_key_file:
            api_key = api_key_file.read().strip()

        # Чтение ключевых слов из файла
        with open('tags.txt', 'r') as file:
            tags = [line.strip() for line in file]

        result_data = []

        while start_date <= end_date:
            # Объединяем теги в одну строку через '|'
            tags_query = '|'.join(tags)

            params = {
                'api-key': api_key,
                'q': tags_query,
                'from-date': start_date,
                'to-date': end_date,
                'page-size': 100,
                'show-fields': ['body', 'bodyText']
            }

            url = 'https://content.guardianapis.com/search'

            response = requests.get(url, params=params)

            if response.status_code == 200:
                data = response.json()
                for article in data['response']['results']:
                    # Определяем список тегов, которые были найдены в тексте статьи или в заголовке
                    found_tags = [tag for tag in tags if (tag.lower() in article['webTitle'].lower()) or (tag.lower() in article['fields']['bodyText'].lower())]
                    if found_tags:
                        result_data.append({
                            'type': article['type'],
                            'sectionId': article['sectionId'],
                            'sectionName': article['sectionName'],
                            'webPublicationDate': article['webPublicationDate'],
                            'webTitle': article['webTitle'],
                            'text': article['fields']['bodyText'],
                            'webUrl': article['webUrl'],
                            'apiUrl': article['apiUrl'],
                            'searchedTags': '/'.join(found_tags)  # Используем только найденные теги
                        })

                start_date = (date.fromisoformat(start_date) + timedelta(days=1)).isoformat()
            else:
                error_message = f"Ошибка при запросе к API: {response.status_code}"
                logging.error(error_message)
                print(error_message)
                raise Exception(error_message)  # Вызываем исключение для прерывания выполнения и записи в лог

        conn = sqlite3.connect("news_database.db")
        cursor = conn.cursor()

        # Вставляем данные в таблицу
        for new_data in result_data:
            cursor.execute("INSERT INTO news_guardian (type, sectionId, sectionName, webPublicationDate, webTitle, text, webUrl, apiUrl, searchedTags) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                        (new_data['type'], new_data['sectionId'], new_data['sectionName'],
                        new_data['webPublicationDate'], new_data['webTitle'], 
                        new_data['text'], new_data['webUrl'], new_data['apiUrl'], new_data['searchedTags']))

        # Сохраняем изменения и закрываем соединение
        conn.commit()
        conn.close()
        success = True  # Обновление успешно завершено
        logging.info("Обновление базы данных завершено успешно!")

    except Exception as e:
        error_message = f"Произошла ошибка: {str(e)}"
        logging.error(error_message)
        print(error_message)

    if not success:
        logging.warning("Обновление базы данных завершилось с ошибкой!")

if __name__ == "__main__":
    # Устанавливаем даты для обновления
    #start_date = '2023-09-02'
    start_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    update_database(start_date, end_date)


In [3]:
"""Проверка. Я сделал update базы 2023-09-25 за прошлый день. Как мы видим данные за 2023-09-24 содержаться в таблице
Итого наша база имеет 36476 статей, касающихся экологии и зеленой энергетики по заданным нами тегам."""
import sqlite3
import pandas as pd

# Устанавливаем соединение с базой данных
conn = sqlite3.connect("news_database.db")

# Загружаем все данные из таблицы в DataFrame
df = pd.read_sql_query("SELECT * FROM news_guardian", conn)

# Закрываем соединение с базой данных
conn.close()
df


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,type,sectionId,sectionName,webPublicationDate,webTitle,text,webUrl,apiUrl,searchedTags
0,0.0,0.0,article,sustainable-business,Guardian Sustainable Business,2017-01-01T23:00:08Z,The five innovations that shaped sustainabilit...,It’s been a rollercoaster of a year. In the wo...,https://www.theguardian.com/sustainable-busine...,https://content.guardianapis.com/sustainable-b...,Climate Change
1,1.0,1.0,article,commentisfree,Opinion,2017-01-01T17:00:01Z,The BBC’s Planet Earth II did not help the nat...,It has been wonderful watching Planet Earth II...,https://www.theguardian.com/commentisfree/2017...,https://content.guardianapis.com/commentisfree...,Climate Change
2,2.0,2.0,article,commentisfree,Opinion,2017-01-01T00:04:40Z,The Observer view on the prospects for 2017 | ...,It is the conceit of every generation to belie...,https://www.theguardian.com/commentisfree/2017...,https://content.guardianapis.com/commentisfree...,Climate Change
3,3.0,3.0,article,us-news,US news,2017-01-01T21:56:36Z,Obama treads on Trump's Twitter turf to reflec...,"Barack Obama on Sunday used Twitter, a medium ...",https://www.theguardian.com/us-news/2017/jan/0...,https://content.guardianapis.com/us-news/2017/...,Climate Change/Clean Energy
4,4.0,4.0,article,uk-news,UK news,2017-01-01T15:59:04Z,London must stand together in 2017 if its gold...,"For three decades, with barely a blip, the UK ...",https://www.theguardian.com/uk-news/davehillbl...,https://content.guardianapis.com/uk-news/daveh...,Air Quality
...,...,...,...,...,...,...,...,...,...,...,...
36471,,,article,commentisfree,Opinion,2023-09-24T07:02:16Z,"Before we get to the election, Lib Dems need t...","If I remember correctly, the Lib Dem logo is s...",https://www.theguardian.com/commentisfree/2023...,https://content.guardianapis.com/commentisfree...,Renewable Energy
36472,,,article,environment,Environment,2023-09-24T17:36:22Z,Jean Combes obituary,"Every year from the age of 20 my mother, Jean ...",https://www.theguardian.com/environment/2023/s...,https://content.guardianapis.com/environment/2...,Climate Change
36473,,,article,world,World news,2023-09-24T09:00:19Z,‘Capitalism is dead. Now we have something muc...,What could be more delightful than a trip to G...,https://www.theguardian.com/world/2023/sep/24/...,https://content.guardianapis.com/world/2023/se...,Extreme Weather
36474,,,article,world,World news,2023-09-24T09:00:20Z,"Costs, delays and labour shortages threaten th...","In the distance, beyond the clouds of sand kic...",https://www.theguardian.com/world/2023/sep/24/...,https://content.guardianapis.com/world/2023/se...,Climate Change/Sustainable Development
