In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import unittest
import responses

# URL of the website containing the data
url = 'https://index.minfin.com.ua/ua/russian-invading/casualties/'

# Upper bound, in seconds, on how long the request may wait for the server.
# requests applies no timeout unless one is given, so without this an
# unresponsive server would block the kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Fetch the page; raise_for_status() turns an HTTP 4xx/5xx reply into an
# exception instead of letting an error page be parsed as data.
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Lookup table: Ukrainian category label -> English column name.
# Labels that are not listed here are ignored by the parser.
category_map = dict([
    ("Танки", "tanks"),
    ("ББМ", "bbm"),
    ("Артилерійські системи", "artillery_systems"),
    ("РСЗВ", "rszv"),
    ("Засоби ППО", "air_defense"),
    ("Літаки", "aircraft"),
    ("Гелікоптери", "helicopters"),
    ("БПЛА", "uavs"),
    ("Крилаті ракети", "cruise_missiles"),
    ("Кораблі (катери)", "ships"),
    ("Підводні човни", "submarines"),
    ("Автомобілі та автоцистерни", "vehicles"),
    ("Спеціальна техніка", "special_equipment"),
    ("Особовий склад", "personnel"),
])

# Helper: turn the text of a value cell into an integer count
def _parse_count(raw_text):
    """Convert a value cell such as ``"1,234 (+5)"`` into the integer 1234.

    Only the part before the first regular space is used; whatever follows
    it is discarded (presumably a daily increment -- confirm against the
    page). Commas, non-breaking spaces and narrow non-breaking spaces are
    treated as thousands separators and removed before conversion.

    Raises:
        ValueError: if the remaining token is not an integer literal.
    """
    token = raw_text.strip().split(' ')[0]
    for separator in (',', '\u00a0', '\u202f'):
        token = token.replace(separator, '')
    return int(token)


# Function to parse the casualties data from the webpage
def parse_casualties(soup, categories=None):
    """Extract one record per date/table pair from a parsed page.

    The i-th date element is paired with the i-th ``<table>`` in document
    order. Pairing stops at the shorter of the two lists, so surplus
    elements on either side are ignored.

    Args:
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).
        categories: Optional mapping from the label found in the first cell
            of a row to the output column name. Defaults to the module-level
            ``category_map``.

    Returns:
        pandas.DataFrame with a ``date`` column (the stripped text of the
        date element) plus one integer column per recognised category.
        Empty when no date/table pair is found.

    Raises:
        ValueError: if the value cell of a recognised category does not
            hold an integer; the message names the category and the date.
    """
    if categories is None:
        categories = category_map

    records = []
    # NOTE(review): find_all('gold') matches elements whose *tag name* is
    # 'gold'. If the dates are meant to be selected by CSS class this needs
    # class_='gold' instead -- confirm against the page markup.
    dates = soup.find_all('gold')
    tables = soup.find_all('table')

    for date, table in zip(dates, tables):
        date_str = date.text.strip()
        record = {'date': date_str}

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # Only two-cell rows (label, value) carry data.
            if len(cells) != 2:
                continue

            label = cells[0].text.strip()
            english_category = categories.get(label)
            if not english_category:
                continue

            raw_value = cells[1].text
            try:
                record[english_category] = _parse_count(raw_value)
            except ValueError as exc:
                # Fail loudly, but say which cell was unreadable.
                raise ValueError(
                    f"Cannot parse count {raw_value!r} for category "
                    f"{label!r} on {date_str!r}"
                ) from exc

        records.append(record)

    return pd.DataFrame(records)

# Parse the data from the website
new_data = parse_casualties(soup)

# Load existing dataset
existing_file_path = '../data/parsed_data.csv'
existing_df = pd.read_csv(existing_file_path)

# Combine the existing and new data
combined_df = pd.concat([existing_df, new_data], ignore_index=True)

# Normalise the dates *before* de-duplicating, so that two spellings of the
# same day (e.g. '1.1.2024' and '01.01.2024') collapse into one row.
# drop_duplicates keeps the first occurrence, i.e. the row already on disk.
combined_df['date'] = pd.to_datetime(combined_df['date'], format='%d.%m.%Y')
combined_df = combined_df.drop_duplicates(subset=['date']).sort_values(by='date')
combined_df['date'] = combined_df['date'].dt.strftime('%d.%m.%Y')

if new_data.empty:
    # Nothing was scraped (for instance the page layout changed). Leave the
    # CSV untouched and do not report a successful update.
    print("No casualty records were parsed from the page; dataset left unchanged.")
else:
    # Save the updated dataset back to the CSV file
    combined_df.to_csv(existing_file_path, index=False)
    print("Data successfully updated.")


Data successfully updated.


usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument 'c:\\Users\\dimam\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-24488IyGAxcgboZBp.json'


AttributeError: 'tuple' object has no attribute 'tb_frame'

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Page that publishes the figures
url = 'https://index.minfin.com.ua/ua/russian-invading/casualties/'

# Ukrainian category label -> English column name, built from
# (label, column) pairs. Labels missing from this table are skipped by
# the parser.
category_map = {
    label: column
    for label, column in (
        ("Танки", "tanks"),
        ("ББМ", "bbm"),
        ("Артилерійські системи", "artillery_systems"),
        ("РСЗВ", "rszv"),
        ("Засоби ППО", "air_defense"),
        ("Літаки", "aircraft"),
        ("Гелікоптери", "helicopters"),
        ("БПЛА", "uavs"),
        ("Крилаті ракети", "cruise_missiles"),
        ("Кораблі (катери)", "ships"),
        ("Підводні човни", "submarines"),
        ("Автомобілі та автоцистерни", "vehicles"),
        ("Спеціальна техніка", "special_equipment"),
        ("Особовий склад", "personnel"),
    )
}

# Helper: turn the text of a value cell into an integer count
def _parse_count(raw_text):
    """Convert a value cell such as ``"1,234 (+5)"`` into the integer 1234.

    Only the part before the first regular space is used; whatever follows
    it is discarded (presumably a daily increment -- confirm against the
    page). Commas, non-breaking spaces and narrow non-breaking spaces are
    treated as thousands separators and removed before conversion.

    Raises:
        ValueError: if the remaining token is not an integer literal.
    """
    token = raw_text.strip().split(' ')[0]
    for separator in (',', '\u00a0', '\u202f'):
        token = token.replace(separator, '')
    return int(token)


# Function to parse the casualties data from the webpage
def parse_casualties(soup, categories=None):
    """Extract one record per heading/table pair from a parsed page.

    The i-th ``<h4>`` element is paired with the i-th ``<table>`` in
    document order. Pairing stops at the shorter of the two lists, so
    surplus headings or tables are ignored.

    Args:
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).
        categories: Optional mapping from the label found in the first cell
            of a row to the output column name. Defaults to the module-level
            ``category_map``.

    Returns:
        pandas.DataFrame with a ``date`` column (the stripped heading text)
        plus one integer column per recognised category. Empty when no
        heading/table pair is found.

    Raises:
        ValueError: if the value cell of a recognised category does not
            hold an integer; the message names the category and the date.
    """
    if categories is None:
        categories = category_map

    records = []
    dates = soup.find_all('h4')
    tables = soup.find_all('table')

    for date, table in zip(dates, tables):
        date_str = date.text.strip()
        record = {'date': date_str}

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # Only two-cell rows (label, value) carry data.
            if len(cells) != 2:
                continue

            label = cells[0].text.strip()
            english_category = categories.get(label)
            if not english_category:
                continue

            raw_value = cells[1].text
            try:
                record[english_category] = _parse_count(raw_value)
            except ValueError as exc:
                # Fail loudly, but say which cell was unreadable.
                raise ValueError(
                    f"Cannot parse count {raw_value!r} for category "
                    f"{label!r} on {date_str!r}"
                ) from exc

        records.append(record)

    return pd.DataFrame(records)

# Main function to add new data to the existing dataset
def add_new_data(existing_file_path, timeout=30):
    """Merge freshly scraped records into the CSV at ``existing_file_path``.

    The page at the module-level ``url`` is downloaded and parsed with
    ``parse_casualties``. The result is appended to the rows already in the
    CSV, reduced to one row per date (the row already on disk wins), sorted
    chronologically and written back to the same file.

    If parsing yields no records at all, the file is left untouched and a
    notice is printed instead of the success message.

    Args:
        existing_file_path: Path of the CSV dataset to read and overwrite.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed, so omitting it could
            block forever.

    Raises:
        requests.exceptions.RequestException: on a network failure, a
            timeout, or an HTTP error status (via ``raise_for_status``).
        ValueError: if a ``date`` value does not match ``%d.%m.%Y``.
    """
    # Load existing dataset
    existing_df = pd.read_csv(existing_file_path)

    # Fetch and parse new data from the website
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    new_df = parse_casualties(soup)

    if new_df.empty:
        # Nothing was scraped (for instance the page layout changed):
        # do not rewrite the file and do not claim success.
        print("No casualty records were parsed from the page; dataset left unchanged.")
        return

    # Combine the existing and new data
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)

    # Normalise the dates *before* de-duplicating, so that two spellings of
    # the same day collapse into one row, then sort chronologically.
    combined_df['date'] = pd.to_datetime(combined_df['date'], format='%d.%m.%Y')
    combined_df = combined_df.drop_duplicates(subset=['date']).sort_values(by='date')
    combined_df['date'] = combined_df['date'].dt.strftime('%d.%m.%Y')

    # Save the updated dataset back to the CSV file
    combined_df.to_csv(existing_file_path, index=False)

    print("Data successfully updated.")

# CSV dataset to refresh, given relative to the notebook's working directory
existing_file_path = '../data/parsed_data.csv'

# Scrape the page and merge the result into that file
add_new_data(existing_file_path=existing_file_path)


Data successfully updated.
