In [15]:
import re
from collections import defaultdict
from docx import Document
import pandas as pd

# Define the category mapping
category_map = {
    "Танки": "tanks",
    "ББМ": "bbm",
    "Артилерійські системи": "artillery_systems",
    "РСЗВ": "rszv",
    "Засоби ППО": "air_defense",
    "Літаки": "aircraft",
    "Гелікоптери": "helicopters",
    "БПЛА": "uavs",
    "Ракети": "cruise_missiles",
    "Кораблі (катери)": "ships",
    "Підводні човни": "submarines",
    "Автомобілі та автоцистерни": "vehicles",
    "Спеціальна техніка": "special_equipment",
    "Особовий склад": "personnel"
}

# Load the data from the file
file_path = '../input/data.docx'

def extract_text_from_docx(file_path):
    # Initialize the Document object to read the file
    doc = Document(file_path)
    full_text = []
    # Iterate through each paragraph in the document
    for para in doc.paragraphs:
        # Append the text of each paragraph to the full_text list
        full_text.append(para.text)
    # Join the paragraphs with newline characters and return the full text
    return '\n'.join(full_text)

# Extract text from the DOCX file
data = extract_text_from_docx(file_path)

# Initialize a defaultdict to store the parsed data
parsed_data = defaultdict(dict)

# Define regular expression patterns to match date lines, category lines, and personnel lines
date_pattern = re.compile(r'(\d{2}\.\d{2}\.\d{4})')
category_pattern = re.compile(r'([А-Яа-яІіЄєЇїҐґA-Za-z\s]+)\s+—\s+(\d+)(?:\s+\(\+\d+\))?')
personnel_pattern = re.compile(r'Особовий склад\s+—\s+(?:близько\s+|більше\s+)?(\d+)(?:\s+осіб)?(?:\s+\(\+\д+\))?')

# Split the data into sections by dates using the date_pattern
sections = re.split(date_pattern, data.strip())

# Process each section
for i in range(1, len(sections), 2):
    # Extract the date
    date = sections[i]
    # Split the corresponding categories and values by newline
    entries = sections[i + 1].strip().split('\n')
    # Process each entry in the section
    for entry in entries:
        entry = entry.strip()
        # Match the entry against the category pattern or personnel pattern
        if 'Особовий склад' in entry:
            match = personnel_pattern.match(entry)
        else:
            match = category_pattern.match(entry)
        if match:
            # Extract the category and value from the match groups
            groups = match.groups()
            if len(groups) == 2:
                category, value = groups
            elif len(groups) == 1:
                category = "Особовий склад"
                value = groups[0]
            # Map the category to its English equivalent using category_map
            english_category = category_map.get(category.strip(), category.strip())
            # Store the value in parsed_data under the corresponding date and category
            parsed_data[date][english_category] = int(value.replace('—', ''))

# Convert defaultdict to a regular dictionary
parsed_data = dict(parsed_data)

# Get the list of all unique categories in English
all_categories = set(category_map.values())

# Create a dataset using pandas DataFrame with columns 'date' and sorted categories
columns = ['date'] + sorted([cat for cat in all_categories if cat != "personnel"]) + ["personnel"]
df = pd.DataFrame(columns=columns)

# Populate the DataFrame with parsed data
rows = []
seen_dates = set()
for date, categories in parsed_data.items():
    if date not in seen_dates:
        seen_dates.add(date)
        # Create a row dictionary with the date
        row = {'date': date}
        # Update the row with the categories and their values
        row.update(categories)
        # Append the row to the rows list
        rows.append(row)

# Convert the rows list to a DataFrame
df = pd.DataFrame(rows, columns=columns)

# Fill NaN values with 0 and ensure all values except the date are integers
for column in df.columns:
    if column != 'date':
        df[column] = df[column].fillna(0).astype(int)

# Print the dataset
print("Original Dataset:")
print(df)

# Save the original dataset to a CSV file
df.to_csv('../data/parsed_data.csv', index=False)



Original Dataset:
           date  air_defense  aircraft  artillery_systems    bbm  \
0    18.06.2024          853       359              13959  15307   
1    17.06.2024          853       359              13927  15287   
2    16.06.2024          853       359              13913  15269   
3    15.06.2024          853       359              13855  15263   
4    14.06.2024          849       359              13818  15234   
..          ...          ...       ...                ...    ...   
841  28.02.2022            0        29                 74    816   
842  27.02.2022            0        27                  0    706   
843  26.02.2022            0        16                  0      0   
844  25.02.2022            0        10                  0    516   
845  24.02.2022            0         7                  0    130   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
0               2297          326  1104      0               2344           1   
1  

In [14]:
import re
from collections import defaultdict
from docx import Document
import pandas as pd

# Load the dataset
file_path = '../data/parsed_data.csv'
casualties_data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
casualties_data.head()

Unnamed: 0,date,air_defense,aircraft,artillery_systems,bbm,cruise_missiles,helicopters,rszv,ships,special_equipment,submarines,tanks,uavs,vehicles,personnel
0,18.06.2024,853,359,13959,15307,2297,326,1104,0,2344,1,7974,11187,19031,528620
1,17.06.2024,853,359,13927,15287,2296,326,1104,0,2337,1,7958,11167,18991,527390
2,16.06.2024,853,359,13913,15269,2296,326,1104,0,2325,1,7956,11159,18967,526310
3,15.06.2024,853,359,13855,15263,2293,326,1103,0,2322,1,7956,11148,18911,525150
4,14.06.2024,849,359,13818,15234,2286,326,1101,0,2310,1,7936,11097,18854,524060


In [16]:

# Calculating the daily change in casualties of the Russian soldiers in Ukraine

# Sort the DataFrame by date to ensure the correct order for calculating differences
casualties_data['date'] = pd.to_datetime(casualties_data['date'], format='%d.%m.%Y')
casualties_data = casualties_data.sort_values(by='date')
print(casualties_data)


          date  air_defense  aircraft  artillery_systems    bbm  \
845 2022-02-24            0         7                  0    130   
844 2022-02-25            0        10                  0    516   
843 2022-02-26            0        16                  0      0   
842 2022-02-27            0        27                  0    706   
841 2022-02-28            0        29                 74    816   
..         ...          ...       ...                ...    ...   
4   2024-06-14          849       359              13818  15234   
3   2024-06-15          853       359              13855  15263   
2   2024-06-16          853       359              13913  15269   
1   2024-06-17          853       359              13927  15287   
0   2024-06-18          853       359              13959  15307   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
845                0            6     0      0                  0           0   
844                0            7

In [17]:
# Calculate the difference between each row and the previous row, keeping the first row as is
df_diff = casualties_data.copy()
df_diff.iloc[1:,1:] = casualties_data.iloc[1:,1:].diff().fillna(0).astype(int)

# Sort the difference dataset by date in descending order
df_diff = df_diff.sort_values(by='date', ascending=True)

print(df_diff)

          date  air_defense  aircraft  artillery_systems  bbm  \
845 2022-02-24            0         7                  0  130   
844 2022-02-25            0         0                  0    0   
843 2022-02-26            0         6                  0 -516   
842 2022-02-27            0        11                  0  706   
841 2022-02-28            0         2                 74  110   
..         ...          ...       ...                ...  ...   
4   2024-06-14            3         0                 48   26   
3   2024-06-15            4         0                 37   29   
2   2024-06-16            0         0                 58    6   
1   2024-06-17            0         0                 14   18   
0   2024-06-18            0         0                 32   20   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
845                0            6     0      0                  0           0   
844                0            0     0      0           

In [18]:
df_diff.head()

Unnamed: 0,date,air_defense,aircraft,artillery_systems,bbm,cruise_missiles,helicopters,rszv,ships,special_equipment,submarines,tanks,uavs,vehicles,personnel
845,2022-02-24,0,7,0,130,0,6,0,0,0,0,0,0,0,800
844,2022-02-25,0,0,0,0,0,0,0,0,0,0,0,0,0,0
843,2022-02-26,0,6,0,-516,0,11,0,0,0,0,0,0,0,200
842,2022-02-27,0,11,0,706,0,8,0,0,0,0,0,2,0,1500
841,2022-02-28,0,2,74,110,0,3,0,0,0,0,191,1,0,800


In [19]:

# Convert the 'date' column back to string format
df_diff['date'] = df_diff['date'].dt.strftime('%d.%m.%Y')

# Print the difference dataset
print("Difference Dataset (sorted by date ASC):")
print(df_diff)


Difference Dataset (sorted by date ASC):
           date  air_defense  aircraft  artillery_systems  bbm  \
845  24.02.2022            0         7                  0  130   
844  25.02.2022            0         0                  0    0   
843  26.02.2022            0         6                  0 -516   
842  27.02.2022            0        11                  0  706   
841  28.02.2022            0         2                 74  110   
..          ...          ...       ...                ...  ...   
4    14.06.2024            3         0                 48   26   
3    15.06.2024            4         0                 37   29   
2    16.06.2024            0         0                 58    6   
1    17.06.2024            0         0                 14   18   
0    18.06.2024            0         0                 32   20   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
845                0            6     0      0                  0           0   
844 

In [20]:

# Save the difference dataset to a CSV file
df_diff.to_csv('../data/df_daily_casualties_russia.csv', index=False)

# Save the difference dataset to a JSON file
df_diff.to_json('../data/df_daily_casualties_russia.json', orient='records', lines=True)