In [77]:
import re
from collections import defaultdict
from docx import Document
import pandas as pd

# Define the category mapping
category_map = {
    "Танки": "tanks",
    "ББМ": "bbm",
    "Артилерійські системи": "artillery_systems",
    "РСЗВ": "rszv",
    "Засоби ППО": "air_defense",
    "Літаки": "aircraft",
    "Гелікоптери": "helicopters",
    "БПЛА": "uavs",
    "Ракети": "cruise_missiles",
    "Кораблі (катери)": "ships",
    "Підводні човни": "submarines",
    "Автомобілі та автоцистерни": "vehicles",
    "Спеціальна техніка": "special_equipment",
    "Особовий склад": "personnel"
}

# Load the data from the file
file_path = '../input/data.docx'

def extract_text_from_docx(file_path):
    # Initialize the Document object to read the file
    doc = Document(file_path)
    full_text = []
    # Iterate through each paragraph in the document
    for para in doc.paragraphs:
        # Append the text of each paragraph to the full_text list
        full_text.append(para.text)
    # Join the paragraphs with newline characters and return the full text
    return '\n'.join(full_text)

# Extract text from the DOCX file
data = extract_text_from_docx(file_path)

# Initialize a defaultdict to store the parsed data
parsed_data = defaultdict(dict)

# Define regular expression patterns to match date lines, category lines, and personnel lines
date_pattern = re.compile(r'(\d{2}\.\d{2}\.\d{4})')
category_pattern = re.compile(r'([А-Яа-яІіЄєЇїҐґA-Za-z\s]+)\s+—\s+(\d+)(?:\s+\(\+\d+\))?')
personnel_pattern = re.compile(r'Особовий склад\s+—\s+(?:близько\s+|більше\s+)?(\d+)(?:\s+осіб)?(?:\s+\(\+\д+\))?')

# Split the data into sections by dates using the date_pattern
sections = re.split(date_pattern, data.strip())

# Process each section
for i in range(1, len(sections), 2):
    # Extract the date
    date = sections[i]
    # Split the corresponding categories and values by newline
    entries = sections[i + 1].strip().split('\n')
    # Process each entry in the section
    for entry in entries:
        entry = entry.strip()
        # Match the entry against the category pattern or personnel pattern
        if 'Особовий склад' in entry:
            match = personnel_pattern.match(entry)
        else:
            match = category_pattern.match(entry)
        if match:
            # Extract the category and value from the match groups
            groups = match.groups()
            if len(groups) == 2:
                category, value = groups
            elif len(groups) == 1:
                category = "Особовий склад"
                value = groups[0]
            # Map the category to its English equivalent using category_map
            english_category = category_map.get(category.strip(), category.strip())
            # Store the value in parsed_data under the corresponding date and category
            parsed_data[date][english_category] = int(value.replace('—', ''))

# Convert defaultdict to a regular dictionary
parsed_data = dict(parsed_data)

# Get the list of all unique categories in English
all_categories = set(category_map.values())

# Create a dataset using pandas DataFrame with columns 'date' and sorted categories
columns = ['date'] + sorted([cat for cat in all_categories if cat != "personnel"]) + ["personnel"]
df = pd.DataFrame(columns=columns)

# Populate the DataFrame with parsed data
rows = []
seen_dates = set()
for date, categories in parsed_data.items():
    if date not in seen_dates:
        seen_dates.add(date)
        # Create a row dictionary with the date
        row = {'date': date}
        # Update the row with the categories and their values
        row.update(categories)
        # Append the row to the rows list
        rows.append(row)

# Convert the rows list to a DataFrame
df = pd.DataFrame(rows, columns=columns)

# Fill NaN values with 0 and ensure all values except the date are integers
for column in df.columns:
    if column != 'date':
        df[column] = df[column].fillna(0).astype(int)

# Print the dataset
print("Original Dataset:")
print(df)

# Save the original dataset to a CSV file
df.to_csv('../data/parsed_data.csv', index=False)



Original Dataset:
           date  air_defense  aircraft  artillery_systems    bbm  \
0    12.06.2024          844       359              13736  15187   
1    11.06.2024          842       359              13690  15176   
2    10.06.2024          837       358              13644  15144   
3    09.06.2024          836       357              13593  15131   
4    08.06.2024          834       357              13533  15105   
..          ...          ...       ...                ...    ...   
835  28.02.2022            0        29                 74    816   
836  27.02.2022            0        27                  0    706   
837  26.02.2022            0        16                  0      0   
838  25.02.2022            0        10                  0    516   
839  24.02.2022            0         7                  0    130   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
0               2280          326  1099      0               2290           1   
1  

In [83]:
import re
from collections import defaultdict
from docx import Document
import pandas as pd

# Load the dataset
file_path = '../data/parsed_data.csv'
casualties_data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
casualties_data.head()

Unnamed: 0,date,air_defense,aircraft,artillery_systems,bbm,cruise_missiles,helicopters,rszv,ships,special_equipment,submarines,tanks,uavs,vehicles,personnel
0,12.06.2024,844,359,13736,15187,2280,326,1099,0,2290,1,7911,11042,18736,521830
1,11.06.2024,842,359,13690,15176,2278,326,1099,0,2288,1,7902,11023,18676,520850
2,10.06.2024,837,358,13644,15144,2278,326,1098,0,2267,1,7879,11010,18618,519750
3,09.06.2024,836,357,13593,15131,2277,326,1097,0,2253,1,7869,10982,18562,518560
4,08.06.2024,834,357,13533,15105,2277,326,1095,0,2248,1,7843,10945,18484,517290


In [89]:

# Calculating the daily change in casualties of the Russian soldiers in Ukraine

# Sort the DataFrame by date to ensure the correct order for calculating differences
casualties_data['date'] = pd.to_datetime(casualties_data['date'], format='%d.%m.%Y')
casualties_data = casualties_data.sort_values(by='date')
print(casualties_data)


          date  air_defense  aircraft  artillery_systems    bbm  \
839 2022-02-24            0         7                  0    130   
838 2022-02-25            0        10                  0    516   
837 2022-02-26            0        16                  0    540   
836 2022-02-27            0        27                  0    706   
835 2022-02-28            0        29                 74    816   
..         ...          ...       ...                ...    ...   
4   2024-06-08          834       357              13533  15105   
3   2024-06-09          836       357              13593  15131   
2   2024-06-10          837       358              13644  15144   
1   2024-06-11          842       359              13690  15176   
0   2024-06-12          844       359              13736  15187   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
839                0            6     0      0                  0           0   
838                0            7

In [101]:
# Calculate the difference between each row and the previous row, keeping the first row as is
df_diff = casualties_data.copy()
df_diff.iloc[1:,1:] = casualties_data.iloc[1:,1:].diff().fillna(0).astype(int)

# Sort the difference dataset by date in descending order
df_diff = df_diff.sort_values(by='date', ascending=True)

print(df_diff)

          date  air_defense  aircraft  artillery_systems  bbm  \
839 2022-02-24            0         7                  0  130   
838 2022-02-25            0         0                  0    0   
837 2022-02-26            0         6                  0   24   
836 2022-02-27            0        11                  0  166   
835 2022-02-28            0         2                 74  110   
..         ...          ...       ...                ...  ...   
4   2024-06-08            1         0                 36    9   
3   2024-06-09            2         0                 60   26   
2   2024-06-10            1         1                 51   13   
1   2024-06-11            5         1                 46   32   
0   2024-06-12            2         0                 46   11   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
839                0            6     0      0                  0           0   
838                0            0     0      0           

In [94]:
df_diff.head()

Unnamed: 0,date,air_defense,aircraft,artillery_systems,bbm,cruise_missiles,helicopters,rszv,ships,special_equipment,submarines,tanks,uavs,vehicles,personnel
0,2024-06-12,2,0,46,11,2,0,0,0,2,0,9,19,60,980
1,2024-06-11,5,1,46,32,0,0,1,0,21,0,23,13,58,1100
2,2024-06-10,1,1,51,13,1,0,1,0,14,0,10,28,56,1190
3,2024-06-09,2,0,60,26,0,0,2,0,5,0,26,37,78,1270
4,2024-06-08,1,0,36,9,7,0,0,0,10,0,9,59,68,1210


In [102]:

# Convert the 'date' column back to string format
df_diff['date'] = df_diff['date'].dt.strftime('%d.%m.%Y')

# Print the difference dataset
print("Difference Dataset (sorted by date ASC):")
print(df_diff)


Difference Dataset (sorted by date ASC):
           date  air_defense  aircraft  artillery_systems  bbm  \
839  24.02.2022            0         7                  0  130   
838  25.02.2022            0         0                  0    0   
837  26.02.2022            0         6                  0   24   
836  27.02.2022            0        11                  0  166   
835  28.02.2022            0         2                 74  110   
..          ...          ...       ...                ...  ...   
4    08.06.2024            1         0                 36    9   
3    09.06.2024            2         0                 60   26   
2    10.06.2024            1         1                 51   13   
1    11.06.2024            5         1                 46   32   
0    12.06.2024            2         0                 46   11   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
839                0            6     0      0                  0           0   
838 

In [103]:

# Save the difference dataset to a CSV file
df_diff.to_csv('../data/df_daily_casualties_russia.csv', index=False)

# Save the difference dataset to a JSON file
df_diff.to_json('../data/df_daily_casualties_russia.json', orient='records', lines=True)