In [2]:
import re
from collections import defaultdict
from docx import Document
import pandas as pd

# Define the category mapping
category_map = {
    "Танки": "tanks",
    "ББМ": "bbm",
    "Артилерійські системи": "artillery_systems",
    "РСЗВ": "rszv",
    "Засоби ППО": "air_defense",
    "Літаки": "aircraft",
    "Гелікоптери": "helicopters",
    "БПЛА": "uavs",
    "Ракети": "cruise_missiles",
    "Кораблі (катери)": "ships",
    "Підводні човни": "submarines",
    "Автомобілі та автоцистерни": "vehicles",
    "Спеціальна техніка": "special_equipment",
    "Особовий склад": "personnel"
}

# Load the data from the file
file_path = '../input/data.docx'

def extract_text_from_docx(file_path):
    # Initialize the Document object to read the file
    doc = Document(file_path)
    full_text = []
    # Iterate through each paragraph in the document
    for para in doc.paragraphs:
        # Append the text of each paragraph to the full_text list
        full_text.append(para.text)
    # Join the paragraphs with newline characters and return the full text
    return '\n'.join(full_text)

# Extract text from the DOCX file
data = extract_text_from_docx(file_path)

# Initialize a defaultdict to store the parsed data
parsed_data = defaultdict(dict)

# Define regular expression patterns to match date lines, category lines, and personnel lines
date_pattern = re.compile(r'(\d{2}\.\d{2}\.\d{4})')
category_pattern = re.compile(r'([А-Яа-яІіЄєЇїҐґA-Za-z\s]+)\s+—\s+(\d+)(?:\s+\(\+\d+\))?')
personnel_pattern = re.compile(r'Особовий склад\s+—\s+(?:близько\s+|більше\s+)?(\d+)(?:\s+осіб)?(?:\s+\(\+\д+\))?')

# Split the data into sections by dates using the date_pattern
sections = re.split(date_pattern, data.strip())

# Process each section
for i in range(1, len(sections), 2):
    # Extract the date
    date = sections[i]
    # Split the corresponding categories and values by newline
    entries = sections[i + 1].strip().split('\n')
    # Process each entry in the section
    for entry in entries:
        entry = entry.strip()
        # Match the entry against the category pattern or personnel pattern
        if 'Особовий склад' in entry:
            match = personnel_pattern.match(entry)
        else:
            match = category_pattern.match(entry)
        if match:
            # Extract the category and value from the match groups
            groups = match.groups()
            if len(groups) == 2:
                category, value = groups
            elif len(groups) == 1:
                category = "Особовий склад"
                value = groups[0]
            # Map the category to its English equivalent using category_map
            english_category = category_map.get(category.strip(), category.strip())
            # Store the value in parsed_data under the corresponding date and category
            parsed_data[date][english_category] = int(value.replace('—', ''))

# Convert defaultdict to a regular dictionary
parsed_data = dict(parsed_data)

# Get the list of all unique categories in English
all_categories = set(category_map.values())

# Create a dataset using pandas DataFrame with columns 'date' and sorted categories
columns = ['date'] + sorted([cat for cat in all_categories if cat != "personnel"]) + ["personnel"]
df = pd.DataFrame(columns=columns)

# Populate the DataFrame with parsed data
rows = []
seen_dates = set()
for date, categories in parsed_data.items():
    if date not in seen_dates:
        seen_dates.add(date)
        # Create a row dictionary with the date
        row = {'date': date}
        # Update the row with the categories and their values
        row.update(categories)
        # Append the row to the rows list
        rows.append(row)

# Convert the rows list to a DataFrame
df = pd.DataFrame(rows, columns=columns)

# Fill NaN values with 0 and ensure all values except the date are integers
for column in df.columns:
    if column != 'date':
        df[column] = df[column].fillna(0).astype(int)

# Print the dataset
print("Original Dataset:")
print(df)

# Save the original dataset to a CSV file
df.to_csv('../data/parsed_data.csv', index=False)



Original Dataset:
           date  air_defense  aircraft  artillery_systems    bbm  \
0    07.07.2024          879       360              14937  15645   
1    06.06.2024          831       357              13433  15076   
2    05.06.2024          830       357              13385  15036   
3    04.06.2024          827       357              13345  15020   
4    03.06.2024          824       357              13280  15002   
..          ...          ...       ...                ...    ...   
830  28.02.2022            0        29                 74    816   
831  27.02.2022            0        27                  0    706   
832  26.02.2022            0        16                  0      0   
833  25.02.2022            0        10                  0    516   
834  24.02.2022            0         7                  0    130   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
0               2352          326  1115      0               2495           1   
1  

In [3]:
import re
from collections import defaultdict
from docx import Document
import pandas as pd

# Load the dataset
file_path = '../data/parsed_data.csv'
casualties_data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
casualties_data.head()

Unnamed: 0,date,air_defense,aircraft,artillery_systems,bbm,cruise_missiles,helicopters,rszv,ships,special_equipment,submarines,tanks,uavs,vehicles,personnel
0,07.07.2024,879,360,14937,15645,2352,326,1115,0,2495,1,8155,11862,20103,550990
1,06.06.2024,831,357,13433,15076,2270,326,1095,0,2230,1,7828,10846,18360,515000
2,05.06.2024,830,357,13385,15036,2270,326,1092,0,2223,1,7806,10805,18297,513700
3,04.06.2024,827,357,13345,15020,2268,326,1092,0,2211,1,7794,10766,18228,512420
4,03.06.2024,824,357,13280,15002,2268,326,1090,0,2199,1,7779,10739,18159,511130


In [4]:

# Calculating the daily change in casualties of the Russian soldiers in Ukraine

# Sort the DataFrame by date to ensure the correct order for calculating differences
casualties_data['date'] = pd.to_datetime(casualties_data['date'], format='%d.%m.%Y')
casualties_data = casualties_data.sort_values(by='date')
print(casualties_data)


          date  air_defense  aircraft  artillery_systems    bbm  \
834 2022-02-24            0         7                  0    130   
833 2022-02-25            0        10                  0    516   
832 2022-02-26            0        16                  0      0   
831 2022-02-27            0        27                  0    706   
830 2022-02-28            0        29                 74    816   
..         ...          ...       ...                ...    ...   
4   2024-06-03          824       357              13280  15002   
3   2024-06-04          827       357              13345  15020   
2   2024-06-05          830       357              13385  15036   
1   2024-06-06          831       357              13433  15076   
0   2024-07-07          879       360              14937  15645   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
834                0            6     0      0                  0           0   
833                0            7

In [5]:
# Calculate the difference between each row and the previous row, keeping the first row as is
df_diff = casualties_data.copy()
df_diff.iloc[1:,1:] = casualties_data.iloc[1:,1:].diff().fillna(0).astype(int)

# Sort the difference dataset by date in descending order
df_diff = df_diff.sort_values(by='date', ascending=True)

print(df_diff)

          date  air_defense  aircraft  artillery_systems  bbm  \
834 2022-02-24            0         7                  0  130   
833 2022-02-25            0         0                  0    0   
832 2022-02-26            0         6                  0 -516   
831 2022-02-27            0        11                  0  706   
830 2022-02-28            0         2                 74  110   
..         ...          ...       ...                ...  ...   
4   2024-06-03            3         0                 47   22   
3   2024-06-04            3         0                 65   18   
2   2024-06-05            3         0                 40   16   
1   2024-06-06            1         0                 48   40   
0   2024-07-07           48         3               1504  569   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
834                0            6     0      0                  0           0   
833                0            0     0      0           

In [6]:
df_diff.head()

Unnamed: 0,date,air_defense,aircraft,artillery_systems,bbm,cruise_missiles,helicopters,rszv,ships,special_equipment,submarines,tanks,uavs,vehicles,personnel
834,2022-02-24,0,7,0,130,0,6,0,0,0,0,0,0,0,800
833,2022-02-25,0,0,0,0,0,0,0,0,0,0,0,0,0,0
832,2022-02-26,0,6,0,-516,0,11,0,0,0,0,0,0,0,200
831,2022-02-27,0,11,0,706,0,8,0,0,0,0,0,2,0,1500
830,2022-02-28,0,2,74,110,0,3,0,0,0,0,191,1,0,800


In [7]:

# Convert the 'date' column back to string format
df_diff['date'] = df_diff['date'].dt.strftime('%d.%m.%Y')

# Print the difference dataset
print("Difference Dataset (sorted by date ASC):")
print(df_diff)


Difference Dataset (sorted by date ASC):
           date  air_defense  aircraft  artillery_systems  bbm  \
834  24.02.2022            0         7                  0  130   
833  25.02.2022            0         0                  0    0   
832  26.02.2022            0         6                  0 -516   
831  27.02.2022            0        11                  0  706   
830  28.02.2022            0         2                 74  110   
..          ...          ...       ...                ...  ...   
4    03.06.2024            3         0                 47   22   
3    04.06.2024            3         0                 65   18   
2    05.06.2024            3         0                 40   16   
1    06.06.2024            1         0                 48   40   
0    07.07.2024           48         3               1504  569   

     cruise_missiles  helicopters  rszv  ships  special_equipment  submarines  \
834                0            6     0      0                  0           0   
833 

In [9]:

# Save the difference dataset to a CSV file
df_diff.to_csv('../data/df_daily_casualties_russia.csv', index=False)

# Save the difference dataset to a JSON file
df_diff.to_json('../data/df_daily_casualties_russia.json', orient='records', lines=True)

In [13]:
# combine daily_weather and df_daily_casualties_russie

# Load the dataset
file_path = '../data/daily_weather.csv'
df_daily_weather = pd.read_csv(file_path)



In [14]:
df_daily_weather

Unnamed: 0,day,cloud_cover_total,weather,summary,weather_id,precipitation_type,precipitation_total,temperature,wind_speed
0,2022-02-24,99.416667,Overcast,Overcast,overcast,none,0.050000,2.616667,3.170833
1,2022-02-25,98.583333,Overcast,Overcast,overcast,none,0.000000,3.141667,3.958333
2,2022-02-26,99.625000,Overcast,Overcast,overcast,none,0.000000,2.312500,3.625000
3,2022-02-27,95.625000,Overcast,Overcast,overcast,none,0.000000,1.350000,3.641667
4,2022-02-28,93.833333,Overcast,Overcast,overcast,none,0.000000,0.250000,2.595833
...,...,...,...,...,...,...,...,...,...
846,2024-06-19,22.541667,Partly sunny,Partly sunny,partly_sunny,none,0.000000,24.850000,1.962500
847,2024-06-20,41.458333,Partly sunny,Partly sunny,partly_sunny,none,0.070833,24.108333,4.200000
848,2024-06-21,10.291667,Sunny,Sunny,sunny,none,0.000000,19.275000,5.258333
849,2024-06-22,21.541667,Partly sunny,Partly sunny,partly_sunny,none,0.000000,21.370833,3.075000


In [15]:
df_daily_weather['day'] = pd.to_datetime(df_daily_weather['day'], format='%Y-%m-%d')
df_daily_weather = df_daily_weather.sort_values(by='day', ascending=False)
df_daily_weather


Unnamed: 0,day,cloud_cover_total,weather,summary,weather_id,precipitation_type,precipitation_total,temperature,wind_speed
850,2024-06-23,29.375000,Partly sunny,Partly sunny,partly_sunny,none,0.000000,23.637500,2.129167
849,2024-06-22,21.541667,Partly sunny,Partly sunny,partly_sunny,none,0.000000,21.370833,3.075000
848,2024-06-21,10.291667,Sunny,Sunny,sunny,none,0.000000,19.275000,5.258333
847,2024-06-20,41.458333,Partly sunny,Partly sunny,partly_sunny,none,0.070833,24.108333,4.200000
846,2024-06-19,22.541667,Partly sunny,Partly sunny,partly_sunny,none,0.000000,24.850000,1.962500
...,...,...,...,...,...,...,...,...,...
4,2022-02-28,93.833333,Overcast,Overcast,overcast,none,0.000000,0.250000,2.595833
3,2022-02-27,95.625000,Overcast,Overcast,overcast,none,0.000000,1.350000,3.641667
2,2022-02-26,99.625000,Overcast,Overcast,overcast,none,0.000000,2.312500,3.625000
1,2022-02-25,98.583333,Overcast,Overcast,overcast,none,0.000000,3.141667,3.958333


In [12]:
df_daily_weather['day'] = df_daily_weather['day'].dt.strftime('%d.%m.%Y')



AttributeError: Can only use .dt accessor with datetimelike values

In [81]:
df_daily_weather

Unnamed: 0,day,cloud_cover_total,weather,summary,weather_id,precipitation_type,precipitation_total,temperature,wind_speed
845,18.06.2024,41.541667,Partly sunny,Partly sunny,partly_sunny,none,0.041667,22.900000,2.458333
844,17.06.2024,62.208333,Partly sunny,Partly sunny,partly_sunny,none,0.066667,20.945833,3.466667
843,16.06.2024,80.208333,Mostly cloudy,Mostly cloudy,mostly_cloudy,none,0.095833,20.833333,4.208333
842,15.06.2024,95.541667,Overcast,Overcast,overcast,none,0.041667,22.900000,3.137500
841,14.06.2024,51.250000,Partly sunny,Partly sunny,partly_sunny,none,0.000000,23.375000,1.862500
...,...,...,...,...,...,...,...,...,...
4,28.02.2022,93.833333,Overcast,Overcast,overcast,none,0.000000,0.250000,2.595833
3,27.02.2022,95.625000,Overcast,Overcast,overcast,none,0.000000,1.350000,3.641667
2,26.02.2022,99.625000,Overcast,Overcast,overcast,none,0.000000,2.312500,3.625000
1,25.02.2022,98.583333,Overcast,Overcast,overcast,none,0.000000,3.141667,3.958333


In [16]:
# 
#df_target = pd.merge(df_diff, df_daily_weather, how="left", on=["day"])

df_target = df_diff.join(df_daily_weather, how="inner")

In [17]:
df_target.tail()

Unnamed: 0,date,air_defense,aircraft,artillery_systems,bbm,cruise_missiles,helicopters,rszv,ships,special_equipment,...,personnel,day,cloud_cover_total,weather,summary,weather_id,precipitation_type,precipitation_total,temperature,wind_speed
4,03.06.2024,3,0,47,22,0,0,1,0,4,...,1270,2022-02-28,93.833333,Overcast,Overcast,overcast,none,0.0,0.25,2.595833
3,04.06.2024,3,0,65,18,0,0,2,0,12,...,1290,2022-02-27,95.625,Overcast,Overcast,overcast,none,0.0,1.35,3.641667
2,05.06.2024,3,0,40,16,2,0,0,0,12,...,1280,2022-02-26,99.625,Overcast,Overcast,overcast,none,0.0,2.3125,3.625
1,06.06.2024,1,0,48,40,0,0,3,0,7,...,1300,2022-02-25,98.583333,Overcast,Overcast,overcast,none,0.0,3.141667,3.958333
0,07.07.2024,48,3,1504,569,82,0,20,0,265,...,35990,2022-02-24,99.416667,Overcast,Overcast,overcast,none,0.05,2.616667,3.170833
