In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame


# Introduction
## Purpose of the Notebook
# This notebook is designed to analyze the reported losses from the Russian and Ukrainian conflict,
# providing insights into trends and patterns based on available data.... 
#TODO: EDIT



## Data Sources
# Data is sourced from:
#TODO: Add Sources

# Data Collection
## Load Data

In [3]:
data: DataFrame = pd.read_excel('Data/SourceFiles/Documented/documented_losses.xlsx')

## Cleaning Data
# Handling missing values or incorrect data types

In [4]:
#data.dropna(inplace=True)
print(data.isnull().sum())
print(data.dtypes)

Date                                  0
Russia_Total                          0
Change                                1
Ukraine_Total                         0
Change.1                              1
                                   ... 
Ukraine_Artillery                     0
Unnamed: 63                         775
UNHCR_Ukraine_Border                572
UNHCR_Ukraine_Refugees              572
UNHCR_Returning_Ukraine_Refugees    573
Length: 67, dtype: int64
Date                                 object
Russia_Total                          int64
Change                              float64
Ukraine_Total                         int64
Change.1                            float64
                                     ...   
Ukraine_Artillery                     int64
Unnamed: 63                         float64
UNHCR_Ukraine_Border                float64
UNHCR_Ukraine_Refugees              float64
UNHCR_Returning_Ukraine_Refugees    float64
Length: 67, dtype: object


# Clean Column Names
# This removes any spaces, converts all characters to lower case, and handles repetitive naming patterns

In [5]:
data.columns = [col.strip().replace(' ', '_').lower().replace('.1', '_change') for col in data.columns]

# Drop Unnecessary Columns
# Identify and drop columns that are entirely empty or not relevant


In [6]:
columns_to_drop = [col for col in data.columns if 'unnamed' in col]
data.drop(columns=columns_to_drop, inplace=True)

# Convert Date Format
# Converts the 'date' column from string to datetime format for better manipulation


In [7]:
data['date'] = pd.to_datetime(data['date'])

# Handle Missing Values
# Fills missing values with zeros for demonstration; adjust this based on your analysis requirements
# Display the DataFrame information and the first few rows to verify the changes


In [8]:
data.fillna(0, inplace=True)
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775 entries, 0 to 774
Data columns (total 49 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   date                              775 non-null    datetime64[ns]
 1   russia_total                      775 non-null    int64         
 2   change                            775 non-null    float64       
 3   ukraine_total                     775 non-null    int64         
 4   change_change                     775 non-null    float64       
 5   ratio_ru/ua                       775 non-null    float64       
 6   russia_destroyed                  775 non-null    int64         
 7   ukraine_destroyed                 775 non-null    int64         
 8   russia_damaged                    775 non-null    int64         
 9   ukraine_damaged                   775 non-null    int64         
 10  ukraine_abandoned                 775 non-null    

# Statistical summary of numerical columns

# Check unique values and their counts for a specific column, e.g., 'date'


In [9]:
print(data.describe())
print(data['date'].value_counts())

                                date  russia_total      change  ukraine_total  \
count                            775    775.000000  775.000000     775.000000   
mean   2023-03-17 23:59:59.999999744   9111.423226   19.695484    3031.383226   
min              2022-02-24 00:00:00      0.000000  -65.000000       0.000000   
25%              2022-09-05 12:00:00   5478.000000    0.000000    1556.000000   
50%              2023-03-18 00:00:00   9678.000000    9.000000    3101.000000   
75%              2023-09-27 12:00:00  12297.000000   33.000000    4502.000000   
max              2024-04-08 00:00:00  15264.000000  279.000000    5526.000000   
std                              NaN   3879.981356   28.564993    1527.558413   

       change_change  ratio_ru/ua  russia_destroyed  ukraine_destroyed  \
count     775.000000   775.000000        775.000000         775.000000   
mean        7.130323     3.164233       6022.892903        1958.929032   
min       -30.000000     0.000000          0.000

# Range checks for a numerical column, e.g., 'russia_total'

In [10]:
print(data[data['russia_total'] < 0])  # This should return an empty DataFrame if all values are valid

Empty DataFrame
Columns: [date, russia_total, change, ukraine_total, change_change, ratio_ru/ua, russia_destroyed, ukraine_destroyed, russia_damaged, ukraine_damaged, ukraine_abandoned, russia_abandoned, russia_captured, ukraine_captured, russia_tanks, ukraine_tanks, russia_tank_capture, ukraine_tank_capture, russia_afv, ukraine_afv, russia_afv_capture, ukraine_afv_capture, russia_ifv, ukraine_ifv, russia_apc, ukraine_apc, russia_imv, ukraine_imv, russia_engineering, ukraine_engineering, russia_coms, ukraine_coms, russia_vehicles, ukraine_vehicles, russia_aircraft, ukraine_aircraft, russia_infantry, ukraine_infantry, russia_logistics, ukraine_logistics, russia_armor, ukraine_armor, russia_antiair, ukraine_antiair, russia_artillery, ukraine_artillery, unhcr_ukraine_border, unhcr_ukraine_refugees, unhcr_returning_ukraine_refugees]
Index: []

[0 rows x 49 columns]


# Check date consistency and order


In [11]:
print(data['date'].min(), data['date'].max())  # Check the range of dates
print(data['date'].is_monotonic_increasing)  # Check if dates are in chronological order

2022-02-24 00:00:00 2024-04-08 00:00:00
True


# Recheck for any null values


In [12]:
print(data.isnull().sum().sum())  # This should return 0 if there are no null values left

0


# Data Analysis
## Descriptive Statistics


In [13]:
print(data.describe())

                                date  russia_total      change  ukraine_total  \
count                            775    775.000000  775.000000     775.000000   
mean   2023-03-17 23:59:59.999999744   9111.423226   19.695484    3031.383226   
min              2022-02-24 00:00:00      0.000000  -65.000000       0.000000   
25%              2022-09-05 12:00:00   5478.000000    0.000000    1556.000000   
50%              2023-03-18 00:00:00   9678.000000    9.000000    3101.000000   
75%              2023-09-27 12:00:00  12297.000000   33.000000    4502.000000   
max              2024-04-08 00:00:00  15264.000000  279.000000    5526.000000   
std                              NaN   3879.981356   28.564993    1527.558413   

       change_change  ratio_ru/ua  russia_destroyed  ukraine_destroyed  \
count     775.000000   775.000000        775.000000         775.000000   
mean        7.130323     3.164233       6022.892903        1958.929032   
min       -30.000000     0.000000          0.000

## Data Grouping
# Group data by a specific criterion

In [14]:
grouped_data = data.groupby('category').sum() 

KeyError: 'category'

# Data Visualization
## Time Series Analysis


In [None]:
plt.figure(figsize=(10, 6))
sns.lineplot(data=data, x='date', y='losses')
plt.title('Losses Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Losses')
plt.show()

## Comparative Analysis
# Bar chart comparing losses by category


In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(data=grouped_data, x=grouped_data.index, y='losses')
plt.title('Comparative Losses by Category')
plt.xlabel('Category')
plt.ylabel('Losses')
plt.xticks(rotation=45)
plt.show()