In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame


# Introduction
## Purpose of the Notebook
# This notebook analyzes the reported losses in the Russia-Ukraine conflict,
# providing insights into trends and patterns based on the available data.
#TODO: EDIT



## Data Sources
# Data is sourced from:
#TODO: Add Sources

# Data Collection
## Load Data

In [3]:
# Read the documented-losses workbook into the notebook's working DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
data: DataFrame = pd.read_excel(
    io='Data/SourceFiles/Documented/documented_losses.xlsx',
)

## Cleaning Data
# Handling missing values or incorrect data types

In [4]:
# Survey the raw frame before any cleaning: the number of missing values in
# each column first, then the dtype pandas inferred for each column.
# Nothing is dropped or modified in this cell.
missing_per_column = data.isna().sum()
print(missing_per_column)

inferred_dtypes = data.dtypes
print(inferred_dtypes)

# Clean Column Names
# This trims surrounding whitespace, replaces spaces with underscores, converts all characters to lower case, and renames the '.1' marker (which pandas appends to a repeated column header) to '_change'

In [5]:
def _clean_column_name(col) -> str:
    """Return a normalised, snake_case version of a single column label.

    The label is converted to ``str`` first so that non-string headers (for
    example numeric header cells read from the spreadsheet) do not raise on
    ``.strip()``. Surrounding whitespace is trimmed, inner spaces become
    underscores and the result is lower-cased. A trailing ``'.1'`` is rewritten
    to ``'_change'``; only the suffix is touched, so a label such as ``'x.10'``
    is left alone instead of becoming ``'x_change0'``.
    """
    name = str(col).strip().replace(' ', '_').lower()
    if name.endswith('.1'):
        name = name[:-len('.1')] + '_change'
    return name


# Apply the normalisation to every column label of the working frame.
data.columns = [_clean_column_name(col) for col in data.columns]

# Drop Unnecessary Columns
# Identify and drop the columns whose cleaned name contains 'unnamed' (typically pandas' placeholder label for header cells left blank in the spreadsheet)


In [6]:
# Collect every column whose (already lower-cased) label contains 'unnamed',
# then remove those columns from the working frame in place.
columns_to_drop = list(filter(lambda label: 'unnamed' in label, data.columns))
data.drop(labels=columns_to_drop, axis='columns', inplace=True)

# Convert Date Format
# Converts the 'date' column from string to datetime format for better manipulation


In [7]:
# Parse the 'date' column into pandas datetimes and store the result back
# under the same column name.
data['date'] = data['date'].pipe(pd.to_datetime)

# Handle Missing Values
# Fills missing values with zeros for demonstration; adjust this based on your analysis requirements
# Display the DataFrame information and the first few rows to verify the changes


In [8]:
# Zero-fill gaps in the numeric columns only. A blanket fillna(0) would also
# write the integer 0 into the datetime 'date' column and into any text
# columns, leaving mixed-type columns that break later date comparisons.
numeric_fill = {col: 0 for col in data.select_dtypes(include='number').columns}
data.fillna(numeric_fill, inplace=True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.head())

# Statistical summary of numerical columns

# Check unique values and their counts for a specific column, e.g., 'date'


In [9]:
# Summary statistics for the frame as produced by DataFrame.describe().
summary_stats = data.describe()
print(summary_stats)

# How many rows share each distinct value of the 'date' column.
rows_per_date = data['date'].value_counts()
print(rows_per_date)

# Range checks for a numerical column, e.g., 'russia_total'

In [10]:
# Select the rows whose 'russia_total' is below zero; an empty result means
# every value passed the range check.
negative_totals = data.loc[data['russia_total'].lt(0)]
print(negative_totals)

# Check date consistency and order


In [11]:
observed_dates = data['date']

# Earliest and latest date present in the data.
print(observed_dates.min(), observed_dates.max())

# True when each date is greater than or equal to the one before it.
print(observed_dates.is_monotonic_increasing)

# Recheck for any null values


In [12]:
# Count the missing cells across the whole frame (per-column counts, then
# their total); 0 means no null values remain.
remaining_nulls = data.isna().sum().sum()
print(remaining_nulls)

# Data Analysis
## Descriptive Statistics


In [13]:
# Descriptive statistics of the cleaned frame, as produced by DataFrame.describe().
descriptive_stats = data.describe()
print(descriptive_stats)

## Data Grouping
# Group data by a specific criterion

In [14]:
# Total the numeric columns within each 'category' group.
# numeric_only=True keeps the datetime 'date' column (and any text columns)
# out of the aggregation; without it pandas 2.x raises a TypeError because
# datetime values cannot be summed.
# NOTE(review): assumes the cleaned frame has a 'category' column — confirm
# against the actual column names of the workbook.
grouped_data = data.groupby('category').sum(numeric_only=True)

# Data Visualization
## Time Series Analysis


In [None]:
# Line chart of 'losses' against 'date', drawn on an explicitly created axes.
# NOTE(review): assumes the cleaned frame has a 'losses' column — confirm.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=data, x='date', y='losses', ax=ax)
ax.set_title('Losses Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Losses')
plt.show()

## Comparative Analysis
# Bar chart comparing losses by category


In [None]:
# Bar chart with one bar per group in grouped_data (its index on the x-axis)
# and the 'losses' column as bar height, drawn on an explicitly created axes.
# NOTE(review): assumes grouped_data has a 'losses' column — confirm.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=grouped_data, x=grouped_data.index, y='losses', ax=ax)
ax.set_title('Comparative Losses by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Losses')

# Tilt the category labels so long names do not overlap.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)

plt.show()