In [4]:
import pandas as pd 
from collections import defaultdict
import re

file_path = "C:\\Users\\abpet\\Desktop\\WW1\\WW1_Names_Dates_Casualties.csv"

df_ww1 = pd.read_csv(file_path, encoding='utf-8')
df_ww1_test = df_ww1.copy()


df_ww1_test["Casualties and losses_Right"].head(20)

0                                       1,626\n Unknown
1                                                93,432
2                                               Unknown
3     95,000 casualties[1]\n 13,000 casualties[1]\n ...
4                                               ~ 2,000
5     Unknown, but heavyVulcan Pass (21-22 September...
6     509 killed,4,359 wounded,1,534 missingTotal: 6...
7                                              4,500[4]
8     Total: 160,000[16][14] including 36,000 KIA[g]...
9                                                   NaN
10                                                  NaN
11                                              127,000
12    87,181:[2]8,396 dead30,603 wounded48,182 captured
13    16 Germans killed[1] 55 Askaris killed[1] 76 G...
14                                                  NaN
15    33 killed(mostly due to disease)1 light cruise...
16    c. 420,000[1](95,675\t killed or missing) c. 2...
17    21–30 August:13,873+ [3]  • 1,726+ killed 

First, we will handle all of the entries that are just numbers. Add them to a completed casualties df and remove them from what we are working with to declutter.


In [10]:
def is_just_number(text):
    """
    Returns True if 'text' is a (possibly approximate) numeric value, like:
      "500", "5,000", "~2,000", "c.400"
    ignoring bracketed references.
    """
    if not isinstance(text, str):
        return False
    
    # Remove bracketed references and strip
    text = re.sub(r'\[.*?\]', '', text).strip()
    # Accept optional approximation markers (~, c.), digits, commas, decimal points
    # e.g., "~2,000", "c.400", "1000.5"
    pattern = r'^[~c.\d,]+$'
    return bool(re.match(pattern, text, re.IGNORECASE))

MONTHS = {
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december"
}


def remove_month_keys_from_result(result_dict):
    """
    Removes any key from 'result_dict' that matches known month names.
    Mutates 'result_dict' in place.
    """
    for key in list(result_dict.keys()):
        if key in MONTHS:
            del result_dict[key]

In [6]:

def parse_number_word_pairs(text):
    """
    Given a raw casualty string like:
      "95,000 casualties[1]\\n 13,000 casualties[1]\\n 2,000 missing"
    Returns a dictionary { "casualties": [95000, 13000], "missing": [2000] }
    
    1. Strips bracketed references [#].
    2. Removes line breaks.
    3. Finds patterns of 'number' followed by 'word'.
    4. Stores them in a dict, where each key is the word (lowercased),
       and the value is a list of numeric values (integers/floats).
    """
    if not isinstance(text, str):
        return {}

    # 1) Remove references like [1], [citation needed], [2] etc.
    text = re.sub(r'\[.*?\]', '', text)

    # 2) Remove newlines, extra spaces
    text = text.replace('\n', ' ')
    text = text.strip()

    # 3) Regex to find patterns of <number> <word> or <number>~ <word>.
    #    Example matches: "509 killed", "4,359 wounded", "95,000 casualties".
    #    We'll allow optional ~ or c. for approximate numbers as well.
    pattern = r'([~c.\d,]+)\s*([A-Za-z]+)'  
    
    #    Explanation:
    #    - ([~c.\d,]+)  : Capture a group that includes digits, commas, periods, or the chars '~' or 'c' (common approx. notation).
    #    - \s*          : optional whitespace.
    #    - ([A-Za-z]+)  : capture a word (letters only).
    #
    #    This is a simplified approach. Real data might contain more complexity.

    matches = re.findall(pattern, text)

    # 4) Build a dictionary from the matches
    result = defaultdict(list)
    for num_str, word_str in matches:
        # Remove extraneous punctuation from the numeric part
        # e.g., "~2,000" -> "2000", "c.400" -> "400"
        cleaned_num = re.sub(r'[~,c.,`]', '', num_str).replace(',', '')
        try:
            num_val = float(cleaned_num)
        except ValueError:
            # If we can't convert, skip
            continue

        # Lowercase the word part
        word_str = word_str.lower()

        # Append the numeric value to the list for this word
        result[word_str].append(num_val)

    return dict(result)



In [7]:


def parse_number_word_pairs(text):
    """
    Given a raw casualty string like:
      "95,000 casualties[1]\\n 13,000 casualties[1]\\n 2,000 missing"
    or a single numeric string like "5,000"
    Returns a dictionary { "casualties": [95000, 13000], "missing": [2000] }
    or { "casualties": [5000] } if it's just a single number.

    1. Strips bracketed references [#].
    2. Removes line breaks.
    3. Finds patterns of 'number word'.
    4. If no pattern is found, checks if it's just a single number.
       If so, treat that as "casualties".
    """
    if not isinstance(text, str):
        return {}

    # 1) Remove references like [1], [citation needed], etc.
    text = re.sub(r'\[.*?\]', '', text)

    # 2) Remove newlines, extra spaces
    text = text.replace('\n', ' ')
    text = text.strip()

    # 3) Regex to find <number> <word> pairs
    pattern = r'([~c.\d,]+)\s*([A-Za-z]+)'
    matches = re.findall(pattern, text)

    result = defaultdict(list)

    if matches:
        # If we found (number, word) pairs, process them
        for num_str, word_str in matches:
            # Clean up the numeric part
            cleaned_num = re.sub(r'[~,c.]', '', num_str).replace(',', '')
            try:
                num_val = float(cleaned_num)
            except ValueError:
                continue  # skip if we can't parse

            # Lowercase the word
            word_str = word_str.lower()

            # Append the numeric value
            result[word_str].append(num_val)
    else:
        # 4) If no matches found, check if the entire string is just a number
        if is_just_number(text):
            # Remove approximation markers/commas from the number
            cleaned_num = re.sub(r'[~,c.]', '', text).replace(',', '')
            try:
                num_val = float(cleaned_num)
                result["casualties"].append(num_val)
            except ValueError:
                pass  # can't parse as float; skip

    return dict(result)


In [None]:
# Example: We'll create a new column with the parsed dictionary
df_ww1_test["Casualties_dict_right"] = df_ww1_test["Casualties and losses_Right"].apply(parse_number_word_pairs)

# And similarly for the Left side
df_ww1_test["Casualties_dict_left"] = df_ww1_test["Casualties and losses_Left"].apply(parse_number_word_pairs)

# Now you can see the dictionary in each row
print(df_ww1_test[["BattleName", "Casualties_dict_right"]].head(20))

df_ww1_test[["BattleName", "Casualties_dict_right"]].to_csv('something.csv', encoding='utf-8-sig')

In [None]:
df_ww1["Casualties_dict_right"] = df_ww1["Casualties and losses_Right"].apply(parse_number_word_pairs)
df_ww1["Casualties_dict_left"] = df_ww1["Casualties and losses_Left"].apply(parse_number_word_pairs)
df_ww1[["BattleName", "Casualties_dict_right"]].head(10)


In [8]:

def clean_text(text):
    """Remove bracketed refs, newlines, extra spaces."""
    if not isinstance(text, str):
        return ""
    # Remove references like [1], [citation needed], etc.
    text = re.sub(r'\[.*?\]', '', text)
    # Replace newlines with spaces
    text = text.replace('\n', ' ')
    # Trim
    text = text.strip()
    return text


total_pattern = re.compile(
    r'(?:'
    r'(?:[Tt]otal\s*[:\-]?\s*([~c.\d,]+))'  # Group 1: "Total: 160,000"
    r'|'
    r'([~c.\d,]+)\s*[Tt]otal'               # Group 2: "160,000 total"
    r')'
)

# 2) The generic number-word pattern, e.g. "509 killed", "4,359 wounded"
#    We'll keep it simple: ([~c.\d,]+)\s*([A-Za-z]+)
number_word_pattern = re.compile(r'([~c.\d,]+)\s*([A-Za-z]+)')
def parse_casualties_with_total(text):
    """
    Extract numeric data from strings containing 'total' or typical "<number> <word>" combos.
    Returns a dict like:
      {
        "killed": [509.0],
        "wounded": [4359.0],
        "missing": [1534.0],
        "total": [6402.0]
      }

    Steps:
    1) Clean the text (remove references, newlines).
    2) Find explicit 'total' values (both "Total: ####" and "#### total").
    3) Find generic "<number> <word>" pairs.
    4) If the same numeric value appears for "total" and another key, keep only "total."
    """
    text = clean_text(text)
    if not text:
        return {}

    result = defaultdict(list)

    # 1) Capture "total" first
    totals = total_pattern.findall(text)
    # total_pattern.findall returns a list of tuples: each tuple has 2 groups:
    #   (group1, group2)
    # If the match was "Total: 160,000", group1="160,000", group2="" (empty).
    # If the match was "160,000 total", group1="", group2="160,000".
    # We'll unify whichever is non-empty.

    for (g1, g2) in totals:
        # unify the numeric string
        num_str = g1 if g1 else g2  # whichever group is non-empty
        cleaned_num = re.sub(r'[~,c.]', '', num_str).replace(',', '')
        try:
            num_val = float(cleaned_num)
            result["total"].append(num_val)
        except ValueError:
            pass

    # 2) Capture number-word pairs
    matches = number_word_pattern.findall(text)
    for (num_str, word_str) in matches:
        # Clean the numeric part
        cleaned_num = re.sub(r'[~,c.]', '', num_str).replace(',', '')
        try:
            num_val = float(cleaned_num)
        except ValueError:
            continue
        word_str = word_str.lower()
        result[word_str].append(num_val)

    # 3) If there's a "total" key, remove duplicates from other keys
    #    For instance, if we see "including": [160000] and "total": [160000],
    #    we can remove "including" or remove that 160000 entry from "including".
    if "total" in result:
        total_vals = set(result["total"])
        # We'll build a set of all numeric values for total
        # Then for each other key, remove any numeric value that also appears in total
        for key in list(result.keys()):
            if key == "total":
                continue
            # Filter out values that are in total
            filtered = [v for v in result[key] if v not in total_vals]
            if filtered:
                result[key] = filtered
            else:
                # If no values remain, we remove the key entirely
                del result[key]

    return dict(result)
# Suppose your DataFrame is df_ww1, and you have columns:
# "Casualties and losses_Right" and "Casualties and losses_Left"

df_ww1["casualties_dict_right"] = df_ww1["Casualties and losses_Right"].apply(parse_casualties_with_total)
df_ww1["casualties_dict_left"]  = df_ww1["Casualties and losses_Left"].apply(parse_casualties_with_total)

# Inspect a few rows
print(df_ww1[["BattleName", "casualties_dict_right"]].head(10))


                       BattleName  \
0  Battle of the Crna Bend (1917)   
1       First Battle of Champagne   
2          Battle of Transylvania   
3       Battle of Soissons (1918)   
4             Battle of Istabulat   
5      First Battle of Petrozsény   
6           Second Battle of Gaza   
7         Third Battle of Krithia   
8           Battle of Łódź (1914)   
9              Battle of the Avre   

                               casualties_dict_right  
0                              {'unknown': [1626.0]}  
1                                                 {}  
2                                                 {}  
3        {'casualties': [95000.0, 13000.0, 11259.0]}  
4                                                 {}  
5        {'september': [22.0], 'prisoners': [526.0]}  
6  {'total': [6444.0], 'killed': [509.0], 'wounde...  
7                                                 {}  
8  {'total': [160000.0], 'kia': [36000.0], 'pows'...  
9                                         

In [9]:
# Pattern for "Total: 160,000" or "160,000 total"
total_pattern = re.compile(
    r'(?:'
    r'(?:[Tt]otal\s*[:\-]?\s*([~c.\d,]+))'  # group 1: "Total: 160,000"
    r'|'
    r'([~c.\d,]+)\s*[Tt]otal'               # group 2: "160,000 total"
    r')'
)

# Pattern for "X ([-–] Y)? <descriptor>"
# e.g. "10,000–12,000 men", "c.400 infantry"
strength_pattern = re.compile(
    r'([~c.\d,]+)'                 # first number
    r'(?:\s*[-–]\s*([~c.\d,]+))?'  # optional range second number
    r'\s+([A-Za-z]+)'              # descriptor (e.g. "men", "infantry")
)

def parse_strength(text):
    """
    Parses strength-related strings (e.g. "10,000 men", "c.400 infantry", "8,000–12,000 cavalry").
    Also looks for "Total: 50,000" or "50,000 total".

    Steps:
      1. Clean the text (remove references).
      2. Find explicit 'total' values (both "Total: ####" and "#### total").
      3. Find patterns of <number> or <range> plus <descriptor>.
      4. If no pattern is found but it's a single number (including "c.300"), interpret as 'troops'.
      5. Rename "men" -> "troops" and "in" -> "troops" (if they appear as keys).
      6. If a range is found (x–y), store both x and y (in ascending order).
      7. Remove any month keys (like "august") if they ended up in the dict.

    Returns a dict, e.g.:
      {
         "troops": [10000.0, 12000.0],
         "infantry": [400.0],
         "total": [50000.0]
      }
    """
    text = clean_text(text)
    if not text:
        return {}

    result = defaultdict(list)

    # --- Step 2: Look for 'total' ---
    totals = total_pattern.findall(text)
    # Each item in 'totals' is a tuple: (g1, g2)
    # If "Total: 160,000" => ( "160,000", "" )
    # If "160,000 total" => ( "", "160,000" )
    for g1, g2 in totals:
        num_str = g1 if g1 else g2
        cleaned = re.sub(r'[~,c.]', '', num_str).replace(',', '')
        try:
            val = float(cleaned)
            result["total"].append(val)
        except ValueError:
            pass

    # --- Step 3: Match <number> [- <number>] <descriptor> ---
    matches = strength_pattern.findall(text)
    for num_str1, num_str2, word_str in matches:
        # Convert the first number
        num_str1_clean = re.sub(r'[~,c.]', '', num_str1).replace(',', '')
        try:
            val1 = float(num_str1_clean)
        except ValueError:
            continue

        # Convert the second number if present
        val2 = None
        if num_str2:
            num_str2_clean = re.sub(r'[~,c.]', '', num_str2).replace(',', '')
            try:
                val2 = float(num_str2_clean)
            except ValueError:
                val2 = None

        descriptor = word_str.lower()

        # If men or in => troops
        if descriptor in ("men", "in"):
            descriptor = "troops"

        # If there's a second number, reorder if needed
        if val2 is not None:
            low, high = (val1, val2) if val1 <= val2 else (val2, val1)
            result[descriptor].extend([low, high])
        else:
            result[descriptor].append(val1)

    # --- Step 4: If still no results, check if entire string is a single number => 'troops'
    if not result and is_just_number(text):
        single_str = re.sub(r'[~,c.]', '', text).replace(',', '')
        try:
            val = float(single_str)
            result["troops"].append(val)
        except ValueError:
            pass

    # --- Step 5: Another pass if you want to rename keys that might appear outside the pattern ---
    rename_map = {"men": "troops", "in": "troops"}
    for old_key in list(result.keys()):
        if old_key in rename_map:
            new_key = rename_map[old_key]
            result[new_key].extend(result[old_key])
            del result[old_key]

    # --- Step 6: Remove month keys ---
    remove_month_keys_from_result(result)

    # Return a normal dict
    return dict(result)

In [12]:
df_ww1["strength_dict_right"] = df_ww1["Strength_Right"].apply(parse_strength)
df_ww1["strength_dict_left"]  = df_ww1["Strength_Left"].apply(parse_strength)

# Preview results
df_ww1[["BattleName", "Strength_Right", "strength_dict_right"]].head(10)

df_ww1.to_csv("WW1_strength_parsed.csv", index=False)