You may need to install 'openpyxl' (e.g. `pip install openpyxl`) for pandas' read_excel function to read .xlsx files.

In [1]:
import numpy as np
import pandas as pd
from typing import List, Tuple

from collections import Counter

In [31]:
def age_difference(df: pd.DataFrame, col_a: str, col_b: str) -> pd.DataFrame:
    """
    Flags whether two correspondents were born at least 20 years apart.

    The label is "TRUE" when the absolute difference between the two birth
    years is 20 or more and "FALSE" otherwise. No label (None) is given when
    either birth year is missing or is one of the placeholders "UNK"/"MULT",
    because a gap cannot be computed from a single known year.

    Arguments:
        df (pd.DataFrame): the dataframe you want to pass through this function.
        col_a (str): select the column of sender birth dates.
        col_b (str): select the column of addressee birth dates.

    Returns:
        pd.DataFrame: the same data frame, modified in place, with the added
            column "AGE_GAP_OVER_20".
    """
    placeholders = ("UNK", "MULT")
    age_diff_labels = []

    for sender_year, addressee_year in zip(df[col_a], df[col_b]):
        # Both years are needed: with only one known year the subtraction
        # would yield NaN (silently labelled "FALSE") or raise a TypeError.
        if pd.isna(sender_year) or pd.isna(addressee_year):
            age_diff_labels.append(None)
        elif sender_year in placeholders or addressee_year in placeholders:
            age_diff_labels.append(None)
        else:
            difference = abs(sender_year - addressee_year)
            age_diff_labels.append("TRUE" if difference >= 20 else "FALSE")

    df["AGE_GAP_OVER_20"] = age_diff_labels
    return df

# While this function does what it is supposed to do, relying on it completely is not recommended,
# because information is lost this way: although we do not know the birth dates of all the
# individuals in the dataset, it is occasionally obvious that there is an age gap of 20+ years
# between sender and recipient. For instance, we do not know Barbara Hartshorne's exact year of birth
# (based on Murphy's The Worlds of the Jeake Family of Rye, it is around 1622), but
# it is obvious that she is at least 20 years older than, for instance, her granddaughter Elizabeth
# Jeake (jr). It is also reasonable to assume that she was older than 20 when she gave birth to her
# daughter Elizabeth Hartshorne/Jeake, since Richard Hartshorne was her third husband (described by
# Murphy as "a couple in their middle age", p.21) after John Holman and George Harding, with both of
# whom she had children as well. By relying on this function alone we would thus lose all this
# information, so it is best used as a sanity check to see whether the manual calculations were done correctly.

In [2]:
def age_diff_int(df: pd.DataFrame, col_a: str, col_b: str) -> pd.DataFrame:
    """
    Calculates the age difference between two correspondents by subtracting their birth dates.
    Same idea as `age_difference`, except that this stores the gap in years rather than a
    "TRUE"/"FALSE" label.

    The gap is left empty (None) when either birth year is missing or is one of the
    placeholders "UNK"/"MULT", since it cannot be computed from a single known year.

    Arguments:
        df (pd.DataFrame): the dataframe you want to pass through this function.
        col_a (str): select the column of sender birth dates.
        col_b (str): select the column of addressee birth dates.

    Returns:
        pd.DataFrame: the same data frame, modified in place, with the added
            column "AGE_GAP".
    """
    placeholders = ("UNK", "MULT")
    age_diff_values = []

    for sender_year, addressee_year in zip(df[col_a], df[col_b]):
        age_diff = None
        # Both years must be present: int() raises on NaN/None, so a row with
        # only one known birth year must be skipped rather than converted.
        both_present = pd.notna(sender_year) and pd.notna(addressee_year)
        if (
            both_present
            and sender_year not in placeholders
            and addressee_year not in placeholders
        ):
            age_diff = abs(int(sender_year) - int(addressee_year))
        age_diff_values.append(age_diff)

    df["AGE_GAP"] = age_diff_values
    return df

In [6]:
def age_diff_int(df: pd.DataFrame, col_a: str, col_b: str) -> pd.DataFrame:
    """
    Calculates the age difference between two correspondents by subtracting their birth dates.
    Same idea as `age_difference`, except that this stores the gap in years rather than a
    "TRUE"/"FALSE" label.

    The gap is left empty (None) when either birth year is missing or is one of the
    placeholders "UNK"/"MULT", since it cannot be computed from a single known year.

    NOTE(review): this cell redefines the `age_diff_int` from an earlier cell; when both
    cells are run, this later definition is the one in effect.

    Arguments:
        df (pd.DataFrame): the dataframe you want to pass through this function.
        col_a (str): select the column of sender birth dates.
        col_b (str): select the column of addressee birth dates.

    Returns:
        pd.DataFrame: the same data frame, modified in place, with the added
            column "AGE_GAP".
    """
    placeholders = ("UNK", "MULT")
    age_diff_values = []

    for sender_year, addressee_year in zip(df[col_a], df[col_b]):
        age_diff = None
        # Both years must be present: int() raises on NaN/None, so a row with
        # only one known birth year must be skipped rather than converted.
        both_present = pd.notna(sender_year) and pd.notna(addressee_year)
        if (
            both_present
            and sender_year not in placeholders
            and addressee_year not in placeholders
        ):
            # abs() is symmetric, so one subtraction covers both directions.
            age_diff = abs(int(sender_year) - int(addressee_year))
        age_diff_values.append(age_diff)

    df["AGE_GAP"] = age_diff_values
    return df

def addressee_over_40(df: pd.DataFrame, col_a: str, col_b: str) -> pd.DataFrame:
    """
    Labels each row by whether at least 40 years lie between the year of
    writing and the addressee's birth year.

    Labels: "TRUE" / "FALSE" for a computable gap (absolute difference,
    threshold >= 40), "UNK" or "MULT" when the birth year holds that
    placeholder, and None when either value is missing.

    Arguments:
        df (pd.DataFrame): the dataframe you want to pass through this function.
        col_a (str): select the column of year of writing.
        col_b (str): select the column of addressee birth date.

    Returns:
        pd.DataFrame: the same data frame, modified in place, with the added
            column "ADDRESSEE_OVER_40".
    """
    def label_for(written, born):
        # Guard clauses first: missing values, then the two placeholders.
        if pd.isna(written) or pd.isna(born):
            return None
        if born == "UNK":
            return "UNK"
        if born == "MULT":
            return "MULT"
        return "TRUE" if abs(written - born) >= 40 else "FALSE"

    df["ADDRESSEE_OVER_40"] = [
        label_for(record[col_a], record[col_b]) for _, record in df.iterrows()
    ]
    return df

def sender_over_40(df: pd.DataFrame, col_a: str, col_b: str) -> pd.DataFrame:
    """
    Labels each row by whether at least 40 years lie between the year of
    writing and the sender's birth year.

    Labels: "TRUE" / "FALSE" for a computable gap (absolute difference,
    threshold >= 40), "UNK" or "MULT" when the birth year holds that
    placeholder, and None when either value is missing.

    Arguments:
        df (pd.DataFrame): the dataframe you want to pass through this function.
        col_a (str): select the column of year of writing.
        col_b (str): select the column of sender birth date.

    Returns:
        pd.DataFrame: the same data frame, modified in place, with the added
            column "SENDER_OVER_40".
    """
    def label_for(written, born):
        # Guard clauses first: missing values, then the two placeholders.
        if pd.isna(written) or pd.isna(born):
            return None
        if born == "UNK":
            return "UNK"
        if born == "MULT":
            return "MULT"
        return "TRUE" if abs(written - born) >= 40 else "FALSE"

    df["SENDER_OVER_40"] = [
        label_for(record[col_a], record[col_b]) for _, record in df.iterrows()
    ]
    return df

def pairs(df: pd.DataFrame, col_a: str, col_b: str, target_col: str = "SENDER-ADDRESSEE_PAIR") -> pd.DataFrame:
    """
    Combines two columns into a single quoted pair string per row.

    Each row gets the string '"<col_a value>", "<col_b value>"'; rows where
    either value is missing get None instead.

    Arguments:
        df (pd.DataFrame): the dataframe you want to pass through this function.
        col_a (str): select the column holding the first member of the pair (the sender).
        col_b (str): select the column holding the second member of the pair (the addressee).
        target_col (str): name of the column the pair strings are written to.

    Returns:
        pd.DataFrame: the same data frame, modified in place, with the added column.
    """
    combined = []

    for _, record in df.iterrows():
        first, second = record[col_a], record[col_b]
        # A pair needs both members; leave the cell empty otherwise.
        if pd.isna(first) or pd.isna(second):
            combined.append(None)
            continue
        combined.append(f'"{first}", "{second}"')

    df[target_col] = combined
    return df

In [6]:
# Load the "jeake_metadata" sheet of the metadata workbook into a DataFrame
# (reading .xlsx files may require the 'openpyxl' package).
jeake = pd.read_excel("jeake_metadata_v6.xlsx",
                      sheet_name="jeake_metadata")
# Preview the first rows; as the last expression it is shown as the cell output.
jeake.head()

Unnamed: 0,TITLE,PAGE,NR,ID,SALUTATION,SIGN-OFF,SENDER_RAW,DATELINE,DATE,YEAR,...,ADDRESSEE_BIRTH_YEAR,ADDRESSEE_GENERATION,ADDRESSEE_IS_+40,GEN_DIFFERENCE,AGE_DIFF_+20,GENDER_SENDER,GENDER_ADDRESSEE,GENDER_PAIR,CONNECTION_TYPE,SENDER-ADDRESSEE_PAIR
0,"[1] Samuel Jeake senior to Mrs Wenborn, 24 Mar...",23,1,ESRO FRE 4223 fo. 60,Godly friend,Your poore friend lately robbed of his chiefes...,Samuel Jeake,"Rie March 24""",1639/40,1640.0,...,UNK,UNK,UNK,UNK,UNK,MALE,FEMALE,MALE-FEMALE,FRIEND,"""SAMUEL JEAKE SR"", ""MRS WENBORN"""
1,[2] Exchange of letters between Samuel Jeake s...,24,2,ESRO FRE 4223 fos. 61-4,Dearely beloved,your loathed,Sa: Jeake,"Rey May 20"" 1641",,1641.0,...,UNK,UNK,UNK,UNK,UNK,MALE,MALE,MALE-MALE,FRIEND,"""SAMUEL JEAKE SR"", ""JOHN COULTON"""
2,[3] Samuel Jeake senior to his friends in the ...,32,3,ESRO FRE 4223 fos. 69-70,Dearest and most entirely beloved friends,Soe prayeth yours whilst his owne,Samuel Jeake,,,,...,MULT,UNK,MULT,UNK,UNK,MALE,MULT,MALE-MULT,FRIEND,"""SAMUEL JEAKE SR"", ""FRIENDS IN THE ARMY"""
3,[4] Samuel Jeake senior and others to General ...,33,4,ESRO FRE 4223 fos. 102-3,Right Honourable,,,,,,...,1612,1600-1640,UNK,UNK,UNK,MULT,MALE,MULT-MALE,POLITICAL,"""SAMUEL JEAKE SR AND OTHERS"", ""GENERAL FAIRFAX"""
4,"[5] Samuel Jeake senior to Frances Hartridge, ...",36,5,ESRO FRE 4223 fo. 104,Most deare & entirely beloved friend,Yours as his owne and more,Samuel Jeake,1650,,1650.0,...,1630,1600-1640,FALSE,0,False,MALE,FEMALE,MALE-FEMALE,FRIEND,"""SAMUEL JEAKE SR"", ""FRANCES JEAKE-HARTRIDGE"""


In [7]:
# Add the AGE_GAP column. age_diff_int writes the column onto `jeake` itself,
# so its return value does not need to be captured.
age_diff_int(jeake, "SENDER_BIRTH_YEAR", "ADDRESSEE_BIRTH_YEAR")
# Show 10 randomly sampled rows to spot-check the new column.
jeake.sample(10)

Unnamed: 0,TITLE,PAGE,NR,ID,SALUTATION,SIGN-OFF,SENDER_RAW,DATELINE,DATE,YEAR,...,ADDRESSEE_GENERATION,ADDRESSEE_IS_+40,GEN_DIFFERENCE,AGE_DIFF_+20,GENDER_SENDER,GENDER_ADDRESSEE,GENDER_PAIR,CONNECTION_TYPE,SENDER-ADDRESSEE_PAIR,AGE_GAP
26,[27| Samuel Jeake senior to Samuel Jeake junio...,61,27,ESRO FRE 4817,Lo: sonne,Thomas Shoosmith & his mother remember to you.”,Sa: Jeake.,"Rye October 24"""" 1668.",,1668.0,...,1640-1680,FALSE,1,True,MALE,MALE,MALE-MALE,FAMILY,"""SAMUEL JEAKE SR"", ""SAMUEL JEAKE JR""",29.0
202,"[203] Thomas Miller to Samuel Jeake junior, 15...",216,203,ESRO FRE 5218,,Pray advise me of the receipt of this letter p...,Tho: Miller,"London the 15"" October 1685",,1685.0,...,1640-1680,FALSE,0,False,MALE,MALE,MALE-MALE,BUSINESS; FRIEND,"""THOMAS MILLER"", ""SAMUEL JEAKE JR""",8.0
143,"[143] John Medley to Barbara Hartshorne, 23 No...",161,143,ESRO FRE 5294,Madam,Your obliged servant,Jn° Medley,,November 23 82,1682.0,...,1600-1640,UNK,UNK,UNK,MALE,FEMALE,MALE-FEMALE,BUSINESS,"""JOHN MEDLEY"", ""BARBARA HARTSHORNE""",
17,"[18] Frances Hartridge to Samuel Jeake senior,...",46,18,ESRO FRE 4223 fo. 118,Deare love,be not unmindfull of the gloves I spake to you...,,,July the 9 1651,1651.0,...,1600-1640,FALSE,1,False,FEMALE,MALE,FEMALE-MALE,FAMILY,"""FRANCES JEAKE-HARTRIDGE"", ""SAMUEL JEAKE SR""",7.0
282,"[283] Francis Jeake to Elizabeth Tucker, 18 Ju...",298,283,"RMA, Selmes Manuscripts, RYEYT N39.59.1",Hon: Mother,Your dutifull Son,Francis Jeake,Tenterden July 18 1712,,1712.0,...,1640-1680,True,1,True,MALE,FEMALE,MALE-FEMALE,FAMILY,"""FRANCIS JEAKE"", ""ELIZABETH JEAKE""",33.0
106,[106] Christopher Blackwood to Samuel Jeake ju...,130,106,ESRO FRE 4946,Deare Cozin,Your affect lo Cozen,Chr Blackwood,"Dublin Aprill the 3"" 1680",,1680.0,...,1640-1680,FALSE,0,False,MALE,MALE,MALE-MALE,FAMILY,"""CHRISTOPHER BLACKWOOD JR"", ""SAMUEL JEAKE JR""",2.0
156,[156] Samuel Jeake senior to Samuel Jeake juni...,175,156,ESRO FRE 5103,Dear Son,Holwell” hath set out an Appendix to his forme...,Sa: Jeake,,June 7 1683,1683.0,...,1640-1680,FALSE,1,True,MALE,MALE,MALE-MALE,FAMILY,"""SAMUEL JEAKE SR"", ""SAMUEL JEAKE JR""",29.0
104,"[104] John Mackley to Samuel Jeake junior, 27 ...",129,104,ESRO FRE 4944,Lo Cousin,I shall be at the sine of the sune the very ne...,John Mackley,"Southwarke March the 27"" 1680",,1680.0,...,1640-1680,FALSE,0,UNK,MALE,MALE,MALE-MALE,EXTENDED FAMILY,"""JOHN MACKLEY"", ""SAMUEL JEAKE JR""",
296,"[297] Elizabeth Tucker to Barbara Jeake, 31 Ma...",309,297,ESRO FRE 5392,,Post paid 3d,E Tucker,,May the 31* 1733,1733.0,...,1680-1720,FALSE,1,True,FEMALE,FEMALE,FEMALE-FEMALE,FAMILY,"""ELIZABETH JEAKE"", ""BARBARA JEAKE""",28.0
239,"[240] Samuel Jeake junior to Elizabeth Jeake, ...",258,240,ESRO FRE 5319,My Dear,Your Ever Lo: husband,Sa: Jeake,"London Nov. 9"" 1697",,1697.0,...,1640-1680,FALSE,0,False,MALE,FEMALE,MALE-FEMALE,FAMILY,"""SAMUEL JEAKE JR"", ""ELIZABETH JEAKE""",15.0


In [22]:
# Save the updated table as CSV; index=False keeps the DataFrame index out of the file.
jeake.to_csv("jeake_metadata_updated.csv", index=False)