In [80]:
import re
from typing import Optional

import dateparser
import pandas as pd

In [81]:
# Load the letters dataset; missing values are replaced by empty strings.
df = pd.read_csv("vw.csv").fillna("")

# Preview four random rows, then summarise the columns.
display(df.sample(4))

df.describe()

Unnamed: 0,date,recipient,place,text
3746,Thursday [30 Sept 1915],Margaret Llewelyn Davies,"Asheham, Rodmell [Sussex]","I suppose, Miss Davies, you are full of good w..."
2744,Thursday [15 April 1937],V. Sackville-West,52 Tavistock Sqre. [W.C.1.],"Yes, do come to lunch on Monday 1.30.\nI’m per..."
1248,Wednesday [21 September 1927],V. Sackville-West,"Monks House, Rodmell, [Sussex]",The M.S. has just come. So sorry I forgot it.\...
3156,2nd Dec 39,Judith Stephen,"Monk’s House, Rodmell, Lewes [Sussex]","Dear Judith,\nI’m glad you’ve been reading Jul..."


Unnamed: 0,date,recipient,place,text
count,3756,3756,3756,3756
unique,3528,254,503,3756
top,[December? 1903],Vanessa Bell,"52 Tavistock Square, W.C.1","My dear Hugh,\nI gave Vanessa your message, an..."
freq,14,485,412,1


In [82]:
# Virginia Woolf's lifetime (birth and death years), used to validate extracted years
VW_BIRTH = 1882
VW_DEATH = 1941


def decade(year):
    """Return the last two digits of `year` as an int, e.g. decade(1941) -> 41."""
    return int(str(year)[-2:])


def century(year):
    """Return `year` without its last two digits as an int, e.g. century(1941) -> 19."""
    return int(str(year)[:-2])

In [83]:
"""
Function to extract the year from the text in the `date` column of the dataset

Strategies (in order of priority):
    1. Using the dateparser package
    2. Using a regex for 4 consecutive digits
    3. Using a regex for 2 consecutive digits
"""
def extract_year(full_date: str) -> Optional[str]:
    """Extract the year of a letter from the free text of its `date` field.

    Strategies, in order of priority:
        1. the `dateparser` package;
        2. a regex for 4 consecutive digits;
        3. a regex for 2 consecutive digits (the last pair found in the text).

    Years written with two digits are expanded using Virginia Woolf's
    lifetime: values from decade(VW_BIRTH) upwards become 18xx, values up to
    decade(VW_DEATH) become 19xx.

    Args:
        full_date: raw date text, e.g. "Thursday [30 Sept 1915]" or "2nd Dec 39".

    Returns:
        The year as a string, or None when no year between VW_BIRTH and
        VW_DEATH (inclusive) can be determined.
    """
    if pd.isnull(full_date):
        return None

    cleaned = str(full_date).replace("?", "")  # e.g. "[December? 1903]" -> "[December 1903]"

    # Look the function up outside the try block so that a missing `dateparser`
    # import fails loudly instead of silently disabling strategy 1.
    parse = dateparser.parse
    try:
        parsed = parse(cleaned)
        parsed_year = parsed.year if parsed is not None else None
    except Exception:  # best effort: any parser failure falls back to the regexes
        parsed_year = None

    if parsed_year is not None:
        year = str(parsed_year)
        numbers = re.findall(r"\d+", cleaned)
        # The parser may expand a two-digit year into the wrong century
        # ("39" -> 2039). When the parsed year is not written out in the text
        # but its last two digits are the last number of the text, treat it as
        # a two-digit year and expand it from the lifetime below. A bare
        # two-digit number is ambiguous (day vs. year); like the regex
        # fallback, the last number is assumed to be the year.
        # If neither holds, the year is kept as parsed and left to the
        # lifetime validation at the end.
        if year not in cleaned and numbers and numbers[-1] == year[-2:]:
            year = year[-2:]
    else:
        match = re.search(r"\d{4}", cleaned)  # try regex for year (4 digits)
        if match:
            year = match.group(0)
        else:
            pairs = re.findall(r"\d{2}", cleaned)  # 2 digits, at the end of the date
            if not pairs:
                return None
            year = pairs[-1]

    # Expand two-digit years; zero-pad so that "05" becomes "1905", not "195".
    if len(year) == 2:
        two_digits = int(year)
        if two_digits >= decade(VW_BIRTH):  # between 1882 and 1899
            year = f"18{two_digits:02d}"
        elif two_digits <= decade(VW_DEATH):  # between 1900 and 1941
            year = f"19{two_digits:02d}"

    # Validate that the year is in her lifetime, otherwise leave it empty.
    if not VW_BIRTH <= int(year) <= VW_DEATH:
        return None

    return year


In [86]:
# Derive the `year` column from the free-text dates and place it right after `date`.
df = (
    df
    .assign(year=df["date"].apply(extract_year))
    .loc[:, ["date", "year", "recipient", "place", "text"]]
)

# Show ten random rows to eyeball the extraction.
df.loc[:, ["date", "year"]].sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3756 entries, 0 to 3755
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       3756 non-null   object
 1   year       3747 non-null   object
 2   recipient  3756 non-null   object
 3   place      3756 non-null   object
 4   text       3756 non-null   object
dtypes: object(5)
memory usage: 146.8+ KB
None


Unnamed: 0,date,year
986,[January 1904],1904
2442,[25 October 1934],1934
3044,Thursday [10 November 1938],1938
727,10th Aug. 1923,1923
1530,Sunday 3rd Nov [1929],1929
140,[May 1917],1917
1591,Tuesday [4 March 1930],1930
1119,[2 November 1926],1926
1606,April 18th 1930,1930
3135,8th Sept [1939],1939


In [87]:
# Write the letters, including the derived `year` column, without the index column.
df.to_csv(path_or_buf="vw_with_years.csv", index=False)

TypeError: to_csv() got an unexpected keyword argument 'dtype'