In [1]:
import os
import re
import pandas as pd

In [2]:
# Function to extract metadata and text from a file
def extract_metadata(file_path):
    metadata = {}
    text_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        in_metadata_section = True
        for line in file:
            # Extract metadata lines starting with #
            if in_metadata_section:
                if line.startswith('#'):
                    # Split the line by ': ' and ensure we have two parts (key-value pair)
                    parts = line[2:].strip().split(': ', 1)
                    if len(parts) == 2:
                        key, value = parts
                        metadata[key] = value
                else:
                    in_metadata_section = False
                    # First non-metadata line, start collecting text content
                    text_lines.append(line.strip())
            else:
                # Collect text content after metadata
                text_lines.append(line.strip())

    # Join text lines to form the full text content
    text_content = "\n".join(text_lines)

    return metadata, text_content



In [3]:
# Please make sure to download CLMET dataset and place all texts in .txt (default) in the folder specified under txt_folder
# Folder containing the text files
txt_folder = "/Users/kseniadvorkina/Documents/clmet cleaned/txt"

# Initialize lists to store metadata and text content
metadata_list = []
text_list = []

# Iterate over all text files in the directory
for filename in os.listdir(txt_folder):
    if filename.endswith('.txt'):
        file_path = os.path.join(txt_folder, filename)
        metadata, text_content = extract_metadata(file_path)
        metadata_list.append(metadata)
        text_list.append(text_content)

# Convert the metadata list into a pandas DataFrame
df_metadata = pd.DataFrame(metadata_list)

# Add the text content as a new column in the DataFrame
df_metadata['text'] = text_list

In [4]:
def clean_text(text):
    # Step 1: Print and remove all patterns matching "} number {" and "} numberX {"
    #matches_curly_braces = re.findall(r'\} ?(\d{1,3}[a-cA-C]?) ?\{', text)
    #for match in matches_curly_braces:
    #    print(f"Found in curly braces: }} {match} {{")
    text = re.sub(r'\} ?\d{1,3}[a-cA-C]? ?\{', '', text)

    # Step 2: Print and remove all patterns matching "[ number ]"
    #matches_square_brackets = re.findall(r'\[ ?(\d{1,3}) ?\]', text)
    #for match in matches_square_brackets:
    #    print(f"Found in square brackets: [ {match} ]")
    text = re.sub(r'\[ ?\d{1,3} ?\]', '', text)
    # step 2.1: same as 2 but extra spaces [ 1 ]
    text = re.sub(r'\[\s*\d+\s*\]', '', text)

    

    # Step 2.2: Print and remove patterns like "[  pg 154  ]" and "[  Pg 154  ]"
    #matches_pg_brackets = re.findall(r'\[ ?[pP][gG] ?(\d{1,3}) ?\]', text)
    #for match in matches_pg_brackets:
    #    print(f"Found page reference: [ pg {match} ]")
    text = re.sub(r'\[ ?[pP][gG] ?\d{1,3} ?\]', '', text)
    # step 2.3: same as 2.1 but extra spaces [ pg 154 ] or [ Pg 10 ] 
    text = re.sub(r'\[\s*[pP][gG]\s*\d+\s*\]', '', text)

    # step 2.4: same but p
    text = re.sub(r'\[\s*[pP]\s*\d+\s*\]', '', text)

    # Step 3: Remove all patterns matching "ßnumber." or "ß number"
    text = re.sub(r'ß ?\d{1,3}\.', '', text)
    text = re.sub(r'ß ?\d{1,3}', '', text)

    # Step 4: Remove "* * * * *"
    text = re.sub(r'\* \* \* \* \*', '', text)

    # Step 5: Remove words ending with "IToC" (e.g., "IIIToC", "SomeWordIToC")
    text = re.sub(r'\b\w*IToC\b', '', text)

    # Step 6: Remove tags like <page type="Page 50"/>
    text = re.sub(r'<.*?>', '', text)

    # Step 7: Replace multiple newlines with a single one
    text = re.sub(r'\n+', '\n', text).strip()

    # Step 8: Handle specific square bracket cases (Illustration, Footnotes)
    #matches_specific_brackets = re.findall(r'\[ ?([^\]]*Illustration[^\]]*|[^\]]*Footnote[^\]]*) ?\]', text)
    #for match in matches_specific_brackets:
    #    print(f"Found specific content in square brackets: [ {match} ]")
    #text = re.sub(r'\[ ?([^\]]*Illustration[^\]]*|[^\]]*Footnote[^\]]*) ?\]', '', text)

    # Step 8.1: Handle Illustrations
    #matches_specific_brackets = re.findall(r'\[ ?(Illustration[^\]]*|illustration[^\]]*) ?\]', text)
    #for match in matches_specific_brackets:
    #    print(f"Found specific content in square brackets: [ {match} ]")
    #text = re.sub(r'\[ ?(Illustration[^\]]*|illustration[^\]]*) ?\]', '', text)
    # only cleaning out illustrations withous context
    text = re.sub(r'\[\s*illustration\s*\]', '', text, flags=re.IGNORECASE)
    # same for image
    text = re.sub(r'\[\s*image\s*\]', '', text, flags=re.IGNORECASE)


    # Step 8.1: Handle footnotes
    #matches_specific_brackets = re.findall(r'\[ ?([^\]]*Footnote[^\]]*) ?\]', text)
    #for match in matches_specific_brackets:
    #    print(f"Found specific content in square brackets: [ {match} ]")
    #text = re.sub(r'\[ ?([^\]]*Footnote[^\]]*) ?\]', '', text)

    # for [ 11a ] [ 11b ] [ 103a ]
    text = re.sub(r'\[\s*\d+[a-zA-Z]\s*\]', '', text)

    # for [ A ] [ B ]
    text = re.sub(r'\[\s*[A-Za-z]\s*\]', '', text)

    # for [ Greek: kekalummenon ] [ Greek: tê Zachariou marturia ]
    #text = re.sub(r'\[\s*Greek:\s*[^\]]+\s*\]', '', text)

    # change three or more ? to two
    text = re.sub(r'\?{3,}', '??', text)

    # strip of µ and ß
    #text = re.sub(r'[µß]', '', text)

    # weird greek stuff [ ) e  ] [  = i  ] [  = o  ] [  = i  ] [ ) e  ]
    text = re.sub(r'\[\s*[=\)]\s*[a-zA-Z]\s*\]', '', text)

    # standartise the brackets for missing words to [... ]
    text = re.sub(r'\[\s*[\*\.\s]+\s*\]', '[... ]', text)


    '''
    # print out everything else remaining in brackets but the footnotes
    matches = re.findall(r'\[([^\]]*)\]', text)  # Match anything inside square brackets
    for match in matches:
        if not (re.search(r'\b(Footnote|Transcriber|Note|Endnote|Footenote)\b', match, re.IGNORECASE) or
                re.match(r'\s*\d{1,3}:\d\s*', match) or                           # [  13:1  ] [  123:1  ] [  1:1  ]
                re.match(r'\s*\.\.\.\s*', match) or                               # [... ] [ ...  ]
                re.match(r'\s*\*+\s*', match)                                     # [  **  ] [  *  ]
        ):
            print(f"Removing: [ {match} ]")
            # Remove the entire bracket including content
            #text = re.sub(re.escape(f'[{match}]'), '', text, count=1)
    '''
    
    # Remove any leftover empty pairs of brackets
    text = re.sub(r'\[\s*\]', '', text)

    
    return text

In [5]:
# test of the data cleaning function
sample_text = """
ß1. ß 5. ß105
[ 9 ] [ 10 ]
} 2 { } 99 { } 8a { } 8b { } 8c {
<page type="Page 50"/>
* * * * *
IIIToC VIToC IToC

Multiple\n\n\nnewlines.

[ Footnote 2: Mrs. Byron died August I, 1811. ]

[ Footnote 3: For R. C. Dallas, see ` Letters ', vol.
i. p. 168, note 1.
[ Footnote 1 to Letter 87 ] ]

[ bhabka ]

[ Illustration ]

[ Illustration: MRS SIDDONS and MR KEMBLE as Mr. & Mrs. Beverley Act 5.
Sc.
4.
Bev.
O!
for a few short Moments to tell you how my Heart bleeds for you. ]

[  pg 154  ] [  Pg 10  ] [  1  ]

[ Pg 50 ] [ pg 35 ] [ 1 ] [ 999 ]

[  Transcriber 's note: ` curosity ' in original  ]
[ * Note: See Appendix II. ]

[  13:1  ] [  123:1  ] [  1:1  ]

[  ENDNOTES  ] [  Endnote 1:1  ]

[... ] [ ...  ]

[  **  ] [  *  ]

to add: [  11a  ] [  11b  ] [  103a  ] [  A  ] [  B  ] [  Greek: kekalummenon  ] [  Greek: tê Zachariou marturia  ]
µ ß?

?????? (more than two ? i a row)

[ ) e  ] [  = i  ] [  = o  ] [  = i  ] [ ) e  ]
[] [

Done
"""

cleaned = clean_text(sample_text)
print("\nCleaned Text:")
print(cleaned)


Cleaned Text:
Multiple
newlines.
[ Footnote 2: Mrs. Byron died August I, 1811. ]
[ Footnote 3: For R. C. Dallas, see ` Letters ', vol.
i. p. 168, note 1.
[ Footnote 1 to Letter 87 ] ]
[ bhabka ]

[ Illustration: MRS SIDDONS and MR KEMBLE as Mr. & Mrs. Beverley Act 5.
Sc.
4.
Bev.
O!
for a few short Moments to tell you how my Heart bleeds for you. ]
  
   
[  Transcriber 's note: ` curosity ' in original  ]
[ * Note: See Appendix II. ]
[  13:1  ] [  123:1  ] [  1:1  ]
[  ENDNOTES  ] [  Endnote 1:1  ]
[... ] [... ]
[... ] [... ]
to add:      [  Greek: kekalummenon  ] [  Greek: tê Zachariou marturia  ]
µ ß?
?? (more than two ? i a row)
    
 [
Done


In [6]:
df_metadata['cleaned_text'] = df_metadata.apply(
    lambda row: print(f"Processing ID: {row['ID']}") or clean_text(row['text']), axis=1
)

Processing ID: histcorp-english-clmet-CLMET3_1_3_277
Processing ID: histcorp-english-clmet-CLMET3_1_3_263
Processing ID: histcorp-english-clmet-CLMET3_1_3_288
Processing ID: histcorp-english-clmet-CLMET3_1_1_20
Processing ID: histcorp-english-clmet-CLMET3_1_1_34
Processing ID: histcorp-english-clmet-CLMET3_1_3_303
Processing ID: histcorp-english-clmet-CLMET3_1_2_110
Processing ID: histcorp-english-clmet-CLMET3_1_3_317
Processing ID: histcorp-english-clmet-CLMET3_1_2_104
Processing ID: histcorp-english-clmet-CLMET3_1_2_138
Processing ID: histcorp-english-clmet-CLMET3_1_2_139
Processing ID: histcorp-english-clmet-CLMET3_1_2_105
Processing ID: histcorp-english-clmet-CLMET3_1_3_316
Processing ID: histcorp-english-clmet-CLMET3_1_2_111
Processing ID: histcorp-english-clmet-CLMET3_1_3_302
Processing ID: histcorp-english-clmet-CLMET3_1_1_35
Processing ID: histcorp-english-clmet-CLMET3_1_1_21
Processing ID: histcorp-english-clmet-CLMET3_1_3_289
Processing ID: histcorp-english-clmet-CLMET3_1_3_2

In [7]:
df = df_metadata.copy()

### Data Split

In [8]:
def assign_year_range(year):
    if 1710 <= year < 1780:
        return "[1710-1780]"
    elif 1780 <= year < 1850:
        return "[1780-1850]"
    elif 1850 <= year <= 1920:  # Adjust for any future range
        return "[1850-1920]"
    else:
        return "Unknown"  # If the year does not fit in the defined ranges


df.loc[:, 'printedDate'] = pd.to_numeric(df['printedDate'], errors='coerce')
df.loc[:, 'yearRange'] = df['printedDate'].apply(assign_year_range)
df = df[df['yearRange'] != 'Unknown']  # Make sure this is done on the DataFrame itself
df.loc[:, 'words'] = pd.to_numeric(df['words'].replace({',': ''}, regex=True), errors='coerce')
word_counts_by_range = df.groupby('yearRange')['words'].sum().to_dict()
df.loc[:, 'totalWordsInRange'] = df['yearRange'].map(word_counts_by_range)
df.loc[:, 'weight'] = df['words'] / df['totalWordsInRange']

In [9]:
def split_by_weight(df, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, random_seed=50):
    """
    Splits the dataframe into train, validation, and test sets based on weights.

    Args:
        df (pd.DataFrame): Dataframe containing the texts to split.
        train_ratio (float): Proportion of data for training.
        val_ratio (float): Proportion of data for validation.
        test_ratio (float): Proportion of data for testing.
        random_seed (int): Seed for reproducibility.

    Returns:
        tuple: Three dataframes - train, validation, and test.
    """
    # Shuffle the dataframe
    df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Calculate cumulative weights
    df['cumulative_weight'] = df['weight'].cumsum()
    total_weight = df['weight'].sum()

    # Define weight thresholds
    train_threshold = total_weight * train_ratio
    val_threshold = total_weight * (train_ratio + val_ratio)

    # Split based on thresholds
    train_df = df[df['cumulative_weight'] <= train_threshold]
    val_df = df[(df['cumulative_weight'] > train_threshold) & (df['cumulative_weight'] <= val_threshold)]
    test_df = df[df['cumulative_weight'] > val_threshold]

    # Drop cumulative weight column before returning
    train_df = train_df.drop(columns=['cumulative_weight'])
    val_df = val_df.drop(columns=['cumulative_weight'])
    test_df = test_df.drop(columns=['cumulative_weight'])

    return train_df, val_df, test_df

In [10]:
train_dfs = []
val_dfs = []
test_dfs = []

for year_range in df['yearRange'].unique():
    subset = df[df['yearRange'] == year_range]
    train_split, val_split, test_split = split_by_weight(subset)
    train_dfs.append(train_split)
    val_dfs.append(val_split)
    test_dfs.append(test_split)

# Combine splits from all year ranges
train_df = pd.concat(train_dfs).reset_index(drop=True)
val_df = pd.concat(val_dfs).reset_index(drop=True)
test_df = pd.concat(test_dfs).reset_index(drop=True)


# check the split
def word_count_matrix(df_train, df_val, df_test):
    # Group by 'yearRange' and sum 'words' in each split
    train_words = df_train.groupby('yearRange')['words'].sum()
    val_words = df_val.groupby('yearRange')['words'].sum()
    test_words = df_test.groupby('yearRange')['words'].sum()

    total_words = train_words + val_words + test_words

    # Create a matrix (DataFrame) for easier visualization
    matrix = pd.DataFrame({
        'Train': train_words,
        'Validation': val_words,
        'Test': test_words,
        'Train %': round(100*train_words/total_words, 3),
        'Validation %': round(100*val_words/total_words, 3),
        'Test %': round(100*test_words/total_words, 3),
    }).fillna(0)  # Fill NaN with 0 for year ranges that may not appear in a particular split

    return matrix

# Get the word count matrix
word_count_matrix_df = word_count_matrix(train_df, val_df, test_df)

print(word_count_matrix_df)

                Train  Validation     Test    Train %  Validation %     Test %
yearRange                                                                     
[1710-1780]   6866089      809985   946891  79.625616      9.393347  10.981037
[1780-1850]   8664774      899176  1315599  79.642768      8.264828  12.092404
[1850-1920]  11162763     1398787  1407057  79.913215     10.013790  10.072994


  matrix = pd.DataFrame({
