# Network analysis CCLAMP

In [1]:
#import packages
import pandas as pd
import numpy as np
import networkx as nx

In [3]:
# Read metadata file with the authors
metadata_df = pd.read_csv("C-CLAMP_metadata_gender.txt", sep="\t", encoding='utf-8')

# Create list of authors

In [5]:
# Create list of all the authors
import re
from itertools import chain

# Use str.split to split the strings and explode to create separate rows
df_cleaned = metadata_df['Author'].str.split(';', expand=True).stack()

# Reset index to clean up
df_cleaned = df_cleaned.reset_index(drop=True)

# Drop missing values
df_cleaned = df_cleaned.dropna()

# Remove duplicates
df_cleaned = df_cleaned.drop_duplicates()

# Sort the values
df_cleaned = df_cleaned.sort_values().reset_index(drop=True)

# Create a new DataFrame for the authors
Author_df = pd.DataFrame({'Author': df_cleaned})

print(Author_df)

                      Author
0             't Haakstertje
1                   A. Aerts
2           A. Agnes Sneller
3                 A. Alberts
4                    A. Ampe
...                      ...
10519           van der Veur
10520    zuster Maria Jozefa
10521            Ágnes Roboz
10522  Émile O.J.J.G. Lousse
10523        Émile Verhaeren

[10524 rows x 1 columns]


In [9]:
# save author list
Author_df.to_csv("author_list.txt", index=False, sep='\t', encoding='utf-8')

In [11]:
# make list of all authors and their birth dates

# Select Author and DOB
metadata = metadata_df[['Author', 'DOB']]

# Flatten the dataframe
flattened_data = []

for index, row in metadata.iterrows():
    authors = str(row['Author']).split('; ')
    dobs = str(row['DOB']).split('; ')
    
    for author, dob in zip(authors, dobs):
        flattened_data.append({'Author': author, 'DOB': dob})

flattened_df = pd.DataFrame(flattened_data)

# Replace 'NA' or 'nan' with NaN
flattened_df.replace(['NA', 'nan'], np.nan, inplace=True)

# Remove duplicate rows
authors_dob_df = flattened_df.drop_duplicates()

# Display the flattened dataframe
print(authors_dob_df)


                   Author              DOB
0                     NaN              NaN
1            Jan Engelman      7 juni 1900
2             Willem Maas    28 april 1897
3            Joep Nicolas   6 oktober 1897
5           Albert Helman  7 november 1903
...                   ...              ...
62928        A. De Geyter       20ste eeuw
62929            J. Hoing       20ste eeuw
62930       Flor Kielbaey       20ste eeuw
62931  Hendrik Imberechts    13 april 1922
62932            H. Aerts       20ste eeuw

[8798 rows x 2 columns]


In [13]:
authors_dob_df.to_csv("author_DOB_list.txt", index=False, sep='\t', encoding='utf-8')
authors_dob_df.head(20)

Unnamed: 0,Author,DOB
0,,
1,Jan Engelman,7 juni 1900
2,Willem Maas,28 april 1897
3,Joep Nicolas,6 oktober 1897
5,Albert Helman,7 november 1903
6,Willem Nieuwenhuis,1886
7,Karel van den Oever,19 november 1879
9,Wies Moens,28 januari 1898
10,Eduard A. Serrarens,27 december 1895
11,Henk Kuitenbrouwer,20 november 1903


# Read files and extract all mentions of authors

In [4]:
# read file and extract all mentions of authors
import re

def find_author_mentions(file_name, full_names_df):
    """
    Finds mentions of authors in a text file.

    Args:
        file_name (str): The name of the file.
        full_names_df (pd.DataFrame): DataFrame with full names.
        last_names_df (pd.DataFrame): DataFrame with last names.

    Returns:
        list: List of author mentions found in the text.
    """
    # read file
    with open(file_name, 'r', encoding='utf-8') as file:
        text_content = file.read()
    
    # extract mentions
    author_mentions = []
    
    full_name_column = full_names_df.columns[0]
    
    for index, row in full_names_df.iterrows():
        full_name = row[full_name_column]
        if full_name in text_content:
            author_mentions.append(full_name)
    
    # clean the mentions
    author_mentions = list(set(author_mentions))
    
    # return the mentions
    return author_mentions



In [5]:
# Exract all the mentions from the whole corpus
from tqdm import tqdm
import os

# Create an empty DataFrame to store mentions
mentions_df = pd.DataFrame()

# Filter files with authors
files_with_authors = metadata_df[metadata_df['Author'].notna()]

# Define the total number of iterations (filtered files)
total_iterations = len(files_with_authors)

# Get the current working directory
current_directory = os.getcwd()

# Define the directory path where the files are located (relative to the script)
directory_path = os.path.join(current_directory, 'corpus')

# Define the output file path
output_file = 'mentions_output.txt'

# Loop over each file in the corpus with progress bar
for index, row in tqdm(files_with_authors.iterrows(), total=total_iterations, desc="Processing Files"):
    file_name = row['File']
    file_path = os.path.join(directory_path, f'{file_name}.txt')  # Construct full file path
    if os.path.exists(file_path):  # Check if the file exists
        authors = find_author_mentions(file_path, Author_df)
        if authors:
            # Put mentions into a DataFrame
            m = {'File': file_name, 'Mentions': authors}
            temp_df = pd.DataFrame(m)
            mentions_df = pd.concat([mentions_df, temp_df], ignore_index=True)

# Print the final DataFrame
print(mentions_df)

# Write the mentions DataFrame to a .txt file
mentions_df.to_csv(output_file, index=False, sep='\t', encoding='utf-8')

Processing Files: 100%|████████████████████████████████████████████████████████| 40048/40048 [9:35:31<00:00,  1.16it/s]


               File             Mentions
0        GEM_1925_3         Joep Nicolas
1        GEM_1925_5   Hendrik Andriessen
2        GEM_1925_6   Willem Nieuwenhuis
3        GEM_1925_7  Karel van den Oever
4        GEM_1925_9           Wies Moens
...             ...                  ...
138371  STR_1947_99             Henricus
138372  STR_1947_99           M. Huybens
138373  STR_1947_99             R. Leijs
138374  STR_1947_99                 Léon
138375  STR_1947_99             Horatius

[138376 rows x 2 columns]


# Add mentions to authors

In [19]:
# read all mentions
all_mentions_df = pd.read_csv("mentions_output.txt", sep="\t", encoding='utf-8')
print(all_mentions_df)

FileNotFoundError: [Errno 2] No such file or directory: 'mentions_output.txt'

In [4]:
# Merge all_mentions_df and metadata_df
mentions_and_authors_df = pd.merge(all_mentions_df, metadata_df, on='File', how='inner')  # 'inner' means keep only common rows

print(mentions_and_authors_df)

# save output
mentions_and_authors_df.to_csv("mentions_and_authors.txt", index=False, sep='\t', encoding='utf-8')

               File             Mentions  Year  \
0        GEM_1925_3         Joep Nicolas  1925   
1        GEM_1925_5   Hendrik Andriessen  1925   
2        GEM_1925_6   Willem Nieuwenhuis  1925   
3        GEM_1925_7  Karel van den Oever  1925   
4        GEM_1925_9           Wies Moens  1925   
...             ...                  ...   ...   
138371  STR_1947_99             Henricus  1947   
138372  STR_1947_99           M. Huybens  1947   
138373  STR_1947_99             R. Leijs  1947   
138374  STR_1947_99                 Léon  1947   
138375  STR_1947_99             Horatius  1947   

                                                    Title  \
0       Sint Maartensavond Bij een linoleumsnede van J...   
1       Is de nieuwe meerstemmige kerkmuziek in Nederl...   
2                 Dagelijksch brood en dagelijksch leven.   
3                                     In Memoriam Fratris   
4                                                 Koraal.   
...                              

# Create network

In [15]:
# create coauthor x coauthor pairs

from itertools import combinations

# Create coauthors_df 
coauthors_df = metadata_df[['Author']].copy()

# Filter out rows with only 1 author
df_filtered = coauthors_df[coauthors_df['Author'].str.count(';') > 0]

# Function to get combinations of authors
def get_author_combinations(row):
    authors = row['Author'].split('; ')
    return list(combinations(authors, 2))  # Change the number inside combinations() for different combinations

# Apply the function to each row: this will create a new column 'Author Combinations' with a list of combinations for each row
df_filtered['Author Combinations'] = df_filtered.apply(get_author_combinations, axis=1)

# Create an empty list to store the combinations
combinations_list = []

# Iterate through each row and extract combinations
for _, row in df_filtered.iterrows():
    combinations_list.extend(row['Author Combinations'])

coauthor_combinations_df = pd.DataFrame(combinations_list, columns=['Target', 'Source'])
coauthor_combinations_reverse_df = pd.DataFrame(combinations_list, columns=['Source', 'Target'])
print(coauthor_combinations_df)
print(coauthor_combinations_reverse_df)


                   Target              Source
0            Jan Engelman         Willem Maas
1      Henk Kuitenbrouwer           Jan Bruna
2            Jan Engelman         Willem Maas
3            Jan Engelman        Albert Kuyle
4            Jan Engelman           Jan Bruna
...                   ...                 ...
30957            J. Hoing  Hendrik Imberechts
30958            J. Hoing            H. Aerts
30959       Flor Kielbaey  Hendrik Imberechts
30960       Flor Kielbaey            H. Aerts
30961  Hendrik Imberechts            H. Aerts

[30962 rows x 2 columns]
                   Source              Target
0            Jan Engelman         Willem Maas
1      Henk Kuitenbrouwer           Jan Bruna
2            Jan Engelman         Willem Maas
3            Jan Engelman        Albert Kuyle
4            Jan Engelman           Jan Bruna
...                   ...                 ...
30957            J. Hoing  Hendrik Imberechts
30958            J. Hoing            H. Aerts
30959   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Author Combinations'] = df_filtered.apply(get_author_combinations, axis=1)


In [19]:
# Create mention x author pairs 

# ADDED: reread mentions_and_authors.txt
mentions_and_authors_df = pd.read_csv("mentions_and_authors.txt", sep="\t", encoding='utf-8')

mentions_and_authors_df.to_csv("mentions_and_authors.txt", index=False, sep='\t', encoding='utf-8')

# Splitting authors column
authors_split = mentions_and_authors_df['Author'].str.split('; ', expand=True).stack()

# Dropping the original authors column
mentions_and_authors_df.drop('Author', axis=1, inplace=True)

# Restoring the index alignment
authors_split.index = authors_split.index.droplevel(1)

# Combining the DataFrame with split authors and original DataFrame
mentions_and_authors_df = mentions_and_authors_df.join(authors_split.rename('Author'))

print(mentions_and_authors_df)


               File             Mentions  Year  \
0        GEM_1925_3         Joep Nicolas  1925   
1        GEM_1925_5   Hendrik Andriessen  1925   
2        GEM_1925_6   Willem Nieuwenhuis  1925   
3        GEM_1925_7  Karel van den Oever  1925   
4        GEM_1925_9           Wies Moens  1925   
...             ...                  ...   ...   
138375  STR_1947_99             Horatius  1947   
138375  STR_1947_99             Horatius  1947   
138375  STR_1947_99             Horatius  1947   
138375  STR_1947_99             Horatius  1947   
138375  STR_1947_99             Horatius  1947   

                                                    Title  \
0       Sint Maartensavond Bij een linoleumsnede van J...   
1       Is de nieuwe meerstemmige kerkmuziek in Nederl...   
2                 Dagelijksch brood en dagelijksch leven.   
3                                     In Memoriam Fratris   
4                                                 Koraal.   
...                              

In [21]:
# create dataframe pairs_df with only the authors and the mentions
pairs_df = mentions_and_authors_df[['Mentions', 'Author']]

# Rename to target and source
pairs_df = pairs_df.rename(columns={'Mentions': 'Target', 'Author': 'Source'})

# Remove rows where mention and author have the same value
pairs_df = pairs_df[pairs_df['Source'] != pairs_df['Target']]

# Add the coauthors
pairs_df = pd.concat([pairs_df, coauthor_combinations_df], ignore_index=True)
pairs_df = pd.concat([pairs_df, coauthor_combinations_reverse_df], ignore_index=True)

# replace name of specific author
pairs_df['Target'] = pairs_df['Target'].replace('Suze la Chapelle-Roobol', 'Suze La Chapelle-Roobol')
pairs_df['Source'] = pairs_df['Source'].replace('Suze la Chapelle-Roobol', 'Suze La Chapelle-Roobol')

# Initialize weights based on direction
pairs_df['Weight'] = 1

# Increment weights for mentions as targets
pairs_df.loc[pairs_df['Source'] == pairs_df['Target'], 'Weight'] += 1

# Identify duplications
pairs_df = pairs_df.groupby(['Target', 'Source'], sort=False, as_index=False)['Weight'].sum()

pairs_df.head(50)

Unnamed: 0,Target,Source,Weight
0,Hendrik Andriessen,Albert Helman,3
1,Henri Bruning,Henk Kuitenbrouwer,12
2,Henri Bruning,Jan Bruna,3
3,Joep Nicolas,Henk Kuitenbrouwer,3
4,Joep Nicolas,Jan Bruna,2
5,Albert Kuyle,Henk Kuitenbrouwer,37
6,Albert Kuyle,Jan Bruna,3
7,Albe,Henk Kuitenbrouwer,25
8,Albe,Jan Bruna,2
9,Carel Scharten,Jan Engelman,2


# Add birth dates of the targets to the dataframe
ADAPTED: FULLY CLEANED METADATA READ IN

In [23]:
pairs_df

Unnamed: 0,Target,Source,Weight
0,Hendrik Andriessen,Albert Helman,3
1,Henri Bruning,Henk Kuitenbrouwer,12
2,Henri Bruning,Jan Bruna,3
3,Joep Nicolas,Henk Kuitenbrouwer,3
4,Joep Nicolas,Jan Bruna,2
...,...,...,...
170956,Flor Kielbaey,A. De Geyter,1
170957,Hendrik Imberechts,A. De Geyter,1
170958,Flor Kielbaey,J. Hoing,1
170959,Hendrik Imberechts,J. Hoing,1


In [25]:
import pandas as pd

# Load the authors' DOB data
authors_dob_df = pd.read_csv("author_metadata_hisclass_final.txt", sep="\t", encoding='utf-8')
authors_dob_df = authors_dob_df[['Author', 'birthDate']]

# First merge: Get Target_DOB by merging on 'Target' (pairs_df) and 'Author' (authors_dob_df)
merged_pairs_df = pd.merge(pairs_df, authors_dob_df[['Author', 'birthDate']], left_on='Target', right_on='Author', how='left')

# Rename the 'DOB' column to 'Target_DOB' for clarity
merged_pairs_df = merged_pairs_df.rename(columns={'birthDate': 'Target_DOB'})

# Second merge: Get Source_DOB by merging on 'Source' (pairs_df) and 'Author' (authors_dob_df)
merged_pairs_df = pd.merge(merged_pairs_df, authors_dob_df[['Author', 'birthDate']], left_on='Source', right_on='Author', how='left')

# Rename the 'DOB' column from the second merge to 'Source_DOB'
merged_pairs_df = merged_pairs_df.rename(columns={'birthDate': 'Source_DOB'})

# Drop the 'Author' columns (from both merges) as they are no longer needed
merged_pairs_df = merged_pairs_df.drop(columns=['Author_x', 'Author_y'])

# Display the final result
print(merged_pairs_df)



                    Target              Source  Weight  Target_DOB  Source_DOB
0       Hendrik Andriessen       Albert Helman       3  1892-09-17  1903-11-07
1            Henri Bruning  Henk Kuitenbrouwer      12  1900-07-10  1903-11-20
2            Henri Bruning           Jan Bruna       3  1900-07-10  1876-08-17
3             Joep Nicolas  Henk Kuitenbrouwer       3  1897-10-06  1903-11-20
4             Joep Nicolas           Jan Bruna       2  1897-10-06  1876-08-17
...                    ...                 ...     ...         ...         ...
173896       Flor Kielbaey        A. De Geyter       1  19xx-xx-xx  19xx-xx-xx
173897  Hendrik Imberechts        A. De Geyter       1  1922-04-13  19xx-xx-xx
173898       Flor Kielbaey            J. Hoing       1  19xx-xx-xx  19xx-xx-xx
173899  Hendrik Imberechts            J. Hoing       1  1922-04-13  19xx-xx-xx
173900  Hendrik Imberechts       Flor Kielbaey       1  1922-04-13  19xx-xx-xx

[173901 rows x 5 columns]


# Filter the network by Target_DOB

In [27]:
import re
import pandas as pd

# Identify rows to drop
rows_to_drop = []

# Iterate over the DataFrame
for index, row in merged_pairs_df.iterrows():
    dob_value = row['Target_DOB']

    if pd.isna(dob_value):
        # Skip rows with NaN values in 'DOB'
        continue

    else:
        year_match = re.search(r'(\d{2})(\w{2})', dob_value)
        if year_match:
            century = int(year_match.group(1))
            if century >= 17:
                continue
            rows_to_drop.append(index)        

# Drop identified rows from the DataFrame
merged_pairs_df = merged_pairs_df.drop(rows_to_drop).reset_index(drop=True)

# Filter the network by Source_DOB

In [29]:
import re
import pandas as pd

# Identify rows to drop
rows_to_drop = []

# Iterate over the DataFrame
for index, row in merged_pairs_df.iterrows():
    dob_value = row['Source_DOB']
    
    if pd.isna(dob_value):
        # Skip rows with NaN values in 'DOB'
        continue

    else:
        year_match = re.search(r'(\d{2})(\w{2})', dob_value)
        if year_match:
            century = int(year_match.group(1))
            if century >= 17:
                continue
            rows_to_drop.append(index)

# Drop identified rows from the DataFrame
merged_pairs_df = merged_pairs_df.drop(rows_to_drop).reset_index(drop=True)

In [31]:
merged_pairs_df.head(100)

Unnamed: 0,Target,Source,Weight,Target_DOB,Source_DOB
0,Hendrik Andriessen,Albert Helman,3,1892-09-17,1903-11-07
1,Henri Bruning,Henk Kuitenbrouwer,12,1900-07-10,1903-11-20
2,Henri Bruning,Jan Bruna,3,1900-07-10,1876-08-17
3,Joep Nicolas,Henk Kuitenbrouwer,3,1897-10-06,1903-11-20
4,Joep Nicolas,Jan Bruna,2,1897-10-06,1876-08-17
...,...,...,...,...,...
95,Bernard Verhoeven,Henk Kuitenbrouwer,5,1897-04-29,1903-11-20
96,Leopold,Albert Helman,2,1860-09-07,1903-11-07
97,Leopold,Jan Bruna,1,1860-09-07,1876-08-17
98,Leopold,Jan Engelman,8,1860-09-07,1900-06-07


# Extra filtering step for specific authors

In [33]:
# Throw out: Leopold, Peeters, Hubert, Henricus, Ian, Homerus, Horatius, Léon, and the journals
# ADDED: Throw out authors with non-unique names

filter_list_expanded = ["Leopold", "Peeters", "Hubert", "Henricus", "Ian", "Homerus", "Horatius", "Léon", "Constantijn",
                        "[tijdschrift] Dietsche Warande en Belfort", "[tijdschrift] Vaderlandsche Letteroefeningen",
                        "[tijdschrift] Van Onzen Tijd", "[tijdschrift] Nieuw Vlaams Tijdschrift", "[tijdschrift] Ontmoeting",
                        "C. van Vollenhoven", "Pieter Rutger Feith", "S. Davids", "Suze La Chapelle-Roobol", "P. Hoekstra",
                        "Peter van der Veer", "Peter van Steen", "M.C. Tideman", "Marten Brouwer", "Martinus Nijhoff",
                        "Johan Winkler", "Jozef van Mierlo", "J.M. van Bemmelen", "Jan de Jong", "J.J. Belinfante", "Hieronymus van Alphen",
                        "J.D. van der Waals", "H.J. Kiewiet de Jonge", "F.E.J. Malherbe", "E.L. Levie", "Dirk Bax", "C. de Waal",
                        "C. Kramer", "Arie de Froe", "Chr.P. van Eeghen", "Anne de Vries", "Allard Pierson", "A.A. van Schelven", "J. Brants"]

# Create a boolean mask for rows to keep
mask = ~(merged_pairs_df['Target'].isin(filter_list_expanded) | merged_pairs_df['Source'].isin(filter_list_expanded))

# Apply the mask to keep only the rows that don't match the filter_list
filtered_df = merged_pairs_df[mask]

filtered_df.head(50)

Unnamed: 0,Target,Source,Weight,Target_DOB,Source_DOB
0,Hendrik Andriessen,Albert Helman,3,1892-09-17,1903-11-07
1,Henri Bruning,Henk Kuitenbrouwer,12,1900-07-10,1903-11-20
2,Henri Bruning,Jan Bruna,3,1900-07-10,1876-08-17
3,Joep Nicolas,Henk Kuitenbrouwer,3,1897-10-06,1903-11-20
4,Joep Nicolas,Jan Bruna,2,1897-10-06,1876-08-17
5,Albert Kuyle,Henk Kuitenbrouwer,37,1904-02-17,1903-11-20
6,Albert Kuyle,Jan Bruna,3,1904-02-17,1876-08-17
7,Albe,Henk Kuitenbrouwer,25,1902-06-08,1903-11-20
8,Albe,Jan Bruna,2,1902-06-08,1876-08-17
9,Carel Scharten,Jan Engelman,2,1878-03-14,1900-06-07


# Save the network

In [35]:
# Write the network to a .txt file
filtered_df.to_csv("CCLAMP_Directed_Network_update.txt", index=False, sep='\t', encoding='utf-8')