In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from tqdm import tqdm
from glob import glob
from matplotlib import pyplot as plt
import json
from collections import Counter
from IPython.display import clear_output
from scipy.stats import pearsonr, spearmanr
import csv

In [2]:
%env NX_CUGRAPH_AUTOCONFIG=True
import networkx as nx

env: NX_CUGRAPH_AUTOCONFIG=True


  backend_info.update(_get_backends("networkx.backend_info", load_and_call=True))


In [4]:
def proc(filename):
    with open(filename) as file:
        data=json.load(file)
    return data

In [5]:
def create_character_data(data, printTop):
    character_data = []
    
    for character in data["characters"]:
    
        agentList=" ".join([i["w"].lower() for i in character["agent"]])
        patientList=" ".join([i["w"].lower() for i in character["patient"]])
        modList=" ".join([i["w"] for i in character["mod"] if len(character["mod"]) > 0])
    
        character_id=character["id"]
        count=character["count"]
    
        referential_gender_distribution=referential_gender_prediction="unknown"
    
        if character["g"] is not None and character["g"] != "unknown":
            referential_gender_distribution=character["g"]["inference"]
            referential_gender=character["g"]["argmax"]
    
        mentions=character["mentions"]
        proper_mentions=mentions["proper"]
        max_proper_mention=""
        role = ""
        if len(proper_mentions) > 0:
            max_proper_mention = mentions["proper"][0]["n"]
            role = "proper"
        elif len(mentions["common"]) > 0:
            max_proper_mention = mentions["common"][0]["n"]
            role = "common"
        elif len(mentions["pronoun"]) > 0 and mentions["pronoun"][0]["n"] in ["I", "My", "my", "mine"]:
            max_proper_mention = "I"
            role = "self"
            
        temp={"id": character["id"],
              "name": max_proper_mention,
              "gender": referential_gender,
              "mod": modList,
              "agent_list": agentList,
              "patient_list": patientList,
              "count": count,
              "role": role}    
    
        character_data.append(temp)
    
    character_data = pd.DataFrame(character_data)
    character_data = character_data[character_data["count"] >= printTop]
    return character_data
    
def get_event_character_df(book, folder, count_threshold = 1):
    
    book_file = "{}/{}/{}.book".format(folder,book,book)
    data = proc(book_file)
    
    token_file = "{}/{}/{}.tokens".format(folder,book,book)
    token_df = pd.read_csv(token_file, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
    events_df = token_df.copy()

    # Processing patient data
    patient_df = []
    for cha in data["characters"]:
        for i in cha["patient"]:
            cha_patient_temp = {"patient": cha["id"],
                                "event": i["w"],
                                "event_id": i["i"]}
            patient_df.append(cha_patient_temp)
    patient_df = pd.DataFrame(patient_df)
    
    # Processing agent data
    agent_df = []
    for cha in data["characters"]:
        for i in cha["agent"]:
            cha_agent_temp = {"agent": cha["id"],
                                "event": i["w"],
                                "event_id": i["i"]}
            agent_df.append(cha_agent_temp)
    agent_df = pd.DataFrame(agent_df)

    # Combining with events data
    events_df = events_df.merge(patient_df[["event_id", "patient"]], how="left", left_on="token_ID_within_document", right_on="event_id")
    events_df = events_df.merge(agent_df[["event_id", "agent"]], how="left", left_on="token_ID_within_document", right_on="event_id")
    events_df = events_df[~events_df[["agent", "patient"]].isnull().any(axis=1)]
    events_df = events_df.drop(["event_id_x", "event_id_y"], axis=1)
    events_df = events_df[~(events_df.agent == events_df.patient)] # filter out events that agent and patient are the same

    # extract characters
    character_data = create_character_data(data, count_threshold)
    character_data = character_data[~(character_data.name=='')] # filter out characters that do not have a name

    # Combine events with characters
    events_df = events_df.merge(character_data[["id", "name"]].add_prefix('agent_'), how="left", left_on="agent", right_on="agent_id")
    events_df = events_df.merge(character_data[["id", "name"]].add_prefix('patient_'), how="left", left_on="patient", right_on="patient_id")
    events_df = events_df.dropna(subset=["agent_name", "patient_name"])
    
    character_data_new = character_data.drop_duplicates(subset=["name"]).reset_index(drop=True)
    character_data_new["mod"] = character_data_new["name"].map(character_data.groupby("name")["mod"].apply(list).apply(lambda x: " ".join(x)))
    character_data_new["agent_list"] = character_data_new["name"].map(character_data.groupby("name")["agent_list"].apply(list).apply(lambda x: " ".join(x)))
    character_data_new["patient_list"] = character_data_new["name"].map(character_data.groupby("name")["patient_list"].apply(list).apply(lambda x: " ".join(x)))
    events_df = events_df[~(events_df.agent_name==events_df.patient_name)]

    return character_data_new, events_df, token_df

def get_network_metrics(G0):
    
    n_nodes = G0.number_of_nodes()
    n_edges = G0.number_of_edges()
    network_density = nx.density(G0)
    
    if n_nodes > 1 and n_edges > 1:

        # Average Clustering
        average_clustering = nx.average_clustering(G0, weight="weight")

        # Modularity
        communities = nx.community.louvain_communities(G0, weight="weight")
        modularity = nx.community.modularity(G0,communities, weight="weight" )

        # Shortest Path Length
        average_shortest_path_largest_component = nx.average_shortest_path_length(G0, weight=None)
            

        network_metrics = {"n_nodes": n_nodes,
                           "n_edges": n_edges,
                           "network_density": network_density,
                           "average_clustering": average_clustering,
                           "modularity": modularity,
                           "average_shortest_path_largest_component": average_shortest_path_largest_component}

    else:
        network_metrics = {"n_nodes": n_nodes,
                           "n_edges": n_edges,
                           "network_density": np.nan,
                           "average_clustering": np.nan,
                           "modularity": np.nan,
                           "average_shortest_path_largest_component": np.nan}
    return network_metrics

In [22]:
meta_df = pd.read_csv("./gutenberg_meta_df.csv")
books = meta_df.id.unique()

books_list = []
for b in tqdm(books):
    outputDir = "../../gutenberg_standard/new_analysis/book_nlp_output_small/{}/".format(b)
    idd = b
    if os.path.exists(outputDir + "{}.book".format(b)):
        books_list.append(b)

meta_df = meta_df[meta_df.id.isin(books_list)]
meta_df=meta_df.drop_duplicates(subset=["id"])

books = meta_df.id.unique()
print(len(books))

# Load Events Data and Character Data

In [29]:
events_df_dict = {}
characters_df_dict = {}
token_df_df_dict = {}
for book in tqdm(books):
    try:
        character_data, events_df, token_df = get_event_character_df(book, "../../gutenberg_standard/new_analysis/book_nlp_output_small/", count_threshold=1)
        events_df_dict[book] = events_df
        characters_df_dict[book] = character_data
        token_df_df_dict[book] = token_df
    except Exception as e:
        print(e)
        pass

 75%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                | 8112/10879 [21:12<04:28, 10.30it/s]

cannot access local variable 'referential_gender' where it is not associated with a value


 77%|██████████████████████████████████████████████████████████████████████████████████████████████████                              | 8333/10879 [21:54<04:49,  8.79it/s]

cannot access local variable 'referential_gender' where it is not associated with a value


 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 9069/10879 [23:49<03:56,  7.66it/s]

cannot access local variable 'referential_gender' where it is not associated with a value


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10879/10879 [29:16<00:00,  6.19it/s]


In [33]:
N_tokens_dict = {}
N_sentences_dict = {}
for book in tqdm(token_df_df_dict.keys()):
    token_df = token_df_df_dict[book]
    N_tokens = token_df.token_ID_within_document.max() + 1
    N_sentences = token_df.sentence_ID.max() + 1
    N_tokens_dict[book] = N_tokens
    N_sentences_dict[book] = N_sentences

meta_df["N_tokens"] = meta_df.id.map(N_tokens_dict)
meta_df["log_N_tokens"] = np.log(meta_df["N_tokens"])
meta_df["N_sentences"] = meta_df.id.map(N_sentences_dict)
meta_df["log_N_sentences"] = np.log(meta_df["N_sentences"])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10876/10876 [00:00<00:00, 12255.95it/s]


# Narrative Network Construction

In [34]:
networks_dict = {}
for book in tqdm(events_df_dict.keys()):
    
    events_df = events_df_dict[book]
    character_data = characters_df_dict[book]

    character_data = character_data[character_data.role == "proper"]
    events_df = events_df[(events_df.agent_name.isin(character_data.name.unique()))&(events_df.patient_name.isin(character_data.name.unique()))]
    events_df = events_df[~(events_df.agent_name==events_df.patient_name)]

    edge_list = events_df[["agent_name", "patient_name", "lemma"]]
    
    if edge_list.shape[0] >= 10:
        G_multi = nx.from_pandas_edgelist(edge_list, source="agent_name", target="patient_name", edge_attr=["lemma"], create_using=nx.MultiGraph).to_undirected()
        edge_counts = Counter((min(u, v), max(u, v)) for u, v in G_multi.edges())
        G = nx.Graph()
        G.add_edges_from((u, v, {'weight': w}) for (u, v), w in edge_counts.items())
        Gcc = sorted(nx.connected_components(G.to_undirected()), key=len, reverse=True)
        G0 = G.to_undirected().subgraph(Gcc[0])
        
        networks_dict[book] = G0

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10876/10876 [00:22<00:00, 486.28it/s]


# Network Metrics

In [37]:
network_metrics_dict = {}
for book in tqdm(networks_dict.keys()):
    G0 = networks_dict[book]
    if G0.number_of_nodes() > 2:
        network_metrics = get_network_metrics(networks_dict[book])
        network_metrics_dict[book] = network_metrics
    clear_output(wait=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10605/10605 [00:57<00:00, 184.01it/s]


In [38]:
network_metric_df = pd.DataFrame(network_metrics_dict).T.reset_index()
meta_df = meta_df.merge(network_metric_df.reset_index(), how="left", left_on="id", right_on="index")
meta_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [39]:
network_variables = ["n_nodes", "network_density", "modularity", "average_clustering", "average_shortest_path_largest_component"]
filter = ~meta_df[network_variables].isna().any(axis=1)
meta_df = meta_df[filter]

In [59]:
meta_df[["id", "title", "author", "authoryearofbirth", "authoryearofdeath", "language",
        "downloads", 'genre_war', 'genre_biography', 'genre_romance',
       'genre_drama', 'genre_fantasy', 'genre_family', 'genre_science',
       'genre_action', 'genre_thriller', 'genre_western', 'genre_horror',
       'genre_mystery', 'genre_crime', 'genre_history', 'genre_periodicals',
       'genre_christian', 'genre_other', 'N_tokens',
       'log_N_tokens', 'N_sentences', 'log_N_sentences', 
         'n_nodes', 'n_edges', 'network_density', 'average_clustering',
       'modularity', 'average_shortest_path_largest_component']].to_csv("../data/study1_novel/fiction_regression_df.csv", index=False)