In [None]:
# download the book from the Internet Archive
!wget https://archive.org/compress/OneHundredYearsOfSolitude_201710
!unzip -p OneHundredYearsOfSolitude_201710.zip One_Hundred_Years_of_Solitude_djvu.txt>One_Hundred_Years_of_Solitude_djvu.txt

In [None]:
import re

input_file_path = "One_Hundred_Years_of_Solitude_djvu.txt"
output_file_path = "One_Hundred_Years_of_Solitude.txt"

with open(input_file_path, "r") as file:
    lines = file.readlines()

# Remove title pages, acknowledgements, and other front matter
with open(output_file_path, "w") as file:
    file.writelines(lines[186:])

with open(output_file_path, "r", encoding="utf-8") as file:
    content = file.read()

# Remove page numbers and the header that appears at the beginning of each page
regex_page_header = r"(\n\n\n\d+\s)?\n\n\n\nGABRIEL GARCIA MARQUES x ONE HUNDRED YEARS OF SOLITUDE\s\n\n\n"
regex_line_breaks = r"\s\n(?=[^\n])"

content = re.sub(regex_page_header, "", content)
content = re.sub(regex_line_breaks, " ", content)

# Save to new file
with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(content)

In [2]:
# Fixing what appear to be OCR errors
with open("One_Hundred_Years_of_Solitude.txt", "r") as file:
    text = file.read()
text = (
    text.replace("Buendfa", "Buendia")
    .replace("Te6filo", "Teofilo")
    .replace("Pmdencio", "Prudencio")
    .replace("Magnlfico", "Magnifico")
    .replace("Aureiiano", "Aureliano")
    .replace("Buendla", "Buendia")
    .replace("Bmno", "Bruno")
)
with open("One_Hundred_Years_of_Solitude_cleaned.txt", "w") as file:
    file.write(text)

In [3]:
# !python -m spacy download en_core_web_sm

In [2]:
# Extracting named entities from the text

import spacy
import numpy as np
import pandas as pd

# Load spaCy model and process text
nlp = spacy.load("en_core_web_sm")

with open("One_Hundred_Years_of_Solitude_cleaned.txt", "r", encoding="utf-8") as file:
    text = file.read()

doc = nlp(text)

# Extract named entities that are persons
person_entities = []
for ent in doc.ents:
    if ent.label_ == "PERSON" and ent.text[0].isupper():
        person_entities.append(ent.text.replace("’s", ""))

# Converting list of names into a DataFrame with frequency counts
entities_df = pd.DataFrame(person_entities, columns=["name"])
entity_counts_df = entities_df["name"].value_counts().reset_index()
entity_counts_df.columns = ["name", "count"]

# Sorting values descending by count
entity_counts_df.sort_values(by="count", ascending=False, inplace=True)

print(entity_counts_df)

                     name  count
0       Aureliano Buendia    194
1       Aureliano Segundo    187
2    Jose Arcadio Buendia    167
3            Jose Arcadio    132
4                  Rebeca     95
..                    ...    ...
98               Remedios      1
99                Coronel      1
100                   Ash      1
101                Mauser      1
139              Augustus      1

[140 rows x 2 columns]


In [3]:
entity_counts_df[entity_counts_df["count"] > 1]

Unnamed: 0,name,count
0,Aureliano Buendia,194
1,Aureliano Segundo,187
2,Jose Arcadio Buendia,167
3,Jose Arcadio,132
4,Rebeca,95
...,...,...
62,Amaranta Buendia,2
61,Aureliano Arcaya,2
60,Jack Brown,2
59,Alirio Noguera,2


In [4]:
# import nltk

# # Ensure NLTK resources are downloaded
# nltk.download("punkt")
# nltk.download("stopwords")

In [5]:
import pandas as pd
import nltk
from string import punctuation

import numpy as np

output_file_path = "One_Hundred_Years_of_Solitude.txt"

with open(output_file_path, "r", encoding="utf-8") as file:
    content = file.read()

entities = entity_counts_df[entity_counts_df["count"] > 1]
characters = entities["name"].tolist()


# Initialize a dictionary for character names and ids
character_dict = {"name": characters, "id": [f"id_{str(i)}" for i in range(len(characters))]}

character_ids = [id for id in character_dict["id"]]

# Replace character names in the text with their corresponding id
for index, character in enumerate(characters):
    content = content.replace(character, character_ids[index])

# Create the nodes DataFrame
nodes = pd.DataFrame.from_dict(character_dict)

# Initialize empty edges DataFrame
edges = pd.DataFrame(columns=["from", "to", "weight"])

# Prepare stop words
stop_words = nltk.corpus.stopwords.words("english")
stop_words.extend(punctuation)

# Tokenize the text content and remove stop words
tokenized_text = [word for word in nltk.word_tokenize(content) if word.lower() not in stop_words]

nodes.to_csv("nodes.csv", index=False)

# Sliding window for co-occurrence
window_size = 6
edge_rows = []

for index, row in nodes.iterrows():
    id = row["id"]
    for i, word in enumerate(tokenized_text):
        if word == id:
            for j in range(max(i - window_size // 2, 0), min(i + window_size // 2, len(tokenized_text))):
                if tokenized_text[j] in character_ids and tokenized_text[j] != word:
                    edge_rows.append({"from": int(id[3:]), "to": int(tokenized_text[j][3:]), "weight": 1})

# Populate edges DataFrame
if edge_rows:
    edges = pd.concat([edges, pd.DataFrame(edge_rows)], ignore_index=True)

edges = edges.groupby(["from", "to"]).size().reset_index(name="weight")

In [6]:
for col in edges.columns:
    edges[col] = pd.to_numeric(edges[col])

edges.dtypes

from      int64
to        int64
weight    int64
dtype: object

In [7]:
edges.to_csv("edges.csv", index=False)

In [8]:
import holoviews as hv

hv.extension("bokeh")

hv_nodes = hv.Dataset(nodes, "index")
hv_nodes.data.head(10)

Unnamed: 0,name,id
0,Aureliano Buendia,id_0
1,Aureliano Segundo,id_1
2,Jose Arcadio Buendia,id_2
3,Jose Arcadio,id_3
4,Rebeca,id_4
5,Aureliano,id_5
6,Jose Arcadio Segundo,id_6
7,Gerineldo Marquez,id_7
8,Pietro Crespi,id_8
9,Meme,id_9


In [9]:
edges.dtypes

from      int64
to        int64
weight    int64
dtype: object

In [10]:
edges.tail()

Unnamed: 0,from,to,weight
240,59,56,7
241,70,5,37
242,70,7,1
243,70,8,1
244,70,11,2


In [11]:
from holoviews import opts, dim

hv.output(size=250)

chord = hv.Chord((edges, hv_nodes))
chord.opts(
    opts.Chord(
        cmap="Category10",
        edge_cmap="Category10",
        edge_color=dim("from").str(),
        labels="name",
        node_color=dim("index").str(),
        bgcolor="black",
        label_text_color="white",
        title="Character Co-occurrence in One Hundred Years of Solitude",
        edge_line_width=dim("weight"),
        edge_alpha=0.7,
        label_text_alpha=0.9,
    )
)



In [12]:
renderer = hv.renderer("bokeh")
renderer.save(chord, "chord_diagram")

