### Gutenberg plot analysis - sentences

In [2]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Download NLTK data (run this once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/h6x/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/h6x/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/h6x/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Loading the data

In [4]:
# List of text files (replace with actual file paths)
base_path = "/Users/h6x/ORNL/git/learning/natural language processing/CS-524/project_1/data"
file_paths = [base_path + "/Great_short_stories_V1.txt", base_path + "/The_Memoirs_of_Sherlock_Holmes.txt", base_path + "/The_Return_of_Sherlock_Holmes.txt"]

In [5]:
book_contents=[]

# Read the contents of each book
for file_path in file_paths:
    with open(file_path, 'r') as file:
        book_contents.append(file.read())

#### Data Preprocessing

In [6]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package punkt to /Users/h6x/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/h6x/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/h6x/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/h6x/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/h6x/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
import re

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [8]:
from nltk.tokenize import sent_tokenize

In [9]:
def clean_text_by_sentence(text):
    lemmatizer = WordNetLemmatizer()
    sub_pattern = r'[^A-Za-z]'
    stop_words = stopwords.words('english') + ['never','ever','couldnot','wouldnot','could','would','us',"i'm","you'd"]

    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Clean and tokenize each sentence
    cleaned_sentences = []
    for sentence in sentences:
        # Lowercasing and removing special characters
        lower_sentence = sentence.lower()
        filtered_sentence = re.sub(sub_pattern, ' ', lower_sentence).lstrip().rstrip()
        
        # Tokenize the sentence into words
        words = word_tokenize(filtered_sentence)
        
        # Lemmatize and remove stopwords
        cleaned_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words if word not in stop_words]
        
        # Append the cleaned words as a sentence (list of words)
        if cleaned_words:  # Avoid empty sentences
            cleaned_sentences.append(cleaned_words)

    return cleaned_sentences

In [10]:
# Process each book and keep sentence context
cleaned_books_sentences = []
for book in book_contents:
    cleaned_books_sentences.extend(clean_text_by_sentence(book))  # List of list (sentences of tokens)

In [11]:
print(cleaned_books_sentences[:2])  # View the first two tokenized sentences

[['project', 'gutenberg', 'ebook', 'great', 'short', 'story', 'volume', 'ebook', 'use', 'anyone', 'anywhere', 'united', 'state', 'part', 'world', 'cost', 'almost', 'restriction', 'whatsoever'], ['may', 'copy', 'give', 'away', 'use', 'term', 'project', 'gutenberg', 'license', 'include', 'ebook', 'online', 'www', 'gutenberg', 'org']]


# **FastText**

In [12]:
from gensim.models import FastText
from gensim.utils import tokenize

In [13]:
#sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW

# Skip-gram (sg=1)
skipgram_model = FastText(sentences=cleaned_books_sentences, vector_size=10, window=2, min_count=1, sg=1, epochs=10)

In [14]:
skipgram_model.wv.most_similar("holmes")

[('sholto', 0.9685295820236206),
 ('inspector', 0.9681201577186584),
 ('sholtos', 0.9664484858512878),
 ('hopkins', 0.9616188406944275),
 ('lestrade', 0.9559245109558105),
 ('jones', 0.9535747170448303),
 ('hopkin', 0.9474747776985168),
 ('kindly', 0.9438467621803284),
 ('smile', 0.9409658312797546),
 ('holy', 0.9402598142623901)]

In [15]:
skipgram_model.wv.most_similar("crime")

[('crimea', 0.9964929819107056),
 ('incident', 0.9957688450813293),
 ('reasonable', 0.9947313666343689),
 ('consequence', 0.9944502711296082),
 ('possess', 0.9939798712730408),
 ('coincident', 0.9938412308692932),
 ('sequence', 0.9932464957237244),
 ('inconsequence', 0.9931868314743042),
 ('necessity', 0.9926545023918152),
 ('accident', 0.9923218488693237)]

In [16]:
# CBOW (sg=0)
cbow_model = FastText(sentences=cleaned_books_sentences, vector_size=10, window=2, min_count=1, sg=0, epochs=10)

In [17]:
cbow_model.wv.most_similar("holmes")

[('rolles', 0.9640038013458252),
 ('hasbrouck', 0.9440739750862122),
 ('soames', 0.9375590085983276),
 ('james', 0.9363596439361572),
 ('jones', 0.9279244542121887),
 ('sholto', 0.9277781248092651),
 ('hopkins', 0.9260837435722351),
 ('zabriskie', 0.9260257482528687),
 ('nimes', 0.9250425696372986),
 ('hudson', 0.9215695858001709)]

In [18]:
cbow_model.wv.most_similar("crime")

[('daintiest', 0.9979061484336853),
 ('test', 0.9976452589035034),
 ('recourse', 0.9972720146179199),
 ('course', 0.9963361620903015),
 ('pretty', 0.9963221549987793),
 ('divest', 0.9962096214294434),
 ('invest', 0.9962040781974792),
 ('unjust', 0.9961169958114624),
 ('necessary', 0.9958217144012451),
 ('est', 0.9956088662147522)]

In [19]:
# Get vector embedding for a word (e.g., 'Formula')
word_embedding_cbow = cbow_model.wv['crime']
print("Word embedding for 'crime' using CBOW:", word_embedding_cbow)

Word embedding for 'crime' using CBOW: [ 1.1936557   0.18804733 -0.9515808  -1.4164045   0.33243725  0.83363307
  1.8808808   1.4738642   0.44337526 -1.666153  ]


### Character and Event Representation Using Word Embeddings

In [21]:
import spacy

# Load spaCy model for NER
nlp = spacy.load("en_core_web_sm")

In [22]:
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["PERSON", "GPE", "EVENT"]]
    return entities

In [24]:
cleaned_books_text = [' '.join(sentences) for sentences in cleaned_books_sentences]

In [25]:
entities = [extract_entities(book) for book in cleaned_books_text]

In [27]:
entities

[[],
 [],
 [],
 [('william', 'PERSON'),
  ('al haines', 'PERSON'),
  ('robert louis stevenson', 'PERSON'),
  ('william', 'PERSON'),
  ('france', 'GPE'),
  ('america', 'GPE'),
  ('edgar allan', 'PERSON'),
  ('edgar allan', 'PERSON'),
  ('arthur conan', 'PERSON')],
 [],
 [],
 [],
 [('bald head', 'PERSON')],
 [],
 [('episode barrel', 'PERSON')],
 [],
 [],
 [],
 [],
 [('jonathan small', 'PERSON'),
  ('arthur conan', 'PERSON'),
  ('doyle', 'PERSON'),
  ('anna katharine', 'PERSON'),
  ('robert louis stevenson', 'PERSON'),
  ('prince florizel', 'GPE'),
  ('edgar allan', 'PERSON')],
 [],
 [],
 [],
 [],
 [('edgar allan', 'PERSON'), ('thomas browne mental', 'PERSON')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('manifold multiform', 'PERSON')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('paris', 'GPE')],
 [],
 [],
 [('paris', 'GPE')],
 [],
 [],
 [],
 [],
 [('paris', '

In [28]:
# Output persons and GPE entities separately
for i, (persons, gpe) in enumerate(entities):
    print(f"Book {i+1}:")
    print(f"Persons: {persons}")
    print(f"GPE: {gpe}")
    print()

ValueError: not enough values to unpack (expected 2, got 0)

In [29]:
def extract_entities(text):
    # Apply the spaCy NLP model
    doc = nlp(text)
    # Separate PERSON, GPE, and EVENT entities
    persons = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    gpe = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    events = [ent.text for ent in doc.ents if ent.label_ == "EVENT"]
    return persons, gpe, events  # Always return a tuple of (persons, gpe, events)

In [30]:

# Apply NER to each book's text and get persons, GPE, and events
entities = [extract_entities(book) for book in cleaned_books_text]

In [None]:
# Output persons, GPE, and events entities separately
for i, (persons, gpe, events) in enumerate(entities):
    print(f"Book {i+1}:")
    print(f"Persons: {persons}")
    print(f"GPE: {gpe}")
    print(f"Events: {events}")
    print()
