### Gutenberg plot analysis - sentences

In [1]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Download NLTK data (run this once)
# These resources back the NLTK helpers imported above: 'punkt' for the
# tokenizers, 'stopwords' for the stop-word list, 'wordnet' for the lemmatizer.
# The return value of the final call is what the cell displays as its result.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/h6x/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/h6x/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/h6x/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Loading the data

In [3]:
# Corpus location and the books to analyse (replace with actual file paths).
# NOTE: base_path is an absolute, machine-specific directory; change it to
# wherever the Gutenberg text files live on your machine.
base_path = "/Users/h6x/ORNL/git/learning/natural language processing/CS-524/project_1/data"
file_paths = [
    f"{base_path}/{book_name}"
    for book_name in (
        "Great_short_stories_V1.txt",
        "The_Memoirs_of_Sherlock_Holmes.txt",
        "The_Return_of_Sherlock_Holmes.txt",
    )
]

In [4]:
book_contents = []

# Read the contents of each book, one string per file, in file_paths order.
# The encoding is pinned so decoding does not depend on the platform's locale
# default (which is not UTF-8 everywhere, e.g. on Windows).
# NOTE(review): assumes the Gutenberg text files are UTF-8 encoded; confirm.
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        book_contents.append(file.read())

#### Data Preprocessing

In [5]:
import nltk
# Fetch every NLTK resource the preprocessing cells rely on, in the same
# order as before; packages that are already present are left untouched
# (see the "already up-to-date" messages in the cell output).
for resource in ('punkt', 'stopwords', 'omw-1.4', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package punkt to /Users/h6x/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/h6x/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/h6x/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/h6x/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/h6x/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import re

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own (as a one-element token list), and only the
    first letter of the resulting tag is inspected:
    J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other tag falls back to noun.
    """
    # pos_tag returns a list of (token, tag) pairs; there is exactly one here.
    _token, tag = pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # Unmapped tag: use the noun default.
    return wordnet.NOUN

In [7]:
from nltk.tokenize import sent_tokenize

In [8]:
def clean_text_by_sentence(text):
    """Split ``text`` into sentences and reduce each to a list of lemmas.

    For every sentence found by ``sent_tokenize`` the text is lowercased,
    every character outside A-Z/a-z is replaced by a space, the result is
    tokenized, stop words are dropped, and the remaining words are lemmatized
    using the POS returned by ``get_wordnet_pos``.

    Args:
        text: Raw text of one book.

    Returns:
        A list of sentences, each a non-empty list of lemmatized tokens.
        Sentences with no tokens left after filtering are omitted.
    """
    lemmatizer = WordNetLemmatizer()
    # Compiled once per call instead of being re-parsed for every sentence.
    non_letter = re.compile(r'[^A-Za-z]')
    # A set gives constant-time membership tests; the stop-word check runs
    # once per token, so scanning a list each time was needlessly slow.
    # Entries containing an apostrophe ("i'm", "you'd") can never match,
    # because apostrophes are replaced by spaces before tokenizing; they are
    # kept only to leave the configured word list unchanged.
    stop_words = set(stopwords.words('english'))
    stop_words.update(['never', 'ever', 'couldnot', 'wouldnot', 'could', 'would', 'us', "i'm", "you'd"])

    cleaned_sentences = []
    for sentence in sent_tokenize(text):
        # Lowercase, blank out anything that is not a letter, trim the ends.
        filtered_sentence = non_letter.sub(' ', sentence.lower()).strip()

        # Tokenize the sentence into words
        words = word_tokenize(filtered_sentence)

        # Remove stop words first, then lemmatize what is left.
        cleaned_words = [
            lemmatizer.lemmatize(word, get_wordnet_pos(word))
            for word in words
            if word not in stop_words
        ]

        # Avoid empty sentences (e.g. ones made up only of stop words or digits).
        if cleaned_words:
            cleaned_sentences.append(cleaned_words)

    return cleaned_sentences

In [9]:
# Process each book and keep sentence context
cleaned_books_sentences = []
for book in book_contents:
    cleaned_books_sentences.extend(clean_text_by_sentence(book))  # List of list (sentences of tokens)

In [10]:
# Sanity check of the preprocessing: each entry is one sentence as a list of
# lowercase, lemmatized tokens with stop words removed.
print(cleaned_books_sentences[:2])  # View the first two tokenized sentences

[['project', 'gutenberg', 'ebook', 'great', 'short', 'story', 'volume', 'ebook', 'use', 'anyone', 'anywhere', 'united', 'state', 'part', 'world', 'cost', 'almost', 'restriction', 'whatsoever'], ['may', 'copy', 'give', 'away', 'use', 'term', 'project', 'gutenberg', 'license', 'include', 'ebook', 'online', 'www', 'gutenberg', 'org']]


### Training the Word2Vec Model

In [11]:
import gensim
from gensim.models import Word2Vec

In [12]:
# Create an untrained Word2Vec model; the vocabulary and weights are filled in
# by the build_vocab/train cells that follow.
#   window=10   -> maximum distance between a word and its context words
#   min_count=2 -> words seen fewer than 2 times are ignored
#   workers=4   -> number of training threads
# Everything else (vector size, epochs, training algorithm) stays at gensim's defaults.
model = Word2Vec(window=10, min_count=2, workers=4)

In [13]:
# Scan the tokenized sentences once to build the model's vocabulary;
# this must happen before train() is called on the empty model created above.
model.build_vocab(cleaned_books_sentences, progress_per=1000)

In [15]:
# Train on the same sentences used for build_vocab, reusing the sentence count
# and epoch count stored on the model. The tuple displayed below is train()'s
# return value (per the gensim docs: effective word count, raw word count).
model.train(cleaned_books_sentences, total_examples=model.corpus_count, epochs=model.epochs)

(768897, 829875)

In [16]:
# Words whose vectors are closest to "holmes", as (word, similarity) pairs.
# NOTE(review): every score in the recorded output is above 0.98, which
# suggests the embeddings are barely separated (small corpus, default number
# of epochs); treat the ranking with caution and consider training longer.
model.wv.most_similar("holmes")

[('mr', 0.9924426674842834),
 ('say', 0.9914420247077942),
 ('sir', 0.989795446395874),
 ('well', 0.9896977543830872),
 ('yes', 0.9864906668663025),
 ('think', 0.9860494136810303),
 ('thaddeus', 0.9854176044464111),
 ('good', 0.9849363565444946),
 ('watson', 0.9847761988639832),
 ('know', 0.9837348461151123)]

In [17]:
# Words whose vectors are closest to "crime", as (word, similarity) pairs.
# NOTE(review): all recorded scores are above 0.999, i.e. the neighbours are
# nearly indistinguishable from one another; this looks like an undertrained
# model rather than a meaningful ranking. Verify after training for more epochs.
model.wv.most_similar("crime")

[('murder', 0.9998323321342468),
 ('arrest', 0.9997822046279907),
 ('circumstance', 0.9997783899307251),
 ('order', 0.9997750520706177),
 ('allow', 0.999769926071167),
 ('fail', 0.999769926071167),
 ('present', 0.9997572302818298),
 ('remarkable', 0.9997560381889343),
 ('become', 0.9997530579566956),
 ('prove', 0.9997473359107971)]

In [18]:
# Example: Get the vector for a word
word_vector = model.wv['holmes']
print(word_vector)

[-0.1330074   0.45747727  0.3000638  -0.00794652 -0.17580093 -1.095381
 -0.03232447  1.1645964  -0.5115212  -0.5494887   0.00717497 -0.51604515
  0.18336976  0.13551013  0.4211016  -0.5908753   0.1309373  -0.57215583
  0.16538335 -1.0475246   0.48544502 -0.275968   -0.07136368 -0.61670387
  0.04858413 -0.19747071 -0.59712255 -0.41507864 -0.8412448  -0.05670855
  0.42949915  0.3601233  -0.6326389  -0.33748505  0.2512424   0.5606835
  0.11237265 -0.19637552 -0.5466829  -1.1361508   0.17211936 -0.41779515
 -0.4294484   0.06821514  0.676661   -0.05890745 -0.61711365  0.26828718
  0.17942642  0.78138006  0.38279504 -0.31781104 -0.18795699 -0.36914557
 -0.05559334  0.42550448  0.25052357 -0.37134576 -0.22901465  0.31551746
 -0.44861773  0.00172861 -0.10546677 -0.5561483  -0.50396395  0.85637116
  0.18490978  0.38519016 -0.45354196  0.731912   -0.24359944  0.5991387
  0.47107294 -0.24170327  0.47730935  0.15380079  0.2860487   0.00132534
 -0.6446226   0.21150683 -0.39585948  0.24483876 -0.310

In [19]:
# Dimensionality of the embedding. The model was created without an explicit
# vector size, so this is gensim's default.
len(word_vector)

100

When training Word2Vec, the input size refers to the **number of words (tokens)** used during the training process. Word2Vec processes each word (token) and its context (neighboring words) within a specified "window size." Here's how it works:

### Word2Vec Input Structure:
- **Sentences**: Word2Vec expects a list of tokenized sentences as input, where each sentence is a list of words (tokens).
- **Window Size**: The `window` parameter defines the maximum number of words before and after the target word to consider as context. For example, if the `window` size is 5, Word2Vec looks at up to 5 words before and up to 5 words after the target word in each sentence (fewer near the start or end of a sentence).

### Example:
For a sentence like:
```python
["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
```
If the **window size** is 2, and the target word is "fox", the model will use the words `["quick", "brown", "jumps", "over"]` as its context for training (2 words before and 2 words after).

### Input Size During Training:
1. **Vocabulary Size**: Word2Vec creates a **vocabulary** from the entire corpus (all sentences combined). Each unique word gets assigned an index in this vocabulary.
   
2. **Training Data**: The input size is dynamic and depends on the total number of words in the corpus. Specifically, the **input size** for each iteration depends on:
   - The number of sentences
   - The number of words in each sentence
   - The specified `window` size

   Word2Vec loops through the words in the corpus, training the model on each word and its context. The input is not a fixed-size vector like in feed-forward neural networks but varies based on the sliding window for each word.

### Output:
- **Vector size**: This is controlled by the `vector_size` parameter. Each word in the vocabulary is mapped to a vector of length `vector_size`, which is the dimension of the embedding space (e.g., 100, 200).
  
### Example in Your Case:
If you have three novels, each tokenized into sentences and words, the input size is the total number of words across all sentences in all novels. Word2Vec will slide through each word, considering its surrounding context based on the `window` size.

For instance:
- Suppose you have a total of **50,000 words** in your combined novels.
- The Word2Vec model will process these 50,000 words, sliding a window across the context for each word and training accordingly.
  
The input size per iteration will depend on the window size and the specific sentence the model is processing at that time.

### Code Example (Train Size):
```python
# Get the total number of sentences and words for the input
total_sentences = len(cleaned_books_sentences)
total_words = sum(len(sentence) for sentence in cleaned_books_sentences)

print(f"Total Sentences: {total_sentences}")
print(f"Total Words: {total_words}")
```

This will give you an idea of the total number of sentences and words being processed during Word2Vec training.

Let's clarify this with a deeper look into **how Word2Vec processes the training data** and why the input size is dynamic.

### Dynamic Input Size in Word2Vec:
When we say the input size is dynamic, we are referring to how Word2Vec processes words and sentences during training. The number of words processed in each iteration depends on:

1. **The Number of Sentences**: Word2Vec treats each sentence as an individual sequence of words. The sentences can have varying lengths, which means different sentences will contribute different amounts of training data.
   
2. **The Number of Words in Each Sentence**: The size of each training iteration depends on the number of words in the sentence and the specified window size. Longer sentences provide more word-context pairs for training compared to shorter sentences.

### What Happens in Each Iteration:
Word2Vec goes through each sentence in your text corpus and processes it word by word. The input size in each training iteration depends on:

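- **Target Word**: Each word in the sentence takes a turn as the target word. In skip-gram mode the model predicts the surrounding context words from the target; in CBOW mode (gensim's default, which the model in this notebook uses because `sg` is not set) it predicts the target from its surrounding context words. In both cases the context is defined by the window size.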
  
- **Context Words**: The window size defines how many words around the target word are considered context. For example, if the window size is 2, Word2Vec looks at 2 words before and 2 words after the target word.

The model keeps sliding this window across the sentence, adjusting the context based on the position of the target word. For instance, at the start or end of a sentence, the context window may be smaller because there are fewer words.

### How Input Size Varies:
- **Long Sentences**: If a sentence is long, Word2Vec will generate more training pairs (target word + context words) because the window slides over more words. For example, a sentence with 10 words and a window size of 2 will generate more input pairs than a sentence with only 5 words.
  
- **Short Sentences**: Shorter sentences generate fewer training pairs because the model processes fewer words.

### Example: Input Size Based on Sentence Length

#### Sentence 1: "The quick brown fox jumps over the lazy dog"
- **Number of Words**: 9 words
- **Window Size**: 2 (looking at 2 words before and after the target word)

Training pairs for each target word:
- Target: "The", Context: ["quick", "brown"]
- Target: "quick", Context: ["The", "brown", "fox"]
- Target: "brown", Context: ["The", "quick", "fox", "jumps"]
- Target: "fox", Context: ["quick", "brown", "jumps", "over"]
- ... and so on

This sentence generates multiple (target, context) pairs.

#### Sentence 2: "The fox"
- **Number of Words**: 2 words
- **Window Size**: 2

Training pairs for each target word:
- Target: "The", Context: ["fox"]
- Target: "fox", Context: ["The"]

This shorter sentence generates fewer (target, context) pairs.

### Corpus-Wide Input Size:
Across the entire text corpus:
- The **total number of training pairs** depends on the number of sentences and the number of words in each sentence.
- The **window size** determines how many context words are associated with each target word.
- Longer sentences create more pairs, and shorter sentences create fewer pairs.

### Visualizing Sentence-Dependent Input Size:
In each iteration, Word2Vec loops through one sentence and processes each word and its context. This means the amount of data being fed to the model varies depending on:
1. **The length of the current sentence**: Longer sentences have more words and more training pairs.
2. **The number of words in each window**: More words in a window mean more (target, context) pairs to process.

### Summary:
- **Dynamic Input Size**: In Word2Vec, the "input size" isn't a fixed matrix as in traditional neural networks, but rather the dynamic number of training pairs generated from each sentence in the corpus.
- **Sentence-Dependent Input Size**: If a sentence has 10 words, with a window size of 2, Word2Vec will generate more training pairs than if the sentence had only 3 words. The size of the input data dynamically scales with the sentence length.

The dynamic nature is due to Word2Vec processing sentences of varying lengths, where each word in the sentence becomes a target word, and its surrounding words become the context based on the sliding window.