In [None]:
# https://realpython.com/sentiment-analysis-python/

In [3]:
import sys
!{sys.executable} -m pip install spacy



In [77]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [23]:
import spacy
text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""


# NLP constructor
nlp = spacy.load("en_core_web_sm")
print(nlp)

doc = nlp(text)

print(doc)

token_list = [token for token in doc]
# print(token_list)


<spacy.lang.en.English object at 0x137c35e80>

Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.



In [24]:
# Remove stop words
filtered_tokens = [token for token in doc if not token.is_stop]
print(filtered_tokens)

[
, Dave, watched, forest, burned, hill, ,, 
, miles, house, ., car, 
, hastily, packed, Marta, inside, trying, round, 
, pets, ., ", ?, ", wondered, 
, continued, wait, Marta, appear, pets, ., 
]


In [25]:
# Normalize the words
# Stemming and lemmatization (only lemmatization is provided by spaCy)
lemmas = [
    f"Token: {token}, lemma: {token.lemma_}"
    for token in filtered_tokens
]
lemmas

# Note the underscore returns the readable version of the lemma here



['Token: \n, lemma: \n',
 'Token: Dave, lemma: Dave',
 'Token: watched, lemma: watch',
 'Token: forest, lemma: forest',
 'Token: burned, lemma: burn',
 'Token: hill, lemma: hill',
 'Token: ,, lemma: ,',
 'Token: \n, lemma: \n',
 'Token: miles, lemma: mile',
 'Token: house, lemma: house',
 'Token: ., lemma: .',
 'Token: car, lemma: car',
 'Token: \n, lemma: \n',
 'Token: hastily, lemma: hastily',
 'Token: packed, lemma: pack',
 'Token: Marta, lemma: Marta',
 'Token: inside, lemma: inside',
 'Token: trying, lemma: try',
 'Token: round, lemma: round',
 'Token: \n, lemma: \n',
 'Token: pets, lemma: pet',
 'Token: ., lemma: .',
 'Token: ", lemma: "',
 'Token: ?, lemma: ?',
 'Token: ", lemma: "',
 'Token: wondered, lemma: wonder',
 'Token: \n, lemma: \n',
 'Token: continued, lemma: continue',
 'Token: wait, lemma: wait',
 'Token: Marta, lemma: Marta',
 'Token: appear, lemma: appear',
 'Token: pets, lemma: pet',
 'Token: ., lemma: .',
 'Token: \n, lemma: \n']

In [26]:
# Vectorize the text
filtered_tokens[1].vector
# Dense arrays have defined values in every space of the array

array([-1.0732636 , -1.5893133 , -0.7485422 ,  0.80338854,  0.199772  ,
        0.00840409,  1.5419112 ,  0.78789294, -0.10507858, -0.08379468,
        1.6370184 ,  0.9981552 , -0.27276078, -0.90784246, -1.248598  ,
       -0.5253062 ,  0.36606142,  0.3220521 ,  0.26947665, -0.6838576 ,
       -1.3466266 ,  0.01122165, -0.24088567, -0.48466757, -0.33174923,
       -0.05325297,  1.8773435 ,  0.5649502 , -0.9605744 ,  0.78610945,
       -0.44939822, -1.4648836 , -0.38066614,  1.0480766 , -0.83412176,
       -0.2217491 , -0.854434  ,  0.35594553, -0.11274697,  1.2787786 ,
       -0.8223142 ,  0.18473107, -0.08983883,  0.6325264 , -1.1029457 ,
        0.37194866,  0.11167954,  1.529881  ,  0.73127055, -0.01238401,
       -0.38741043,  0.24374214,  0.66934144, -0.5147386 , -0.05107623,
       -0.68364084,  1.2553529 , -0.4258138 ,  0.8257121 , -0.40289953,
       -1.0714421 ,  0.8215431 ,  0.1035445 , -0.5627636 ,  0.34108096,
       -0.46954772, -0.6444609 , -0.4248718 , -0.74732184, -0.93

In [28]:
# 1. Load the data
# 80% training data, 20% test data


def load_training_data(
    # Data directory
    data_directory: str = "aclImdb/train",
    split: float = 0.8,
    limit: int = 0
) -> tuple:
    # Create a list to store the list of tuples
    reviews = []
    # label will give the folder name in each of the two directories
    for label in ["pos", "neg"]:
        # Create the full directory path to the pos and neg folders
        labeled_directory = f"{data_directory}/{label}"
        # Create the full directory to each of the text files in the pos and negative folders
        for review in os.listdir(labeled_directory):
            if review.endswith(".txt"):
                with open(f"{labeled_directory}/{review}") as f:
                    # Clean the data
                    text = f.read()
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        # Create a dictionary for the label of the .txt file (cats stands for category)
                        # The format must be a dictionary for spaCy to work
                        spacy_label = {"cats": {"pos": "pos" == label, "neg": "neg" == label}}
                        # Add the tuple back to the reviews list
                        reviews.append((text, spacy_label))
    random.shuffle(reviews)
    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    # First is the training set, second is the validation/test set
    return reviews[:split], reviews[split:]


In [79]:
# 2. Train the classifier
# After loading data, we need to add our cat labels to textcat
import os
import random
import spacy

# 3. Build the training loop
# Train only the textcat component

from spacy.util import minibatch, compounding


def train_model(training_data: list, test_data: list, iterations: int = 20) -> None:
    # Data pipeline
    nlp = spacy.load("en_core_web_sm")
    # Check if textcat is available
    
    if "textcat" not in nlp.pipe_names:
        # textcat = nlp.add_pipe("textcat", name = "textcat")
        # Add pipe to the end
        textcat = nlp.add_pipe("textcat", name = "textcat", last = True)

    else:
        textcat = nlp.get_pipe("textcat")
    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]

    # Disable pipelines other than textcat
    with nlp.disable_pipes(training_excluded_pipes):
        # Call the optimizer function
        # optimizer = nlp.begin_training()
        optimizer = nlp.create_optimizer()
        # Training loop
        print("Start of training:")
        print("Loss\tPrecision\tRecall\tF-score")
        # Creates a generator to create batch sizes
        batch_sizes = compounding(4.0, 32.0, 1.001)  # A generator that yields infinite series of input numbers
        
        for i in range(iterations):
                loss = {}
                # At each iteration, shuffle the data, get new batches
                random.shuffle(training_data)
                batches = minibatch(training_data, size=batch_sizes)
                # Run the SGD on each batch and update the model weights and biases
                for batch in batches:
                    text, labels = zip(*batch)
                    # Update the loss dictionary
                    # The drop hyperparamters tells the model what proportion of training data in the batch to skip over
                    nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)

                
                with textcat.model.use_params(optimizer.averages):
                    evaluation_results = evaluate_model(
                        tokenizer=nlp.tokenizer,
                        textcat=textcat,
                        test_data=test_data
                    )
                    print(
                        f"{loss['textcat']}\t{evaluation_results['precision']}"
                        f"\t{evaluation_results['recall']}"
                        f"\t{evaluation_results['f-score']}"
                    )

    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")

In [32]:
# Evaluate the progress of training
# True positive, true negative, false positive, false negative
# Calculate the precision (# of successful classification vs # of incorrect)
# Calculate the recall (# of sucessful classification / # of incorrect)
# F-score - metric for evaluating effectiveness of a binary classification model

def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Set to a low number
    true_negatives = 0
    false_negatives = 1e-8
    # 
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels. You can get all
            # the info you need with just the pos label.
            if (
                predicted_label == "neg"
            ):
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [39]:
TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
"""

def test_model(input_data: str = TEST_REVIEW):
    #  Load saved trained model
    loaded_model = spacy.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )


In [80]:
# Run train_model()
spacy.load('en_core_web_sm')
train, test = load_training_data(limit=2500)
train_model(train, test)
print("Testing model")
test_model()

Start of training:
Loss	Precision	Recall	F-score


ValueError: [E989] `nlp.update()` was called with two positional arguments. This may be due to a backwards-incompatible change to the format of the training data in spaCy 3.0 onwards. The 'update' function should now be called with a batch of Example objects, instead of `(text, annotation)` tuples. 

In [81]:
For after bug fixes:
https://stackoverflow.com/questions/66675261/how-can-i-work-with-example-for-nlp-update-problem-with-spacy3-0

SyntaxError: invalid syntax (1936713450.py, line 1)