In [15]:
import nltk

# Download punkt tokenizer data
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import nltk

# Clear the cache and force a fresh download
nltk.data.path = ['/root/nltk_data']
nltk.download('punkt', download_dir='/root/nltk_data')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
import spacy

# Load a pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Hello! This is a NLP Class"

# Process the text
doc = nlp(text)

# Tokenize the text
words = [token.text for token in doc]

print("Tokenized Words:", words)


Tokenized Words: ['Hello', '!', 'This', 'is', 'a', 'NLP', 'Class']


In [18]:
# POS tagging
pos_tags = [(token.text, token.pos_) for token in doc]
print("POS Tags:", pos_tags)


POS Tags: [('Hello', 'INTJ'), ('!', 'PUNCT'), ('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('NLP', 'PROPN'), ('Class', 'NOUN')]


In [19]:
# Named Entity Recognition
entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Named Entities:", entities)


Named Entities: [('NLP', 'ORG')]


In [20]:
# Lemmatization
lemmas = [(token.text, token.lemma_) for token in doc]
print("Lemmas:", lemmas)


Lemmas: [('Hello', 'hello'), ('!', '!'), ('This', 'this'), ('is', 'be'), ('a', 'a'), ('NLP', 'NLP'), ('Class', 'class')]


In [21]:
# Dependency Parsing
for token in doc:
    print(f"Token: {token.text}, Dep: {token.dep_}, Head: {token.head.text}")


Token: Hello, Dep: ROOT, Head: Hello
Token: !, Dep: punct, Head: Hello
Token: This, Dep: nsubj, Head: is
Token: is, Dep: ROOT, Head: is
Token: a, Dep: det, Head: Class
Token: NLP, Dep: compound, Head: Class
Token: Class, Dep: attr, Head: is


1. Tokenization with Sentence Segmentation
You can segment text into individual sentences as well, in addition to tokenizing words.

In [22]:
# Segment the text into sentences
sentences = list(doc.sents)

for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence.text}")


Sentence 1: Hello!
Sentence 2: This is a NLP Class


2. Visualizing Named Entities with displacy
spaCy has a built-in tool for visualizing Named Entities in a text. You can use the displacy module to create interactive visualizations.

In [23]:
import spacy
from spacy import displacy

# Load the model
nlp = spacy.load("en_core_web_sm")

# Process the text
text = "Apple is looking to buy a startup in the UK for $1 billion."
doc = nlp(text)

# Visualize Named Entities
displacy.render(doc, style='ent')


3. Vector Representation of Words (Word Vectors)
spaCy provides a way to get word vectors (similarity) for each token. You can find how similar two words are or calculate the vector representation of words.

In [24]:
# Check if the word has a vector (not all tokens may have vectors, like punctuation)
for token in doc:
    if token.has_vector:
        print(f"Word: {token.text}, Vector: {token.vector[:5]}...")  # Display the first 5 dimensions


Word: Apple, Vector: [-1.2436125  -1.1674542   0.17806128  0.34115627  0.67948616]...
Word: is, Vector: [-0.9121601  -0.36987227  0.091701    0.8001691  -0.51341116]...
Word: looking, Vector: [-1.107561    0.9264405   0.11748153  0.13880408 -1.169206  ]...
Word: to, Vector: [-0.689172    0.18811768  0.55091166  0.5293627  -0.5733002 ]...
Word: buy, Vector: [-1.7063262  -0.94292456 -0.12322037 -0.543365   -0.5348906 ]...
Word: a, Vector: [ 1.9245714  -0.18351659  0.47970247  1.4876909  -0.27767807]...
Word: startup, Vector: [-0.28209445 -0.47169942  0.3937447  -0.5265672   0.07165214]...
Word: in, Vector: [ 0.5501281  -0.21313888  0.4778055  -0.9788218  -0.22258288]...
Word: the, Vector: [ 1.6678028  -0.18202272 -0.96277213  1.5890981  -0.04702654]...
Word: UK, Vector: [-1.654748   -0.41367185 -0.6155603   0.4777996   0.21026194]...
Word: for, Vector: [ 0.28682965 -0.135075    0.9344232  -0.5485786  -0.5731973 ]...
Word: $, Vector: [-0.5239444  1.1179713  0.3720556  0.6658685  1.3779069


4. Cosine Similarity Between Two Texts
You can measure the similarity between two texts (or two documents) using spaCy's vector representations.

In [25]:
# Load another text
text2 = "Google is acquiring a new company in Europe."
doc2 = nlp(text2)

# Measure similarity between the two docs
similarity = doc.similarity(doc2)
print(f"Similarity between the texts: {similarity:.4f}")


Similarity between the texts: 0.5388


  similarity = doc.similarity(doc2)


5. Dependency Parsing - Visualize Syntax Tree
You can visualize the syntactic structure (dependency tree) of a sentence. This is useful for understanding the grammatical relationships between words.

In [26]:
# Visualize Dependency Parsing Tree
displacy.render(doc, style='dep', jupyter=True)


6. Text Classification (Using spaCy)
If you have labeled data, you can train a model for text classification. Here's a simple example of how to train a model:

In [28]:
import spacy
from spacy.training.example import Example

# Load the blank model
nlp = spacy.blank("en")

# Define the training data (format: (text, {'cats': {'label': True}}))
TRAINING_DATA = [
    ("I love programming", {'cats': {'positive': 1, 'negative': 0}}),
    ("I hate bugs", {'cats': {'positive': 0, 'negative': 1}}),
]

# Create the text classification pipeline
text_cat = nlp.create_pipe("textcat")
nlp.add_pipe(text_cat, last=True)
text_cat.add_label("positive")
text_cat.add_label("negative")

# Start training the model
optimizer = nlp.begin_training()

for epoch in range(10):
    print(f"Epoch {epoch+1}")
    for text, annotations in TRAINING_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example])

# Test the trained model
test_text = "I love coding!"
doc = nlp(test_text)
print(doc.cats)  # Output: {'positive': 0.999..., 'negative': 0.001...}

# Based on the output, the model predicts 'positive' sentiment.


ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <spacy.pipeline.textcat.TextCategorizer object at 0x7e71f2f43a10> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

7. Preprocessing Pipeline for Custom Text (Cleaning & Tokenizing)
Before applying NLP models, you often need to clean and preprocess the text data. Here's an example of preprocessing with spaCy:

In [29]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "The weather today is beautiful. It's a perfect day to go outside."

# Create a pipeline for preprocessing
def preprocess_text(text):
    # Process the text
    doc = nlp(text)

    # Remove stopwords and punctuation, then lemmatize
    cleaned_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

    return " ".join(cleaned_tokens)

cleaned_text = preprocess_text(text)
print("Cleaned Text:", cleaned_text)


Cleaned Text: weather today beautiful perfect day outside


8. Text Summarization (Extractive)
spaCy doesn't have a built-in summarizer, but you can build a simple extractive summarizer by selecting the most important sentences. Here's a simple approach using sentence similarity:

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Sample text
text = """
    Natural Language Processing (NLP) is a subfield of artificial intelligence.
    It focuses on the interaction between computers and human language.
    NLP is used in a variety of applications such as chatbots, speech recognition, and sentiment analysis.
    Machine learning plays a crucial role in modern NLP techniques.
"""

# Split text into sentences
sentences = text.split("\n")

# TF-IDF Vectorization for sentence similarity
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

# Compute the cosine similarity between sentences
cosine_sim = np.dot(X, X.T).toarray()

# Select the most similar sentences
summarized = []
for idx in range(len(sentences)):
    if np.any(cosine_sim[idx] > 0.5):  # Threshold for similarity
        summarized.append(sentences[idx])

summary = " ".join(summarized)
print("Extractive Summary:", summary)


Extractive Summary:     Natural Language Processing (NLP) is a subfield of artificial intelligence.     It focuses on the interaction between computers and human language.     NLP is used in a variety of applications such as chatbots, speech recognition, and sentiment analysis.     Machine learning plays a crucial role in modern NLP techniques.


9. Clustering Text Documents (Using spaCy + Scikit-learn)
You can use spaCy embeddings to cluster similar documents. Here's a simple example using KMeans clustering:

In [31]:
from sklearn.cluster import KMeans
import numpy as np

# Sample documents
documents = [
    "I love programming in Python.",
    "Python is a versatile language.",
    "The quick brown fox jumps over the lazy dog.",
    "I enjoy outdoor activities like hiking and running."
]

# Convert documents to vectors using spaCy embeddings
nlp = spacy.load("en_core_web_sm")
vectors = [nlp(doc).vector for doc in documents]

# KMeans Clustering
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(vectors)

# Output clusters
for idx, label in enumerate(kmeans.labels_):
    print(f"Document {idx+1}: Cluster {label}")


Document 1: Cluster 1
Document 2: Cluster 0
Document 3: Cluster 0
Document 4: Cluster 1
