In [1]:
!pip3 install nltk



In [4]:
# Libraries required by the notebook (all imports grouped ahead of any setup work)
import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

# NLTK data packages to fetch, in the order they are requested
NLTK_PACKAGES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'stopwords',
)
for _nltk_package in NLTK_PACKAGES:
    nltk.download(_nltk_package)

# Small English spaCy pipeline, shared by the later cells as `nlp`
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Sample text: two sentences with mixed case and punctuation, used as the
# input for the preprocessing demo in this cell.
text = "Natural Language Processing is a fascinating field. It combines linguistics and computer science!"

# Preprocess the text
def preprocess(text):
    """Turn raw text into a list of lowercase, alphabetic, non-stopword tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token is
    kept only if it consists solely of alphabetic characters (which discards
    punctuation and numbers) and is not in NLTK's English stopword list.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens as a list of strings, in their original order.
    """
    words = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for word in words:
        # Keep purely alphabetic tokens that are not stopwords
        if word.isalpha() and word not in english_stopwords:
            kept.append(word)
    return kept

# Print cleaned tokens
# (the result is kept in `cleaned_tokens` because the bigram cell reuses it)
cleaned_tokens = preprocess(text)
print(cleaned_tokens)

['natural', 'language', 'processing', 'fascinating', 'field', 'combines', 'linguistics', 'computer', 'science']


In [6]:
# Generate bigrams from cleaned tokens
# Note: pairs are built from the filtered token list, so the two words of a
# bigram may have been separated in the original text by stopwords,
# punctuation, or a sentence boundary (e.g. 'field' / 'combines').
bigrams = list(ngrams(cleaned_tokens, 2))
print("Bigrams:", bigrams)

Bigrams: [('natural', 'language'), ('language', 'processing'), ('processing', 'fascinating'), ('fascinating', 'field'), ('field', 'combines'), ('combines', 'linguistics'), ('linguistics', 'computer'), ('computer', 'science')]


In [7]:
# Example sentence
# Run the spaCy pipeline held in `nlp` over the sentence, then print each
# detected named entity as "<entity text> <entity label>".
sentence = "Barack Obama was born in Hawaii and was elected president in 2008."
doc = nlp(sentence)
for ent in doc.ents:
    print(ent.text, ent.label_)

Barack Obama PERSON
Hawaii GPE
2008 DATE


In [8]:
# Toy corpus: three short sentences, vectorized twice below for comparison.
sentences = [
    "I love machine learning.",
    "Natural language processing is a part of AI.",
    "AI is the future."
]

# CountVectorizer
# Raw term counts: one row per sentence, one column per vocabulary term, with
# columns in alphabetical order of the terms. With the default settings the
# text is lowercased and single-character tokens ("I", "a") are dropped, which
# is why the matrix has 12 columns.
count_vec = CountVectorizer()
X_count = count_vec.fit_transform(sentences)
print("Count Vectorizer Output:\n", X_count.toarray())

# TfidfVectorizer
# Same vocabulary and column order, but terms shared by several sentences
# ("ai", "is") receive lower weights than terms unique to one sentence, and
# each row is scaled to unit length.
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(sentences)
print("\nTF-IDF Vectorizer Output:\n", X_tfidf.toarray())

Count Vectorizer Output:
 [[0 0 0 0 1 1 1 0 0 0 0 0]
 [1 0 1 1 0 0 0 1 1 1 1 0]
 [1 1 1 0 0 0 0 0 0 0 0 1]]

TF-IDF Vectorizer Output:
 [[0.         0.         0.         0.         0.57735027 0.57735027
  0.57735027 0.         0.         0.         0.         0.        ]
 [0.30650422 0.         0.30650422 0.40301621 0.         0.
  0.         0.40301621 0.40301621 0.40301621 0.40301621 0.        ]
 [0.42804604 0.5628291  0.42804604 0.         0.         0.
  0.         0.         0.         0.         0.         0.5628291 ]]


In [9]:
# Note: en_core_web_sm ships without static word vectors. With this model,
# `Token.vector` falls back to the context-sensitive tensor produced by the
# pipeline, so the values printed below are NOT true word embeddings.
# For real word vectors, install and load the medium model instead:
# !python -m spacy download en_core_web_md
# nlp = spacy.load("en_core_web_md")

# Make the fallback visible in the output instead of silently presenting
# tensor values as a word vector (vectors_length is 0 when no vectors table is loaded).
if nlp.vocab.vectors_length == 0:
    print("Note: the loaded spaCy model has no static word vectors; "
          "the values below are the context tensor fallback.")

# Example word vector
word = nlp("machine")[0]
print("Vector for 'machine':\n", word.vector)

Vector for 'machine':
 [-0.7506128  -0.5764812   0.64351416  0.34771806  0.45008683 -0.31984514
  1.3374304   0.68238103 -0.24978791  0.01502722  0.20069125 -0.5300583
 -0.32142678  0.6083893   0.59112257  1.3969524  -1.3394686  -0.49667907
  0.931461    0.7621138  -0.63203806  1.1820784  -0.8377955   0.02632818
 -0.2938515   0.6069317   1.5544686  -0.04658534 -0.45219168  0.4812616
  0.02117313  0.9538503   0.38607854  0.03060587 -1.2614324  -0.7120025
 -0.05820665  0.99797565  0.3940748   0.03983051 -0.90997595 -0.3068042
  0.8167603   0.40132952 -0.65918976 -0.5283325  -0.10206974 -0.39648488
 -0.27466586 -0.5868283   0.11670434 -0.02715775  0.10342616 -0.7152366
  0.78196365  0.26182356  1.2007879   0.40819854 -0.8120166   0.10142353
 -0.9246426  -0.06107479 -0.28506374 -0.27212048 -0.06658113  0.21739638
 -0.34570396 -0.7191129  -0.6493218  -0.07578593 -0.26895642  0.2526233
  0.8506172   0.57301974 -0.10925089 -0.48964894 -0.33062595 -0.69049066
  0.5873908   0.5251721  -0.476669