<a href="https://colab.research.google.com/github/kenhuangsy/NLP/blob/main/Words_to_Vector_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
corpus = """
Quantum computing is a type of computing that uses quantum bits, or qubits, to store and process information. 
Qubits are units of information that can exist in multiple states simultaneously, unlike classical bits which can only exist in one state at a time. 
This allows quantum computers to perform certain types of calculations much faster than classical computers.
"""

In [4]:
corpus

'\nQuantum computing is a type of computing that uses quantum bits, or qubits, to store and process information. \nQubits are units of information that can exist in multiple states simultaneously, unlike classical bits which can only exist in one state at a time. \nThis allows quantum computers to perform certain types of calculations much faster than classical computers.\n'

In [6]:
import nltk
from nltk.stem import PorterStemmer # for stemming
from nltk.corpus import stopwords # for removing stopwords
from nltk.stem import WordNetLemmatizer # for lemmatization

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Convert corpus into documents
documents = nltk.tokenize.sent_tokenize(corpus)
documents

['\nQuantum computing is a type of computing that uses quantum bits, or qubits, to store and process information.',
 'Qubits are units of information that can exist in multiple states simultaneously, unlike classical bits which can only exist in one state at a time.',
 'This allows quantum computers to perform certain types of calculations much faster than classical computers.']

In [15]:
# Stemming
stemmer = PorterStemmer()
stemmer.stem('historically')

'histor'

In [25]:
# Essential libraries for lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [26]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('historically')

'historically'

In [27]:
import re
corpus = []
for i in range(len(documents)):
  # Remove punctuation
  review = re.sub('[^a-zA-Z]', ' ', documents[i])
  review.lower()
  corpus.append(review)

In [28]:
corpus

[' Quantum computing is a type of computing that uses quantum bits  or qubits  to store and process information ',
 'Qubits are units of information that can exist in multiple states simultaneously  unlike classical bits which can only exist in one state at a time ',
 'This allows quantum computers to perform certain types of calculations much faster than classical computers ']

In [29]:
# Apply stemming
corpus_stemmed = []
for i in range(len(corpus)):
  review = corpus[i]
  review = review.split()
  review = [stemmer.stem(word) for word in review]
  review = ' '.join(review)
  corpus_stemmed.append(review)


In [30]:
corpus_stemmed

['quantum comput is a type of comput that use quantum bit or qubit to store and process inform',
 'qubit are unit of inform that can exist in multipl state simultan unlik classic bit which can onli exist in one state at a time',
 'thi allow quantum comput to perform certain type of calcul much faster than classic comput']

In [33]:
# Apply lemmatization
corpus_lemmatized = []
for i in range(len(corpus)):
  review = corpus[i]
  review = review.split()
  review = [lemmatizer.lemmatize(word) for word in review]
  review = ' '.join(review)
  corpus_lemmatized.append(review)


In [34]:
corpus_lemmatized

['Quantum computing is a type of computing that us quantum bit or qubits to store and process information',
 'Qubits are unit of information that can exist in multiple state simultaneously unlike classical bit which can only exist in one state at a time',
 'This allows quantum computer to perform certain type of calculation much faster than classical computer']

In [36]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [39]:
# Downloading stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [37]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
corpus_stopwords = []
for i in range(len(corpus)):
  review = corpus[i]
  review = review.split()
  review = [word for word in review if not word in stop_words]
  review = ' '.join(review)
  corpus_stopwords.append(review)



In [38]:
corpus_stopwords

['Quantum computing type computing uses quantum bits qubits store process information',
 'Qubits units information exist multiple states simultaneously unlike classical bits exist one state time',
 'This allows quantum computers perform certain types calculations much faster classical computers']

In [41]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [45]:
cv.vocabulary_

{'quantum': 24,
 'computing': 10,
 'is': 15,
 'type': 35,
 'of': 18,
 'that': 31,
 'uses': 39,
 'bits': 4,
 'or': 21,
 'qubits': 25,
 'to': 34,
 'store': 29,
 'and': 1,
 'process': 23,
 'information': 14,
 'are': 2,
 'units': 37,
 'can': 6,
 'exist': 11,
 'in': 13,
 'multiple': 17,
 'states': 28,
 'simultaneously': 26,
 'unlike': 38,
 'classical': 8,
 'which': 40,
 'only': 20,
 'one': 19,
 'state': 27,
 'at': 3,
 'time': 33,
 'this': 32,
 'allows': 0,
 'computers': 9,
 'perform': 22,
 'certain': 7,
 'types': 36,
 'calculations': 5,
 'much': 16,
 'faster': 12,
 'than': 30}

In [44]:
X = cv.fit_transform(corpus).toarray()
X

array([[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 2, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 1, 1, 0, 2, 0, 1, 0, 0, 2, 0, 2, 1, 0, 0, 1, 1, 1, 1, 0,
        0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0]])

In [46]:
X[0]

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 2, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0])

In [47]:
# Binary Bag of Words
cv = CountVectorizer(binary = True)
X2 = cv.fit_transform(corpus).toarray()
X2

array([[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
        0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0]])