In [20]:
!pip install nltk
!pip install scikit-learn




[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-win_amd64.whl (9.2 MB)
     ---------------------------------------- 9.2/9.2 MB 12.1 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.5.0
  Downloading scipy-1.11.1-cp310-cp310-win_amd64.whl (44.0 MB)
     --------------------------------------- 44.0/44.0 MB 31.2 MB/s eta 0:00:00
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.3.0 scipy-1.11.1 threadpoolctl-3.1.0



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Bag of Words and TF-IDF
Below, we'll look at three useful methods of vectorizing text.
- `CountVectorizer` - Bag of Words
- `TfidfTransformer` - TF-IDF values
- `TfidfVectorizer` - Bag of Words AND TF-IDF values

Let's first use an example from earlier and apply the text processing steps we saw in this lesson.

In [21]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on (packages that are
# already up to date are reported as such instead of being fetched again).
nltk.download('punkt')      # tokenizer models, presumably what word_tokenize loads
nltk.download('stopwords')  # word lists read by stopwords.words(...)
# WordNet data for WordNetLemmatizer; as the last expression of the cell,
# the return value of this call is what gets displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\higor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\higor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\higor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
# Five short example sentences; each one is treated as a separate document.
corpus = [
    "The first time you see The Second Renaissance it may look boring.",
    "Look at it at least twice and definitely watch part 2.",
    "It will change your view of the matrix.",
    "Are the human people the ones who started the war?",
    "Is AI a bad thing ?",
]

In [23]:
# English stop-word list from NLTK; printed so the words that will be
# filtered out are visible.
stop_words = stopwords.words("english")
print(stop_words)
# Lemmatizer instance shared by the `tokenize` function.
lemmatizer = WordNetLemmatizer()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Use the skills you learned so far to create a function `tokenize` that takes in a string of text and applies the following:
- case normalization (convert to all lowercase)
- punctuation removal
- tokenization, lemmatization, and stop word removal using `nltk`

Feel free to refer back to previous sections to complete these steps!

In [24]:
def tokenize(text):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Every character other than a-z, A-Z and 0-9 is replaced by a space, the
    result is lowercased and split with NLTK's ``word_tokenize``. Tokens found
    in the module-level ``stop_words`` are dropped; each remaining token is
    lemmatized first as a noun and then as a verb with the module-level
    ``lemmatizer``.

    Args:
        text: The string to process.

    Returns:
        A list of token strings, in the order they appear in ``text``.
    """
    # Strip punctuation (anything non-alphanumeric) before lowercasing.
    cleaned = re.sub(r'[^a-zA-Z0-9]',' ',text)
    cleaned = cleaned.lower()

    lemmas = []
    for word in word_tokenize(cleaned):
        # Stop words are filtered on the raw token, before any lemmatization.
        if word in stop_words:
            continue
        noun_form = lemmatizer.lemmatize(word)
        lemmas.append(lemmatizer.lemmatize(noun_form, pos='v'))

    return lemmas

In [25]:
# Show the raw sentences, then the cleaned token list produced for each one.
print(corpus)
for sentence in corpus:
    print(tokenize(sentence))
    

['The first time you see The Second Renaissance it may look boring.', 'Look at it at least twice and definitely watch part 2.', 'It will change your view of the matrix.', 'Are the human people the ones who started the war?', 'Is AI a bad thing ?']
['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'bore']
['look', 'least', 'twice', 'definitely', 'watch', 'part', '2']
['change', 'view', 'matrix']
['human', 'people', 'one', 'start', 'war']
['ai', 'bad', 'thing']


# `CountVectorizer` (Bag of Words)

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# initialize count vectorizer object
# token_pattern=None: the default regex is ignored once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" UserWarning when the vectorizer is fitted.
vect = CountVectorizer(tokenizer=tokenize, token_pattern=None)

In [29]:
# get counts of each token (word) in text data
# fit_transform builds the vocabulary from corpus and returns the per-sentence token counts
X = vect.fit_transform(corpus)
# printing the sparse result lists one "(row, column)  count" entry per stored value
print(X)

  (0, 6)	1
  (0, 20)	1
  (0, 17)	1
  (0, 16)	1
  (0, 15)	1
  (0, 11)	1
  (0, 9)	1
  (0, 3)	1
  (1, 9)	1
  (1, 8)	1
  (1, 21)	1
  (1, 5)	1
  (1, 24)	1
  (1, 13)	1
  (1, 0)	1
  (2, 4)	1
  (2, 22)	1
  (2, 10)	1
  (3, 7)	1
  (3, 14)	1
  (3, 12)	1
  (3, 18)	1
  (3, 23)	1
  (4, 1)	1
  (4, 2)	1
  (4, 19)	1


In [32]:
# vocabulary tokens, ordered by their column index in X
vect.get_feature_names_out()

array(['2', 'ai', 'bad', 'bore', 'change', 'definitely', 'first', 'human',
       'least', 'look', 'matrix', 'may', 'one', 'part', 'people',
       'renaissance', 'second', 'see', 'start', 'thing', 'time', 'twice',
       'view', 'war', 'watch'], dtype=object)

In [33]:
# convert sparse matrix to a dense numpy array to view it:
# one row per sentence in corpus, one column per vocabulary token
X.toarray()

array([[0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0]], dtype=int64)

In [34]:
# view the learned vocabulary: each token mapped to its column index in X
# (these integers are column positions, not occurrence counts)
vect.vocabulary_

{'first': 6,
 'time': 20,
 'see': 17,
 'second': 16,
 'renaissance': 15,
 'may': 11,
 'look': 9,
 'bore': 3,
 'least': 8,
 'twice': 21,
 'definitely': 5,
 'watch': 24,
 'part': 13,
 '2': 0,
 'change': 4,
 'view': 22,
 'matrix': 10,
 'human': 7,
 'people': 14,
 'one': 12,
 'start': 18,
 'war': 23,
 'ai': 1,
 'bad': 2,
 'thing': 19}

# `TfidfTransformer`

In [35]:
from sklearn.feature_extraction.text import TfidfTransformer

# initialize tf-idf transformer object
# smooth_idf=False turns off the default add-one smoothing of document frequencies
transformer = TfidfTransformer(smooth_idf=False)

In [37]:
# use counts from count vectorizer results to compute tf-idf values
tfidf = transformer.fit_transform(X)
# printed as "(row, column)  weight" entries, one per stored value
print(tfidf)

  (0, 20)	0.36419547310600114
  (0, 17)	0.36419547310600114
  (0, 16)	0.36419547310600114
  (0, 15)	0.36419547310600114
  (0, 11)	0.36419547310600114
  (0, 9)	0.2674539242255982
  (0, 6)	0.36419547310600114
  (0, 3)	0.36419547310600114
  (1, 24)	0.39105192975907627
  (1, 21)	0.39105192975907627
  (1, 13)	0.39105192975907627
  (1, 9)	0.28717647778015326
  (1, 8)	0.39105192975907627
  (1, 5)	0.39105192975907627
  (1, 0)	0.39105192975907627
  (2, 22)	0.5773502691896258
  (2, 10)	0.5773502691896258
  (2, 4)	0.5773502691896258
  (3, 23)	0.4472135954999579
  (3, 18)	0.4472135954999579
  (3, 14)	0.4472135954999579
  (3, 12)	0.4472135954999579
  (3, 7)	0.4472135954999579
  (4, 19)	0.5773502691896258
  (4, 2)	0.5773502691896258
  (4, 1)	0.5773502691896258


In [38]:
# convert sparse tf-idf matrix to a dense numpy array to view it
tfidf.toarray()

array([[0.        , 0.        , 0.        , 0.36419547, 0.        ,
        0.        , 0.36419547, 0.        , 0.        , 0.26745392,
        0.        , 0.36419547, 0.        , 0.        , 0.        ,
        0.36419547, 0.36419547, 0.36419547, 0.        , 0.        ,
        0.36419547, 0.        , 0.        , 0.        , 0.        ],
       [0.39105193, 0.        , 0.        , 0.        , 0.        ,
        0.39105193, 0.        , 0.        , 0.39105193, 0.28717648,
        0.        , 0.        , 0.        , 0.39105193, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.39105193, 0.        , 0.        , 0.39105193],
       [0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.57735027, 0.

# `TfidfVectorizer`
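`TfidfVectorizer` = `CountVectorizer` + `TfidfTransformer` (the two approaches give identical results only when they are configured with the same tokenizer, stop-word, and IDF-smoothing options)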

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# initialize tf-idf vectorizer object
# NOTE(review): only stop_words is passed here (no tokenizer=tokenize and no
# smooth_idf=False), so this does not reproduce the CountVectorizer +
# TfidfTransformer results computed earlier (24 columns here vs. 25 there);
# confirm whether that difference is intended.
vectorizer = TfidfVectorizer(stop_words=stop_words)

In [42]:
# compute bag of word counts and tf-idf values
# NOTE: this rebinds X, which previously held the CountVectorizer counts
X = vectorizer.fit_transform(corpus)
# prints a running position next to each vocabulary column index; the tokens
# themselves are not shown
# NOTE(review): presumably vocabulary_.items() was intended so that each token
# appears alongside its index; confirm
for a,b in enumerate(vectorizer.vocabulary_.values()):
    print(a,b)

0 5
1 19
2 16
3 15
4 14
5 10
6 8
7 2
8 7
9 20
10 4
11 23
12 12
13 3
14 21
15 9
16 6
17 13
18 11
19 17
20 22
21 0
22 1
23 18


In [43]:
# convert sparse matrix to a dense numpy array to view the tf-idf weights
# produced by TfidfVectorizer
X.toarray()

array([[0.        , 0.        , 0.36152912, 0.        , 0.        ,
        0.36152912, 0.        , 0.        , 0.29167942, 0.        ,
        0.36152912, 0.        , 0.        , 0.        , 0.36152912,
        0.36152912, 0.36152912, 0.        , 0.        , 0.36152912,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.42066906,
        0.        , 0.        , 0.42066906, 0.33939315, 0.        ,
        0.        , 0.        , 0.42066906, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.42066906, 0.        , 0.        , 0.42066906],
       [0.        , 0.        , 0.        , 0.57735027, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.        ],
       [0.   