In [11]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

#### Create a list of documents

In [12]:
docs = [
"This is the first document.",
"This is the second document. But this is not the first document.",
"This is the third document. It is neither the first or the second document.",
"Is this the last document?"
]

#### Create a bag-of-words representation of these documents

In [13]:
# create a CountVectorizer instance
vectorizer = CountVectorizer()

# transform the docs into vectors of word frequency
X = vectorizer.fit_transform(docs)

# convert the dense matrix to a data frame
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())

# print the data frame
df_tf

Unnamed: 0,but,document,first,is,it,last,neither,not,or,second,the,third,this
0,0,1,1,1,0,0,0,0,0,0,1,0,1
1,1,2,1,2,0,0,0,1,0,1,2,0,2
2,0,2,1,2,1,0,1,0,1,1,3,1,1
3,0,1,0,1,0,1,0,0,0,0,1,0,1


In [14]:
# examine the mapping of words to feature indexes
vectorizer.vocabulary_

{'this': 12,
 'is': 3,
 'the': 10,
 'first': 2,
 'document': 1,
 'second': 9,
 'but': 0,
 'not': 7,
 'third': 11,
 'it': 4,
 'neither': 6,
 'or': 8,
 'last': 5}

In [15]:
# X by default is a sparse matrix
# the todense() method will convert X to a dense matrix
X.todense()

matrix([[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [1, 2, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 2],
        [0, 2, 1, 2, 1, 0, 1, 0, 1, 1, 3, 1, 1],
        [0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [16]:
# examine the parameters used by the CountVectorizer instance
vectorizer

For detailed usage, see http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

#### Use term presence instead of term frequency

In [17]:
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,but,document,first,is,it,last,neither,not,or,second,the,third,this
0,0,1,1,1,0,0,0,0,0,0,1,0,1
1,1,1,1,1,0,0,0,1,0,1,1,0,1
2,0,1,1,1,1,0,1,0,1,1,1,1,1
3,0,1,0,1,0,1,0,0,0,0,1,0,1


<b>TF-IDF</b><p>In Scikit-Learn, the actual formula used for tf-idf is tf * (idf + 1) = tf + tf * idf, instead of tf * idf. The effect of this is that terms with zero idf, i.e. that occur in all documents of a training set, will not be entirely ignored.</p>
<p>L1 normalization: TF is normalized using the sum of all TFs in a document</p>
<p>L2 normalization: TF is normalized using the root of the squared sum of all TFs in a document</p>

In [18]:
vectorizer = TfidfVectorizer(norm="l1")
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,but,document,first,is,it,last,neither,not,or,second,the,third,this
0,0.0,0.191456,0.234178,0.191456,0.0,0.0,0.0,0.0,0.0,0.0,0.191456,0.0,0.191456
1,0.131554,0.137301,0.083969,0.137301,0.0,0.0,0.0,0.131554,0.0,0.103719,0.137301,0.0,0.137301
2,0.0,0.108701,0.066478,0.108701,0.104151,0.0,0.104151,0.0,0.104151,0.082114,0.163051,0.104151,0.05435
3,0.0,0.169025,0.0,0.169025,0.0,0.323901,0.0,0.0,0.0,0.0,0.169025,0.0,0.169025


In [19]:
df_tf.apply(np.sum, axis=1)

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float64

The sum of TFs for each document is 1 under L1 normalization

In [20]:
vectorizer = TfidfVectorizer(norm=None)
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,but,document,first,is,it,last,neither,not,or,second,the,third,this
0,0.0,1.0,1.223144,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.916291,2.0,1.223144,2.0,0.0,0.0,0.0,1.916291,0.0,1.510826,2.0,0.0,2.0
2,0.0,2.0,1.223144,2.0,1.916291,0.0,1.916291,0.0,1.916291,1.510826,3.0,1.916291,1.0
3,0.0,1.0,0.0,1.0,0.0,1.916291,0.0,0.0,0.0,0.0,1.0,0.0,1.0


#### Including bigrams and trigrams

In [21]:
docs = [
"This is the first document.", # "This is", "is the", "the first", "first document", "This is the","is the first", "the first document"
"This is the second document. But this is not the first document.",
"This is the third document. It is neither the first or the second document.",
"Is this the last document?"
]

In [22]:
vectorizer = CountVectorizer(ngram_range=(1,3))
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,but,but this,but this is,document,document but,document but this,document it,document it is,first,first document,...,the third document,third,third document,third document it,this,this is,this is not,this is the,this the,this the last
0,0,0,0,1,0,0,0,0,1,1,...,0,0,0,0,1,1,0,1,0,0
1,1,1,1,2,1,1,0,0,1,1,...,0,0,0,0,2,2,1,1,0,0
2,0,0,0,2,0,0,1,1,1,0,...,1,1,1,1,1,1,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1


Now, there are 59 features.

In [33]:
vectorizer = CountVectorizer(ngram_range=(3,3))
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,but this is,document but this,document it is,first or the,is neither the,is not the,is the first,is the second,is the third,is this the,...,second document but,the first document,the first or,the last document,the second document,the third document,third document it,this is not,this is the,this the last
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,1,0,0,0,1,0,1,0,0,...,1,1,0,0,1,0,0,1,1,0
2,0,0,1,1,1,0,0,0,1,0,...,0,0,1,0,1,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1


#### Removing stopwords

### Removing stop words

In [30]:
vectorizer = CountVectorizer(ngram_range=(1,3), stop_words="english")
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,document,document document,document second,document second document,second,second document,second document document
0,1,0,0,0,0,0,0
1,2,1,0,0,1,1,1
2,2,0,1,1,1,1,0
3,1,0,0,0,0,0,0


In [31]:
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words="english")
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,document document,document second,second document
0,0,0,0
1,1,0,1
2,0,1,1
3,0,0,0


In [32]:
word_list=['is','but']
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words=word_list)
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,document it,document this,first document,first or,it neither,last document,neither the,not the,or the,second document,the first,the last,the second,the third,third document,this not,this the
0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,0,1,1,0,0,0,0,1,0,1,1,0,1,0,0,1,1
2,1,0,0,1,1,0,1,0,1,1,1,0,1,1,1,0,1
3,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1


The number of features has reduced greatly.

#### Using a fixed vocabulary

In [34]:
vocab = ["first", "second", "third", "fourth"]
vectorizer = CountVectorizer(vocabulary=vocab)
X = vectorizer.fit_transform(docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
df_tf

Unnamed: 0,first,second,third,fourth
0,1,0,0,0
1,1,1,0,0
2,1,1,1,0
3,0,0,0,0


POS Tagging

In [38]:
# download nltk data first
nltk.download()
"""
After executing this cell, a window will pop up (NLTK Downloader).
You can select to downloand all packages.
This will take a few minutes.
Then, close the window.
"""

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


KeyboardInterrupt: 

: 

In [None]:
stopwords = set(nltk.corpus.stopwords.words("english"))
stopwords

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\juane/nltk_data'
    - 'c:\\Users\\juane\\.pyenv\\pyenv-win\\versions\\3.10.0\\nltk_data'
    - 'c:\\Users\\juane\\.pyenv\\pyenv-win\\versions\\3.10.0\\share\\nltk_data'
    - 'c:\\Users\\juane\\.pyenv\\pyenv-win\\versions\\3.10.0\\lib\\nltk_data'
    - 'C:\\Users\\juane\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
# convert words to lower case
lower = map(str.lower, docs)

# replace punctuation with space
no_punc = map(lambda x: re.sub("[^a-z]", " ", x), lower)

# tokenize each document
tokenized = map(nltk.word_tokenize, no_punc)

# pos tag teach document
tagged = map(nltk.pos_tag, tokenized)

# remove stopwords
# stopwords can only be removed after POS tags are generated. Otherwise, it will influence the POS tagging results.
stopwords = nltk.corpus.stopwords.words("english")
def remove_stopwords(doc):
    out = []
    for word in doc:
        if word[0] not in stopwords: out.append(word)
    return out

no_stopwords = list(map(remove_stopwords, tagged))

no_stopwords

In [None]:
docs = [
"This is the first document.",
"This is the second document. But this is not the first document.",
"This is the third document. It is neither the first or the second document.",
"Is this the last document?"
]

Tag description: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [None]:
# convert the lists of tagged words to string so that scikit-learn can tokenize them
tagged_docs = list(map(str, no_stopwords))
tagged_docs

In [None]:
# vecotrize
vectorizer = CountVectorizer(token_pattern=r"\('[^ ]+', '[^ ]+'\)", lowercase=False)
X = vectorizer.fit_transform(tagged_docs)
df_tf = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())
df_tf