In [3]:
#! pip install scikit-learn
#! pip install nltk



In [4]:
import os
import glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

Data Import

Load the text corpus from a folder. In this run the folder contains two text documents, `Dream.txt` and `Gettysburg.txt`.

In [5]:
# Folder holding the raw speeches, resolved relative to the notebook's working directory.
DIR = os.path.join(os.getcwd(), 'Speeches')

# Enumerate the directory exactly once, in sorted order, so that docs[i] and
# ColNames[i] are guaranteed to describe the same file. Listing it twice
# (os.listdir for the text, glob for the names) can yield different orders or
# lengths, which would silently mislabel the columns of every DataFrame below.
speech_paths = sorted(glob.glob(os.path.join(DIR, '*.txt')))
if not speech_paths:
    raise FileNotFoundError("No .txt files found in " + DIR)

speech_docs = []
for speech_path in speech_paths:
    # `with` closes each file handle as soon as its text has been read.
    with open(speech_path) as speech_file:
        speech_docs.append(speech_file.read())

# docs: full text of each file; ColNames: the matching file names (used as column labels).
SpeechCorp = dict(docs = speech_docs,
                  ColNames = [os.path.basename(p) for p in speech_paths])

Then we can create the document-term matrix or term-document matrix using CountVectorizer() from scikit-learn. For details, you can go to https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html


In [6]:
# Learn the vocabulary from the corpus and count term occurrences per document.
# `x` is the document-term matrix: one row per document, one column per term.
corpus_docs = SpeechCorp['docs']
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(corpus_docs)

You can view the dataframe by calling its name, or view only the first 10 rows using "dt.head(10)".


You can also sort the dataframe using "dt.sort_values(by=['Dream.txt', 'Gettysburg.txt'], ascending=False).head(10)"

In [7]:
# View the matrix by converting it into a DataFrame. Transposing the
# document-term matrix gives one row per term and one column per file.
# get_feature_names_out() is used because get_feature_names() was removed in scikit-learn 1.2.
dt = pd.DataFrame(x.toarray().transpose(), index = vectorizer.get_feature_names_out(), columns = SpeechCorp['ColNames'])
dt

Unnamed: 0,Dream.txt,Gettysburg.txt
able,8,0
above,0,1
add,0,1
advanced,0,1
again,2,0
...,...,...
years,5,1
yes,1,0
york,2,0
you,8,0


In [8]:
# td is just the transpose of dt: one row per file, one column per term.
# get_feature_names_out() is used because get_feature_names() was removed in scikit-learn 1.2.
td = pd.DataFrame(x.toarray(), index = SpeechCorp['ColNames'], columns = vectorizer.get_feature_names_out())
td


Unnamed: 0,able,above,add,advanced,again,ago,ahead,alabama,all,alleghenies,...,work,world,would,wrongful,wrote,years,yes,york,you,your
Dream.txt,8,0,0,0,2,1,1,3,7,1,...,2,0,2,1,1,5,1,2,8,1
Gettysburg.txt,0,1,1,1,0,1,0,0,1,0,...,1,1,0,0,0,1,0,0,0,0


You can also remove the less frequent words from the matrix by controlling these parameters:

- min_df - the minimum document frequency allowed for a term in the document-term matrix.
- max_features - the maximum number of features allowed in the document-term matrix

In [9]:
# Keep only terms that appear in at least 2 documents, capped at 50 features.
vectorizer = CountVectorizer(min_df=2, max_features=50)
x = vectorizer.fit_transform(SpeechCorp['docs'])
# get_feature_names_out() is used because get_feature_names() was removed in scikit-learn 1.2.
dt1 = pd.DataFrame(x.toarray().transpose(), index = vectorizer.get_feature_names_out(), columns = SpeechCorp['ColNames'])
dt1

Unnamed: 0,Dream.txt,Gettysburg.txt
all,7,1
and,54,6
are,8,3
as,20,1
be,33,2
but,6,2
by,8,1
can,4,5
come,10,1
for,9,5


###### Text Preprocessing
Preprocessing is a critical first step. It involves several steps:

- Remove Punctuation (default)
- Lowercase for all characters (default)
- Remove White Spaces (optional)
- Remove Numbers (optional)
- Stemming
- Remove Stop Words
- Term Weighting

Note that removing punctuation and lowercasing the documents are done by default when using CountVectorizer from the scikit-learn library.

We will examine the rest of them one by one in this chapter.

The following commands perform various text preprocessing steps using the NLTK package. You can experiment with all of them to observe the differences in the transformed corpus.

##### Stemming

In [10]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
# Reuse CountVectorizer's default analyzer for preprocessing and tokenization
# (lowercasing and punctuation removal are its defaults), then stem its tokens.
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    """Return a generator of Snowball stems, one per token the default analyzer extracts from `doc`."""
    return (stemmer.stem(w) for w in analyzer(doc))

# `lowercase` is not passed here: with a callable analyzer scikit-learn skips its
# own preprocessing, so the flag would have no effect. Lowercasing already happens
# inside `analyzer` above.
vectorizer = CountVectorizer(analyzer=stemmed_words)
x = vectorizer.fit_transform(SpeechCorp['docs'])
# get_feature_names_out() is used because get_feature_names() was removed in scikit-learn 1.2.
dt = pd.DataFrame(x.toarray().transpose(), index = vectorizer.get_feature_names_out(), columns = SpeechCorp['ColNames'])
dt

Unnamed: 0,Dream.txt,Gettysburg.txt
abl,8,0
abov,0,1
add,0,1
advanc,0,1
again,2,0
...,...,...
year,5,1
yes,1,0
york,2,0
you,8,0


##### Remove Stop Words

In [11]:
# Drop scikit-learn's built-in English stop words before counting.
vectorizer = CountVectorizer(stop_words='english', lowercase = True)
x = vectorizer.fit_transform(SpeechCorp['docs'])
# get_feature_names_out() is used because get_feature_names() was removed in scikit-learn 1.2.
dt = pd.DataFrame(x.toarray().transpose(), index = vectorizer.get_feature_names_out(), columns = SpeechCorp['ColNames'])
dt

Unnamed: 0,Dream.txt,Gettysburg.txt
able,8,0
add,0,1
advanced,0,1
ago,1,1
ahead,1,0
...,...,...
wrongful,1,0
wrote,1,0
years,5,1
yes,1,0


##### Find Word Frequency

Counting word frequency is an important task in text analysis.

In [12]:
# Build a (word, total count) list over the whole corpus, stop words excluded.
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
x = vectorizer.fit_transform(SpeechCorp['docs'])

# Summing over axis 0 collapses the documents, leaving one total per vocabulary column.
sum_words = x.sum(axis=0)

# vocabulary_ maps each term to its column index; the list follows the mapping's
# iteration order, not frequency order.
words_freq = [
    (term, sum_words[0, column])
    for term, column in vectorizer.vocabulary_.items()
]
words_freq[:20]

[('happy', 1),
 ('join', 3),
 ('today', 9),
 ('history', 2),
 ('greatest', 1),
 ('demonstration', 1),
 ('freedom', 21),
 ('nation', 16),
 ('score', 2),
 ('years', 6),
 ('ago', 2),
 ('great', 8),
 ('american', 4),
 ('symbolic', 1),
 ('shadow', 1),
 ('stand', 3),
 ('signed', 1),
 ('emancipation', 1),
 ('proclamation', 1),
 ('momentous', 1)]

In [13]:
# Order the (word, count) pairs by count, most frequent first.
words_freq = sorted(
    words_freq,
    key = lambda pair: pair[1],
    reverse=True,
)
words_freq[:20]

[('freedom', 21),
 ('nation', 16),
 ('negro', 15),
 ('let', 13),
 ('day', 12),
 ('ring', 12),
 ('come', 11),
 ('dream', 11),
 ('today', 9),
 ('great', 8),
 ('long', 8),
 ('men', 8),
 ('justice', 8),
 ('shall', 8),
 ('satisfied', 8),
 ('able', 8),
 ('new', 7),
 ('years', 6),
 ('white', 6),
 ('people', 6)]

##### Term Weighting
Term weighting is commonly used to assign weights to terms to reflect their importance to the meaning of the text.

Several commonly used term weighting strategies are term frequency (tf), inverse document frequency(idf), tfidf.

Details:

Term frequency $tf_{i,j}$ counts the number of occurrences $n_{i,j}$ of a term $t_i$ in a document $d_j$. In the case of normalization, the term frequency $tf_{i,j}$ is divided by $\sum_k n_{k,j}$.

Inverse document frequency for a term $t_i$ is defined as $idf_i = \log_2 \frac{|D|}{|\{d \mid t_i \in d\}|}$, where $|D|$ denotes the total number of documents and $|\{d \mid t_i \in d\}|$ is the number of documents in which the term $t_i$ appears.

Term frequency - inverse document frequency is now defined as $tf_{i,j} \cdot idf_i$.


In [14]:
# Swap CountVectorizer for TfidfVectorizer to get TF-IDF weights instead of raw counts.
# For details see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# `stop_words` and `lowercase` are not passed: scikit-learn ignores both when
# `analyzer` is a callable, so the terms are exactly what stemmed_words yields
# (stemmed tokens, stop words retained).
vectorizer = TfidfVectorizer(analyzer=stemmed_words)
x = vectorizer.fit_transform(SpeechCorp['docs'])
# get_feature_names_out() is used because get_feature_names() was removed in scikit-learn 1.2.
tfidf = pd.DataFrame(x.toarray().transpose(), index = vectorizer.get_feature_names_out(), columns = SpeechCorp['ColNames'])
tfidf.head(10)

Unnamed: 0,Dream.txt,Gettysburg.txt
abl,0.055236,0.0
abov,0.0,0.040812
add,0.0,0.040812
advanc,0.0,0.040812
again,0.013809,0.0
ago,0.004913,0.029038
ahead,0.006905,0.0
alabama,0.020714,0.0
all,0.034388,0.029038
allegheni,0.006905,0.0
