In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd



In [2]:
# Sample documents
documents = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs are great pets."
]

In [3]:
#initializing
vectorizer = TfidfVectorizer()

In [4]:
#fitting and transforming
tfidf_matrix = vectorizer.fit_transform(documents)

In [5]:
#converting to dense format and printing
dense_matrix = tfidf_matrix.todense()
print(dense_matrix)

[[0.         0.         0.42755362 0.         0.         0.
  0.         0.         0.42755362 0.32516555 0.         0.32516555
  0.6503311 ]
 [0.         0.         0.         0.         0.42755362 0.
  0.         0.42755362 0.         0.32516555 0.         0.32516555
  0.6503311 ]
 [0.40824829 0.40824829 0.         0.40824829 0.         0.40824829
  0.40824829 0.         0.         0.         0.40824829 0.
  0.        ]]


In [6]:
#getting the feature name
features_names = vectorizer.get_feature_names_out()
print(features_names)

['and' 'are' 'cat' 'cats' 'dog' 'dogs' 'great' 'log' 'mat' 'on' 'pets'
 'sat' 'the']


In [7]:
len(features_names)

13

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents
documents = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs are great pets."
]

# CountVectorizer
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)
print("CountVectorizer Matrix:\n", count_matrix.toarray())
print("Feature Names:\n", count_vectorizer.get_feature_names_out())

# TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print("TfidfVectorizer Matrix:\n", tfidf_matrix.toarray())
features_names = tfidf_vectorizer.get_feature_names_out()
print(features_names)

CountVectorizer Matrix:
 [[0 0 1 0 0 0 0 0 1 1 0 1 2]
 [0 0 0 0 1 0 0 1 0 1 0 1 2]
 [1 1 0 1 0 1 1 0 0 0 1 0 0]]
Feature Names:
 ['and' 'are' 'cat' 'cats' 'dog' 'dogs' 'great' 'log' 'mat' 'on' 'pets'
 'sat' 'the']
TfidfVectorizer Matrix:
 [[0.         0.         0.42755362 0.         0.         0.
  0.         0.         0.42755362 0.32516555 0.         0.32516555
  0.6503311 ]
 [0.         0.         0.         0.         0.42755362 0.
  0.         0.42755362 0.         0.32516555 0.         0.32516555
  0.6503311 ]
 [0.40824829 0.40824829 0.         0.40824829 0.         0.40824829
  0.40824829 0.         0.         0.         0.40824829 0.
  0.        ]]
['and' 'are' 'cat' 'cats' 'dog' 'dogs' 'great' 'log' 'mat' 'on' 'pets'
 'sat' 'the']


### How can I instantiate a TfidfVectorizer with the following parameters:
- max_df = 0.95
- min_df = 2
- max_features = no_features
- stop_words = 'english'




In [9]:
max_df = 0.95
min_df = 2
max_features = None
stop_words = 'english'

vectorizer = TfidfVectorizer(
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    stop_words=stop_words
)


Explanation of Parameters
- max_df = 0.95: Ignore terms that appear in more than 95% of the documents. This helps to remove extremely common words that are not informative.
- min_df = 2: Ignore terms that appear in fewer than 2 documents. This helps to remove very rare words that might not be useful for the analysis.
- max_features = None: This parameter can be set to a specific number to limit the number of features (terms) to be considered. Setting it to None means there is no limit on the number of features.
- stop_words = 'english': Use a built-in list of English stop words to exclude from the analysis. These are common words like "and", "the", etc., which typically do not carry meaningful information.

In [10]:
# Example usage with some documents
documents = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs are great pets."
]

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert the matrix to a dense format and print
dense_matrix = tfidf_matrix.todense()
print(dense_matrix)

# Get the feature names (terms)
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

[[1.]
 [1.]
 [0.]]
['sat']


### vectorizer objects methods

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs are great pets."
]

# Instantiate TfidfVectorizer with specific parameters
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert the matrix to a dense format
dense_matrix = tfidf_matrix.todense()
print("TF-IDF Matrix:\n", dense_matrix)

# Get feature names
feature_names = vectorizer.get_feature_names_out()
print("Feature Names:\n", feature_names)

# Get stop words
stop_words = vectorizer.get_stop_words()
print("Stop Words:\n", stop_words)

# Get parameters
params = vectorizer.get_params()
print("Parameters:\n", params)

# Set new parameters
vectorizer.set_params(max_features=10)
print("Updated Parameters:\n", vectorizer.get_params())

# Transform new documents using the fitted vectorizer
new_documents = [
    "The cat and the dog sat on the log.",
    "Great pets are cats and dogs."
]
new_tfidf_matrix = vectorizer.transform(new_documents)
new_dense_matrix = new_tfidf_matrix.todense()
print("New TF-IDF Matrix:\n", new_dense_matrix)

# Inverse transform the matrix
inverse_transformed = vectorizer.inverse_transform(new_tfidf_matrix)
print("Inverse Transformed:\n", inverse_transformed)

TF-IDF Matrix:
 [[1.]
 [1.]
 [0.]]
Feature Names:
 ['sat']
Stop Words:
 frozenset({'fire', 'couldnt', 'nowhere', 'before', 'thus', 'together', 'must', 'latter', 'or', 'thereafter', 'among', 'indeed', 'elsewhere', 'sixty', 'still', 'no', 'third', 'upon', 'thereupon', 'whence', 'thick', 'your', 'six', 'had', 'whenever', 'do', 'get', 'why', 'my', 'somehow', 'fifteen', 'becomes', 'over', 'mill', 'at', 'they', 'more', 'for', 'few', 'which', 'interest', 'these', 'the', 'perhaps', 'therefore', 'will', 'have', 'amount', 'co', 'us', 'may', 'eleven', 'everything', 'neither', 'down', 'further', 'beyond', 'being', 'moreover', 'whereafter', 'yourself', 'i', 'per', 'can', 'hereby', 'be', 'nobody', 'nine', 'amongst', 'cry', 'nevertheless', 'somewhere', 'even', 'this', 'however', 'themselves', 'very', 'though', 'a', 'else', 'bottom', 'although', 'he', 'always', 'least', 'am', 'never', 'four', 'it', 'every', 'one', 'whose', 'afterwards', 'eg', 'something', 'often', 'others', 'any', 'myself', 'ours', 'w

## NMF

Non-negative Matrix Factorization (NMF) is a group of algorithms in linear algebra where a given matrix 
𝑉
V is factorized into (usually) two matrices 
𝑊
W and 
𝐻
H, with the property that all three matrices have no negative elements. This non-negativity makes the resulting matrices easier to inspect, making NMF useful for applications where interpretability is important.

Key Concepts and Definitions
Non-negative Matrix: A matrix in which all the elements are zero or positive.
Factorization: The process of decomposing a matrix into two or more matrices that, when multiplied together, approximate the original matrix.
Given a matrix 
𝑉
V of size 
𝑚
×
𝑛
m×n:

𝑉
≈
𝑊
𝐻
V≈WH

where:

𝑊
W is an 
𝑚
×
𝑟
m×r matrix (usually called the basis matrix),
𝐻
H is an 
𝑟
×
𝑛
r×n matrix (usually called the coefficient matrix),
𝑟
r is the rank of the factorization, and 
𝑟
r is chosen to be less than both 
𝑚
m and 
𝑛
n.

In [12]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs are great pets.",
    "Pets can be very friendly and loyal.",
    "Cats are more independent than dogs.",
    "Dogs are known to be loyal and protective."
]

# Convert documents to TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(documents)

#apply NF
num_topics = 2
nmf_model = NMF(n_components=num_topics, random_state=42)
W = nmf_model.fit_transform(tfidf_matrix)
H = nmf_model.components_

#displaying the topis as H matrix has the topics and W has the relationships between document and topics
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(H):
    print(f"Topic {topic_idx}")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-6:-1]]))

#displaying the document topic matrix
#print(W)

Topic 0
dogs cats pets loyal great
Topic 1
sat mat cat log dog


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Sample documents
documents = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs are great pets.",
    "Pets can be very friendly and loyal.",
    "Cats are more independent than dogs.",
    "Dogs are known to be loyal and protective."
]

# Convert documents to a matrix of token counts
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Apply LDA
num_topics = 2
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(X)

# Get the top words for each topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda_model.components_):
    print(f"Topic {topic_idx}:")
    top_words_idx = topic.argsort()[:-6:-1]  # Get indices of top 5 words
    top_words = [feature_names[i] for i in top_words_idx]
    print(" ".join(top_words))


Topic 0:
dogs cats great known protective
Topic 1:
sat loyal pets mat cat


## Putting all this to work

In [56]:
def display_topics(model, feature_list):  
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        top_indices = topic.argsort()[:-6:-1]
        top_words = [feature_list[i] for i in top_indices]
        print(" ".join(top_words))

### NMF

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#import getch_20news groups from sklear.datasets
from sklearn.datasets import fetch_20newsgroups

#import NMF and LDA from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers','footers','quotes'))

documents = dataset.data

In [58]:
no_features = 100
no_topics = 100

In [59]:
vectorizer = TfidfVectorizer(
    max_df = 0.95,
    min_df = 2,
    max_features = no_features,
    stop_words = 'english'
)

In [60]:
tfidf_matrix = vectorizer.fit_transform(documents)

In [61]:
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['00' '10' '12' '14' '15' '16' '20' '25' 'a86' 'available' 'ax' 'b8f'
 'believe' 'best' 'better' 'bit' 'case' 'com' 'come' 'course' 'data' 'day'
 'did' 'didn' 'different' 'does' 'doesn' 'don' 'drive' 'edu' 'fact' 'far'
 'file' 'g9v' 'god' 'going' 'good' 'got' 'government' 'help' 'information'
 'jesus' 'just' 'key' 'know' 'law' 'let' 'like' 'line' 'list' 'little'
 'll' 'long' 'look' 'lot' 'mail' 'make' 'max' 'mr' 'need' 'new' 'number'
 'people' 'point' 'power' 'probably' 'problem' 'program' 'question' 'read'
 'really' 'right' 'run' 'said' 'say' 'second' 'set' 'software' 'space'
 'state' 'sure' 'tell' 'thanks' 'thing' 'things' 'think' 'time' 'true'
 'try' 'use' 'used' 'using' 've' 'want' 'way' 'windows' 'work' 'world'
 'year' 'years']


In [62]:
nmf_model = NMF(n_components=no_topics,random_state=42)

In [63]:
nmf_model.fit_transform(tfidf_matrix)

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.34138105e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.15384918e-01],
       [1.16885598e-08, 0.00000000e+00, 2.47453911e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 8.05880091e-21, 4.92613667e-21],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.18602982e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [64]:
H = nmf_model.components_
print(H)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 5.63513917e-06]
 [5.87119927e-08 0.00000000e+00 7.70919213e-09 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 7.20330403e-10 ... 0.00000000e+00
  1.24898125e-15 0.00000000e+00]]


In [65]:
len(feature_names)

100

In [66]:
display_topics(nmf_model,feature_names)

Topic 0:
people know mr 14 different
Topic 1:
does know 14 set different
Topic 2:
know does read question didn
Topic 3:
edu 14 file mr set
Topic 4:
just a86 things don years
Topic 5:
like mr read different 25
Topic 6:
just years good doesn don
Topic 7:
use max set don read
Topic 8:
thanks max set read file
Topic 9:
good mr different read let
Topic 10:
think don set read question
Topic 11:
god things don mr jesus
Topic 12:
problem 14 file question didn
Topic 13:
windows read set different 25
Topic 14:
drive max different mr set
Topic 15:
time max ll 10 let
Topic 16:
don different ll case question
Topic 17:
ve question didn going doesn
Topic 18:
bit max 10 file let
Topic 19:
com max mr 25 file
Topic 20:
need max 10 file ll
Topic 21:
used available file question years
Topic 22:
year 14 10 ll new
Topic 23:
right 10 question years going
Topic 24:
key make 14 want space
Topic 25:
make 14 ll let question
Topic 26:
mail file available let ll
Topic 27:
way ll let years going
Topic 28:
things sa

### LDA

In [67]:
count_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    stop_words='english'
)

In [68]:
count_matrix = count_vectorizer.fit_transform(documents)

In [69]:
#getting the feature name
feature_names = count_vectorizer.get_feature_names_out()
print(feature_names)

['00' '10' '12' '14' '15' '16' '20' '25' 'a86' 'available' 'ax' 'b8f'
 'believe' 'best' 'better' 'bit' 'case' 'com' 'come' 'course' 'data' 'day'
 'did' 'didn' 'different' 'does' 'doesn' 'don' 'drive' 'edu' 'fact' 'far'
 'file' 'g9v' 'god' 'going' 'good' 'got' 'government' 'help' 'information'
 'jesus' 'just' 'key' 'know' 'law' 'let' 'like' 'line' 'list' 'little'
 'll' 'long' 'look' 'lot' 'mail' 'make' 'max' 'mr' 'need' 'new' 'number'
 'people' 'point' 'power' 'probably' 'problem' 'program' 'question' 'read'
 'really' 'right' 'run' 'said' 'say' 'second' 'set' 'software' 'space'
 'state' 'sure' 'tell' 'thanks' 'thing' 'things' 'think' 'time' 'true'
 'try' 'use' 'used' 'using' 've' 'want' 'way' 'windows' 'work' 'world'
 'year' 'years']


In [70]:
lda = LatentDirichletAllocation(
    n_components=no_topics,
    random_state=42
)

In [71]:
lda.fit_transform(count_matrix)

array([[0.001     , 0.001     , 0.001     , ..., 0.001     , 0.001     ,
        0.001     ],
       [0.00111111, 0.00111111, 0.00111111, ..., 0.00111111, 0.00111111,
        0.00111111],
       [0.00111111, 0.00111111, 0.00111111, ..., 0.00111111, 0.00111111,
        0.00111111],
       ...,
       [0.00333333, 0.00333333, 0.00333333, ..., 0.00333333, 0.00333333,
        0.00333333],
       [0.002     , 0.002     , 0.002     , ..., 0.002     , 0.19482313,
        0.002     ],
       [0.0004    , 0.0004    , 0.0004    , ..., 0.0004    , 0.04021966,
        0.0004    ]])

In [72]:
display_topics(lda,feature_names)

Topic 0:
jesus god know way said
Topic 1:
edu mail like just people
Topic 2:
data use way just like
Topic 3:
jesus people come does make
Topic 4:
long just way time like
Topic 5:
ax max g9v b8f 25
Topic 6:
line ll just look like
Topic 7:
jesus god say did don
Topic 8:
mr going know don think
Topic 9:
law fact does people way
Topic 10:
ax b8f a86 g9v max
Topic 11:
state don say better way
Topic 12:
10 20 15 14 25
Topic 13:
best better good way probably
Topic 14:
key use like using don
Topic 15:
world better 20 information new
Topic 16:
right just way people like
Topic 17:
different like good just ll
Topic 18:
think don just know good
Topic 19:
day people way going good
Topic 20:
just going say good way
Topic 21:
bit 16 better max way
Topic 22:
file windows just use 20
Topic 23:
little just better good don
Topic 24:
use way does used know
Topic 25:
case way better like think
Topic 26:
people just don say government
Topic 27:
00 new don help ve
Topic 28:
key bit number used information
To

### Gensim and LDA

In [73]:
from gensim import corpora
from gensim.models import LdaModel
from pprint import pprint

In [74]:
#documents from above exercise

In [75]:
# Tokenize documents
tokenized_docs = [doc.split() for doc in documents]

In [76]:
#creating a dictionary of documents
dictionary = corpora.Dictionary(tokenized_docs)

In [77]:
#converting tokenized docs into a document topic matrix
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

In [78]:
#traing LDA model
num_topics = 2
lda_model = LdaModel(corpus, num_topics=num_topics,id2word=dictionary,passes=10)

pprint(lda_model.print_topics())

[(0,
  '0.019*"X" + 0.008*"-" + '
  '0.008*"MAX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'AX>\'" '
  '+ 0.008*"1" + 0.005*"0" + 0.005*"2" + 0.005*"*" + 0.004*"for" + 0.003*"and" '
  '+ 0.003*"*/"'),
 (1,
  '0.050*"the" + 0.027*"to" + 0.024*"of" + 0.021*"a" + 0.020*"and" + '
  '0.015*"is" + 0.015*"in" + 0.015*"I" + 0.013*"that" + 0.009*"for"')]
