# Latent Semantic Analysis using Python

## Importing the Libraries

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Sample Data

In [2]:
dataset = ["The amount of polution is increasing day by day",
           "The concert was just great",
           "I love to see Gordon Ramsay cook",
           "Google is introducing a new technology",
           "AI Robots are examples of great technology present today",
           "All of us were singing in the concert",
           "We have launch campaigns to stop pollution and global warming"]

dataset = [line.lower() for line in dataset]

# Creating Tfidf Model

In [3]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

# Visualizing the Tfidf Model

In [4]:
print(X[0])

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 17)	0.3211483974289089
  (0, 19)	0.2665807498646048
  (0, 26)	0.3211483974289089
  (0, 24)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 34)	0.2278643877752444


# Creating the SVD

In [5]:
lsa = TruncatedSVD(n_components = 4, n_iter = 100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
             random_state=None, tol=0.0)

# First Column of V

In [6]:
row1 = lsa.components_[3]

# Visualizing the concepts

In [7]:
terms = vectorizer.get_feature_names()
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(terms,comp)
    sortedTerms = sorted(componentTerms,key=lambda x:x[1],reverse=True)
    sortedTerms = sortedTerms[:10]
    print("\nConcept",i,":")
    for term in sortedTerms:
        print(term)


Concept 0 :
('the', 0.3760982952926372)
('concert', 0.34498873923306594)
('great', 0.30012402589487364)
('of', 0.2957980609526662)
('just', 0.23736582929791245)
('was', 0.23736582929791245)
('day', 0.22892159541504487)
('technology', 0.18383834567413374)
('all', 0.17824025175628944)
('in', 0.17824025175628944)

Concept 1 :
('to', 0.41578844396700637)
('cook', 0.28359165793510643)
('gordon', 0.28359165793510643)
('love', 0.28359165793510643)
('ramsay', 0.28359165793510643)
('see', 0.28359165793510643)
('and', 0.217306447112925)
('campaigns', 0.217306447112925)
('global', 0.217306447112925)
('have', 0.217306447112925)

Concept 2 :
('technology', 0.37791806767144)
('is', 0.34196143806319823)
('google', 0.3413969441909744)
('introducing', 0.3413969441909744)
('new', 0.3413969441909744)
('day', 0.14112432680994724)
('are', 0.11387892195373053)
('examples', 0.11387892195373053)
('present', 0.11387892195373053)
('robots', 0.11387892195373053)

Concept 3 :
('day', 0.46542676790411114)
('by', 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk

# Sample Data
dataset = ["The amount of polution is increasing day by day",
           "The concert was just great",
           "I love to see Gordon Ramsay cook",
           "Google is introducing a new technology",
           "AI Robots are examples of great technology present today",
           "All of us were singing in the concert",
           "We have launch campaigns to stop pollution and global warming"]

dataset = [line.lower() for line in dataset]

# Creating Tfidf Model
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

# Visualizing the Tfidf Model
print(X[0])


# Creating the SVD
lsa = TruncatedSVD(n_components = 4, n_iter = 100)
lsa.fit(X)


# First Column of V
row1 = lsa.components_[3]


# Word Concept Dictionary Creation
concept_words = {}

# Visualizing the concepts
terms = vectorizer.get_feature_names()
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(terms,comp)
    sortedTerms = sorted(componentTerms,key=lambda x:x[1],reverse=True)
    sortedTerms = sortedTerms[:10]
    concept_words["Concept "+str(i)] = sortedTerms
    

# Sentence Concepts
for key in concept_words.keys():
    sentence_scores = []
    for sentence in dataset:
        words = nltk.word_tokenize(sentence)
        score = 0
        for word in words:
            for word_with_score in concept_words[key]:
                if word == word_with_score[0]:
                    score += word_with_score[1]
        sentence_scores.append(score)
    print("\n"+key+":")
    for sentence_score in sentence_scores:
        print(sentence_score)

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 17)	0.3211483974289089
  (0, 19)	0.2665807498646048
  (0, 26)	0.3211483974289089
  (0, 24)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 34)	0.2278643877752444

Concept 0:
1.1297395470753948
1.4959427190164025
0
0.18383834567413407
0.7797604325216746
1.37336559899095
0

Concept 1:
0
0
1.8337467336425428
0
0
0
1.2850142324187055

Concept 2:
0.6242100916830993
0
0
1.7440703383075653
0.8334337554863548
0
0

Concept 3:
2.2015937554478886
0.12724213180694446
0
0.21264455202449842
0
0.2965820743887412
0
