# Semantic model

MOTIVATION: Extract meaningful patterns.
Learn how to use some of the most common libraries

In [2]:
from sklearn.datasets import fetch_20newsgroups

# We filter only some categories, otherwise we have 20 categories
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# We remove metadata to avoid bias in the classification. 
# Get categories info
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'), 
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                    categories=categories)


# Obtain a vector

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', min_df=10)

vectors_train = vectorizer.fit_transform(newsgroups_train.data)
vectors_train.shape

(2034, 2807)

We are going to use the gensim library instead of Scikit

In [6]:
from gensim import matutils

vocab = vectorizer.get_feature_names()

dictionary = dict([(i,s) for i, s in enumerate(vectorizer.get_feature_names())])
corpus_tfidf = matutils.Sparse2Corpus(vectors_train)


LDA implementation. It is possible to use scikit LDA implementation or gensim. 

In [7]:
from gensim.models.ldamodel import LdaModel

# Train lda model. 4 topics
lda = LdaModel(corpus_tfidf, num_topics=4, passes = 20, id2word=dictionary)

In [8]:
# Check topics
lda.print_topics(4)

[(0,
  '0.006*car + 0.006*england + 0.005*closely + 0.005*beginning + 0.005*groups + 0.005*center + 0.005*maintain + 0.005*evil + 0.005*archie + 0.005*dealing'),
 (1,
  '0.003*platform + 0.003*frank + 0.003*mechanism + 0.003*led + 0.003*kind + 0.003*month + 0.003*forces + 0.003*profit + 0.002*fellow + 0.002*contradict'),
 (2,
  '0.004*primary + 0.004*mar + 0.004*contains + 0.004*deleted + 0.004*geometry + 0.004*produced + 0.004*deep + 0.004*believe + 0.004*ok + 0.004*animal'),
 (3,
  '0.003*objects + 0.003*activity + 0.002*manhattan + 0.002*obtained + 0.002*eyes + 0.002*color + 0.002*netters + 0.002*complex + 0.002*missions + 0.002*education')]

Now we need to translate the corpus previously defined in scikit to LSI. So, create corpus with Gensim.

In [11]:
# 1. Import the gensim.corpora module to generate dictionary
from gensim import corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
import string

def preprocess(words):
    tokenizer = RegexpTokenizer('[A-Z]\w+')
    tokens = [w.lower() for w in tokenizer.tokenize(words)]
    stoplist = stopwords.words('english')
    tokens_stop = [w for w in tokens if w not in stoplist]
    punctuation = set(string.punctuation)
    tokens_clean = [w for w in tokens_stop if w not in punctuation]
    return tokens_clean

texts = [preprocess(document) for document in newsgroups_train.data]
dictionary = corpora.Dictionary(texts)

In [14]:
# Save dictionary
dictionary.save('newsgroups.dict')
print(dictionary)

Dictionary(10913 unique tokens: ['image32', 'epa', 'se400', 'easily', 'learning']...)


In [15]:
# Generate a list of docs where each doc is a list of words
docs = [preprocess(doc) for doc in newsgroups_train.data]

In [16]:
# Gensim corpora module to generate dictionary
from gensim import corpora
dictionary = corpora.Dictionary(docs)
print(dictionary)

Dictionary(10913 unique tokens: ['image32', 'epa', 'se400', 'easily', 'learning']...)


In [17]:
# Construct the corpus representing each document as a bag-of-words
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [19]:
from gensim.models import TfidfModel
# Calculate tfidfmodel
tfidf_model = TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]
print(corpus_tfidf[0])

[(0, 0.24093628445650234), (1, 0.1598114653031772), (2, 0.5700978153855775), (3, 0.10438175896914427), (4, 0.722808853369507), (5, 0.24093628445650234)]


In [35]:
from gensim.models.ldamodel import LdaModel

# train the lda model, choosing number of topics equal to 4, it takes a long time

lda_model = LdaModel(corpus_tfidf, num_topics=4, passes=20, id2word=dictionary)

In [36]:
# check the topics
lda_model.print_topics(4)

[(0,
  '0.008*god + 0.007*moon + 0.007*cheers + 0.007*kent + 0.006*lucky + 0.006*khomeini + 0.006*davidian + 0.005*bob + 0.005*phobos + 0.005*unfortunately'),
 (1,
  '0.009*baptist + 0.007*ns + 0.007*jeff + 0.007*mary + 0.006*crusades + 0.006*basically + 0.005*gerald + 0.005*bull + 0.005*really + 0.003*regardless'),
 (2,
  '0.007*koresh + 0.006*bible + 0.006*zoroastrians + 0.006*joy + 0.005*ssrt + 0.005*samaritan + 0.005*happy + 0.004*septuagint + 0.004*virtual + 0.004*reality'),
 (3,
  '0.011*targa + 0.010*whatever + 0.009*islam + 0.008*western + 0.008*thanks + 0.008*plane + 0.007*lot + 0.006*yayayay + 0.006*craig + 0.005*windows')]

In [37]:
# check the lsa vector for the first document
corpus_lda = lda_model[corpus_tfidf]
print(corpus_lda[0])

[(0, 0.7499544926225231), (1, 0.0829040434822071), (2, 0.08314424809633292), (3, 0.08399721579893679)]


In [39]:
#predict topics of a new doc
new_doc = "God is love and God is the Lord"
#transform into BOW space
bow_vector = dictionary.doc2bow(preprocess(new_doc))
print([(dictionary[id], count) for id, count in bow_vector])

[('lord', 1), ('god', 2)]


In [40]:
#transform into LDA space
lda_vector = lda_model[bow_vector]
print(lda_vector)

[(0, 0.8104287254521839), (1, 0.06352717134948714), (2, 0.06335332148949119), (3, 0.06269078170883781)]


In [41]:
# Print document's single most prominent LDA topic
print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))

0.008*god + 0.007*moon + 0.007*cheers + 0.007*kent + 0.006*lucky + 0.006*khomeini + 0.006*davidian + 0.005*bob + 0.005*phobos + 0.005*unfortunately


In [42]:
lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]
print(lda_vector_tfidf)
# print the document's single most prominent LDA topic
print(lda_model.print_topic(max(lda_vector_tfidf, key=lambda item: item[1])[0]))

[(0, 0.684280595038636), (1, 0.10542650716029629), (2, 0.10592885927926182), (3, 0.10436403852180587)]
0.008*god + 0.007*moon + 0.007*cheers + 0.007*kent + 0.006*lucky + 0.006*khomeini + 0.006*davidian + 0.005*bob + 0.005*phobos + 0.005*unfortunately
