# Contents

- [Regular Expressions](#reg_ex)
- [NLTK](#nltk)  
- [TextBlob](#txt_blb)  
- [Vectorization](#vec)  
    - [CountVectorizer](#cnt_vec)  
    - [TFIDF Vectorizer](#tf_vec)  
- [Topic Modeling](#top_mod)  
    - [LDA](#lda)  
    - [NMF](#nmf)  
- [word2vec](#w2v)  

# Imports

In [1]:
import pandas as pd

# Data

In [2]:
from nltk.corpus import movie_reviews

In [3]:
# grab range of filenames from corpus
fileids = movie_reviews.fileids('pos')[:50]
# create list of words for each file
doc_words = [movie_reviews.words(fileid) for fileid in fileids]
# create list of sentences by combining words back into sentences for each file
pos_docs = [' '.join(words) for words in doc_words]

In [4]:
fileids = movie_reviews.fileids('neg')[:50]
doc_words = [movie_reviews.words(fileid) for fileid in fileids]
neg_docs = [' '.join(words) for words in doc_words]

In [5]:
documents = pos_docs + neg_docs

# Regular Expressions <a name="reg_ex"></a>

In [77]:
import re

In [112]:
sample_text = 'purple alice-b@google.com, blah monkey32 bob@abc.com blah dishwasher'

In [114]:
match = re.search(r'\d', sample_text)
match.group()

'3'

In [115]:
match = re.search(r'\d+', sample_text)
match.group()

'32'

In [117]:
match = re.search(r'\S+\d+', sample_text)
match.group()

'monkey32'

In [107]:
match = re.search(r'\w+@\w+', sample_text)
match.group()

'b@google'

In [108]:
match = re.search(r'[\w.-]+@[\w.-]+', sample_text)
match.group()

'alice-b@google.com'

In [120]:
match = re.findall(r'[\w\.-]+@[\w\.-]+', sample_text)
match

['alice-b@google.com', 'bob@abc.com']

# NLTK <a name="nltk"></a>

In [6]:
import nltk

## Sentence Tokenization
Break document into list of sentences

In [7]:
from nltk import tokenize

In [8]:
text_sample = documents[0]
text_sample

'films adapted from comic books have had plenty of success , whether they \' re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there \' s never really been a comic book like from hell before . for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \' 80s with a 12 - part series called the watchmen . to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . in other words , don \' t dismiss this film because of its source . if you can get past the whole comic book thing , you might find another stumbling block in from hell \' s directors , albert and allen hughes . getting the hughes brothers to direct this seems

In [9]:
sentences = tokenize.sent_tokenize(text_sample)
sentences[0]

"films adapted from comic books have had plenty of success , whether they ' re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there ' s never really been a comic book like from hell before ."

## Word Tokenization
Break sentence into list of words

In [10]:
words = tokenize.word_tokenize(sentences[0])
words

['films',
 'adapted',
 'from',
 'comic',
 'books',
 'have',
 'had',
 'plenty',
 'of',
 'success',
 ',',
 'whether',
 'they',
 "'",
 're',
 'about',
 'superheroes',
 '(',
 'batman',
 ',',
 'superman',
 ',',
 'spawn',
 ')',
 ',',
 'or',
 'geared',
 'toward',
 'kids',
 '(',
 'casper',
 ')',
 'or',
 'the',
 'arthouse',
 'crowd',
 '(',
 'ghost',
 'world',
 ')',
 ',',
 'but',
 'there',
 "'",
 's',
 'never',
 'really',
 'been',
 'a',
 'comic',
 'book',
 'like',
 'from',
 'hell',
 'before',
 '.']

## Stopwords
Words like "the", "and", "of", etc.

In [11]:
from nltk.corpus import stopwords

In [12]:
# create list of words to remove from corpus
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']
stop = set(stop)

In [13]:
# remove stopwords
cln_words = [w for w in words if w not in stop]
cln_words

['films',
 'adapted',
 'comic',
 'books',
 'plenty',
 'success',
 'whether',
 'superheroes',
 'batman',
 'superman',
 'spawn',
 'geared',
 'toward',
 'kids',
 'casper',
 'arthouse',
 'crowd',
 'ghost',
 'world',
 'never',
 'really',
 'comic',
 'book',
 'like',
 'hell']

## N-grams
Adjacent words found in text

In [14]:
from nltk.util import ngrams

In [15]:
# create word pairs
bigrams = ngrams(words, 2)
bigrams

<generator object ngrams at 0x7f58fb9ed990>

In [16]:
for gram in bigrams:
    print(gram)

('films', 'adapted')
('adapted', 'from')
('from', 'comic')
('comic', 'books')
('books', 'have')
('have', 'had')
('had', 'plenty')
('plenty', 'of')
('of', 'success')
('success', ',')
(',', 'whether')
('whether', 'they')
('they', "'")
("'", 're')
('re', 'about')
('about', 'superheroes')
('superheroes', '(')
('(', 'batman')
('batman', ',')
(',', 'superman')
('superman', ',')
(',', 'spawn')
('spawn', ')')
(')', ',')
(',', 'or')
('or', 'geared')
('geared', 'toward')
('toward', 'kids')
('kids', '(')
('(', 'casper')
('casper', ')')
(')', 'or')
('or', 'the')
('the', 'arthouse')
('arthouse', 'crowd')
('crowd', '(')
('(', 'ghost')
('ghost', 'world')
('world', ')')
(')', ',')
(',', 'but')
('but', 'there')
('there', "'")
("'", 's')
('s', 'never')
('never', 'really')
('really', 'been')
('been', 'a')
('a', 'comic')
('comic', 'book')
('book', 'like')
('like', 'from')
('from', 'hell')
('hell', 'before')
('before', '.')


In [17]:
trigrams = ngrams(words, 3)

In [18]:
for gram in trigrams:
    print(gram)

('films', 'adapted', 'from')
('adapted', 'from', 'comic')
('from', 'comic', 'books')
('comic', 'books', 'have')
('books', 'have', 'had')
('have', 'had', 'plenty')
('had', 'plenty', 'of')
('plenty', 'of', 'success')
('of', 'success', ',')
('success', ',', 'whether')
(',', 'whether', 'they')
('whether', 'they', "'")
('they', "'", 're')
("'", 're', 'about')
('re', 'about', 'superheroes')
('about', 'superheroes', '(')
('superheroes', '(', 'batman')
('(', 'batman', ',')
('batman', ',', 'superman')
(',', 'superman', ',')
('superman', ',', 'spawn')
(',', 'spawn', ')')
('spawn', ')', ',')
(')', ',', 'or')
(',', 'or', 'geared')
('or', 'geared', 'toward')
('geared', 'toward', 'kids')
('toward', 'kids', '(')
('kids', '(', 'casper')
('(', 'casper', ')')
('casper', ')', 'or')
(')', 'or', 'the')
('or', 'the', 'arthouse')
('the', 'arthouse', 'crowd')
('arthouse', 'crowd', '(')
('crowd', '(', 'ghost')
('(', 'ghost', 'world')
('ghost', 'world', ')')
('world', ')', ',')
(')', ',', 'but')
(',', 'but', 't

## Stemming
Returns root of words

In [19]:
# initialize stemmer
stemmer = nltk.stem.porter.PorterStemmer()

In [20]:
# stem each word in words list
for word in words:
    print(stemmer.stem(word))

film
adapt
from
comic
book
have
had
plenti
of
success
,
whether
they
'
re
about
superhero
(
batman
,
superman
,
spawn
)
,
or
gear
toward
kid
(
casper
)
or
the
arthous
crowd
(
ghost
world
)
,
but
there
'
s
never
realli
been
a
comic
book
like
from
hell
befor
.


## Lemmatization
Similar to stemming but less aggressive

In [21]:
# initialize lemma
lemma=nltk.stem.WordNetLemmatizer()

In [22]:
# lemmatize each word in words list
for word in words:
    print(lemma.lemmatize(word))

film
adapted
from
comic
book
have
had
plenty
of
success
,
whether
they
'
re
about
superheroes
(
batman
,
superman
,
spawn
)
,
or
geared
toward
kid
(
casper
)
or
the
arthouse
crowd
(
ghost
world
)
,
but
there
'
s
never
really
been
a
comic
book
like
from
hell
before
.


# TextBlob <a name="txt_blb"></a>

In [23]:
from textblob import TextBlob

In [24]:
# break input into sentences
TextBlob(text_sample).sentences

[Sentence("films adapted from comic books have had plenty of success , whether they ' re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there ' s never really been a comic book like from hell before ."),
 Sentence("for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid ' 80s with a 12 - part series called the watchmen ."),
 Sentence("to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd ."),
 Sentence("the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes ."),
 Sentence("in other words , don ' t dismiss this film because of its source ."),
 Sentence("if you can get past the whole comic book thing , you might find another stumbling block in from hell ' s directors ,

In [25]:
# break input into words
TextBlob(sentences[0]).words

WordList(['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', 'whether', 'they', 're', 'about', 'superheroes', 'batman', 'superman', 'spawn', 'or', 'geared', 'toward', 'kids', 'casper', 'or', 'the', 'arthouse', 'crowd', 'ghost', 'world', 'but', 'there', 's', 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before'])

In [26]:
# retrieve word counts
TextBlob(sentences[0]).word_counts

defaultdict(int,
            {'films': 1,
             'adapted': 1,
             'from': 2,
             'comic': 2,
             'books': 1,
             'have': 1,
             'had': 1,
             'plenty': 1,
             'of': 1,
             'success': 1,
             'whether': 1,
             'they': 1,
             're': 1,
             'about': 1,
             'superheroes': 1,
             'batman': 1,
             'superman': 1,
             'spawn': 1,
             'or': 2,
             'geared': 1,
             'toward': 1,
             'kids': 1,
             'casper': 1,
             'the': 1,
             'arthouse': 1,
             'crowd': 1,
             'ghost': 1,
             'world': 1,
             'but': 1,
             'there': 1,
             's': 1,
             'never': 1,
             'really': 1,
             'been': 1,
             'a': 1,
             'book': 1,
             'like': 1,
             'hell': 1,
             'before': 1})

In [27]:
# positive or negative polarity for document
TextBlob(text_sample).sentiment

Sentiment(polarity=0.02095457285330703, subjectivity=0.47927694668201004)

In [28]:
# positive or negative polarity for sentence
TextBlob(sentences[0]).sentiment

Sentiment(polarity=0.17500000000000002, subjectivity=0.3)

# Vectorization <a name="vec"></a>
Bag of Words (BOW)

## CountVectorizer <a name="cnt_vec"></a>
Returns word counts per document

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# create instance of CountVectorizer
vectorizer = CountVectorizer(stop_words='english', # remove inconsequential words
                             max_df=0.95, # ignore most frequent words (corpus-specific stopwords)
                             min_df=0.05 # ignore least frequent words (irrelevant words)
                            )

In [31]:
# learn vocabulary from input documents
vectorizer.fit(documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.05,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [32]:
# transform documents into document-term matrix
X = vectorizer.transform(documents)
X

In [33]:
# return terms from documents
cv_terms = vectorizer.get_feature_names()
cv_terms

['10',
 '13',
 '1999',
 '20',
 '30',
 'ability',
 'able',
 'absolutely',
 'accent',
 'act',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adaptation',
 'add',
 'addition',
 'adds',
 'adult',
 'adults',
 'african',
 'age',
 'agent',
 'ago',
 'al',
 'alan',
 'alien',
 'allen',
 'allowing',
 'amazing',
 'america',
 'american',
 'amusing',
 'angry',
 'anthony',
 'antics',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'approach',
 'appropriate',
 'appropriately',
 'aren',
 'army',
 'art',
 'artist',
 'ask',
 'asked',
 'asks',
 'assured',
 'atmosphere',
 'attempt',
 'attempts',
 'attention',
 'audience',
 'author',
 'award',
 'away',
 'background',
 'bad',
 'band',
 'based',
 'basically',
 'batman',
 'battle',
 'beat',
 'beats',
 'beautiful',
 'begin',
 'beginning',
 'begins',
 'believable',
 'believe',
 'ben',
 'best',
 'better',
 'big',
 'biggest',
 'billy',
 'bit',
 'bizarre',
 'black',
 'bland',
 'block',
 'blood',
 'blowing',


In [34]:
# return matrix of term counts per document
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [35]:
# create dataframe of document-term matrix
cv_df = pd.DataFrame(X.toarray(), columns=[cv_terms])
cv_df.head()

Unnamed: 0,10,13,1999,20,30,ability,able,absolutely,accent,act,...,writer,writing,written,wrong,yeah,year,years,york,young,younger
0,0,0,0,0,1,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,2,0,0,0,0
3,0,0,0,0,0,0,0,0,0,5,...,0,0,0,0,0,0,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Add lemmatizer

In [36]:
import nltk

In [37]:
lemma=nltk.stem.WordNetLemmatizer()
# lemmatize function
def lemma_func(document):
    return [lemma.lemmatize(word) for word in tokenize.word_tokenize(document)]

In [38]:
vectorizer = CountVectorizer(stop_words='english',
                             max_df=0.95,
                             min_df=0.05,
                             tokenizer=lemma_func
                            )

In [39]:
X_lem = vectorizer.fit_transform(documents)

In [40]:
cv_df_lem = pd.DataFrame(X_lem.toarray(), columns=[vectorizer.get_feature_names()])
cv_df_lem.head()

Unnamed: 0,!,$,&,*,+,--,/,1,10,13,...,write,writer,writing,written,wrong,yeah,year,york,young,younger
0,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,0,0,0,0,0,2,0,1,0,0,...,0,0,0,0,0,0,1,1,1,0
4,1,0,0,0,1,3,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## TFIDF Vectorizer <a name="tf_vec"></a>
Returns weights of words per document normalized against occurence in entire corpus

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# create instance of TFIDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', # remove inconsequential words
                             max_df=0.95, # ignore most frequent words (corpus-specific stopwords)
                             min_df=0.05 # ignore least frequent words (irrelevant words)
                            )

In [43]:
# learn vocabulary from input documents
vectorizer.fit(documents)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.05,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [44]:
# transform documents into document-term matrix
X = vectorizer.transform(documents)
X

In [45]:
# return terms from documents
tf_terms = vectorizer.get_feature_names()
tf_terms

['10',
 '13',
 '1999',
 '20',
 '30',
 'ability',
 'able',
 'absolutely',
 'accent',
 'act',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adaptation',
 'add',
 'addition',
 'adds',
 'adult',
 'adults',
 'african',
 'age',
 'agent',
 'ago',
 'al',
 'alan',
 'alien',
 'allen',
 'allowing',
 'amazing',
 'america',
 'american',
 'amusing',
 'angry',
 'anthony',
 'antics',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'approach',
 'appropriate',
 'appropriately',
 'aren',
 'army',
 'art',
 'artist',
 'ask',
 'asked',
 'asks',
 'assured',
 'atmosphere',
 'attempt',
 'attempts',
 'attention',
 'audience',
 'author',
 'award',
 'away',
 'background',
 'bad',
 'band',
 'based',
 'basically',
 'batman',
 'battle',
 'beat',
 'beats',
 'beautiful',
 'begin',
 'beginning',
 'begins',
 'believable',
 'believe',
 'ben',
 'best',
 'better',
 'big',
 'biggest',
 'billy',
 'bit',
 'bizarre',
 'black',
 'bland',
 'block',
 'blood',
 'blowing',


In [46]:
# return matrix of normalized term counts
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.08754269, ..., 0.        , 0.05128158,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.10143881, 0.06166595,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.04563867,
        0.        ]])

In [47]:
# create dataframe of document-term matrix
tf_df = pd.DataFrame(X.toarray(), columns=[tf_terms])
tf_df.head()

Unnamed: 0,10,13,1999,20,30,ability,able,absolutely,accent,act,...,writer,writing,written,wrong,yeah,year,years,york,young,younger
0,0.0,0.0,0.0,0.0,0.075444,0.0,0.0,0.0,0.150887,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.041974,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095799,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.119973,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2927,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0385,0.060557,0.036814,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062963,0.0,0.0


### Add lemmatizer

In [48]:
import nltk

In [49]:
lemma=nltk.stem.WordNetLemmatizer()
# lemmatize function
def lemma_func(document):
    return [lemma.lemmatize(word) for word in tokenize.word_tokenize(document)]

In [50]:
vectorizer = TfidfVectorizer(stop_words='english',
                             max_df=0.95,
                             min_df=0.05,
                             tokenizer=lemma_func
                            )

In [51]:
X_lem = vectorizer.fit_transform(documents)

In [52]:
tf_df_lem = pd.DataFrame(X_lem.toarray(), columns=[vectorizer.get_feature_names()])
tf_df_lem.head()

Unnamed: 0,!,$,&,*,+,--,/,1,10,13,...,write,writer,writing,written,wrong,yeah,year,york,young,younger
0,0.0,0.0,0.0,0.0,0.0,0.0,0.083274,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.041326,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.032651,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.058183,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09194,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.058141,0.0,0.045453,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.023646,0.048515,0.029493,0.0
4,0.03192,0.0,0.0,0.0,0.058591,0.105325,0.036144,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058591,0.0,0.0


# Topic Modeling <a name="top_mod"></a>

## LDA <a name="lda"></a>

In [53]:
from sklearn.decomposition import LatentDirichletAllocation

In [54]:
# create instance of model, input number of topics to output
lda = LatentDirichletAllocation(n_components=10)

In [55]:
# fit model to vectorized data
lda.fit(cv_df)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [56]:
cv_terms

['10',
 '13',
 '1999',
 '20',
 '30',
 'ability',
 'able',
 'absolutely',
 'accent',
 'act',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adaptation',
 'add',
 'addition',
 'adds',
 'adult',
 'adults',
 'african',
 'age',
 'agent',
 'ago',
 'al',
 'alan',
 'alien',
 'allen',
 'allowing',
 'amazing',
 'america',
 'american',
 'amusing',
 'angry',
 'anthony',
 'antics',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'approach',
 'appropriate',
 'appropriately',
 'aren',
 'army',
 'art',
 'artist',
 'ask',
 'asked',
 'asks',
 'assured',
 'atmosphere',
 'attempt',
 'attempts',
 'attention',
 'audience',
 'author',
 'award',
 'away',
 'background',
 'bad',
 'band',
 'based',
 'basically',
 'batman',
 'battle',
 'beat',
 'beats',
 'beautiful',
 'begin',
 'beginning',
 'begins',
 'believable',
 'believe',
 'ben',
 'best',
 'better',
 'big',
 'biggest',
 'billy',
 'bit',
 'bizarre',
 'black',
 'bland',
 'block',
 'blood',
 'blowing',


In [57]:
lda.components_

array([[ 0.28315945,  0.27124487,  0.27663735, ...,  0.32814043,
         0.69365713,  0.30075536],
       [ 0.33743709,  0.28640084,  0.29053656, ...,  1.06588167,
         0.55751277,  0.2693008 ],
       [ 0.27306237,  0.56303444,  0.31955965, ...,  1.79487395,
         0.35782819,  0.2972864 ],
       ...,
       [ 0.69281208,  0.28347893,  0.27546929, ...,  0.28672924,
         0.3288247 ,  0.29222668],
       [ 2.2946055 ,  1.88999603,  1.89690196, ...,  2.13648223,
         2.57253875,  1.59171561],
       [27.30312347,  5.47126936,  2.82615098, ...,  2.38196565,
        33.23373513,  2.8096273 ]])

In [58]:
# function to print top words of topic model
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        print(message)
        print("="*70)

In [59]:
print_top_words(lda, cv_terms, 25)


Topic #0:country political film story time movie african period like man right star told young history months known does bad new good end grant men know

Topic #1:like jackie movie steven guy ve film got money doesn secret country fight scenes didn relationship particularly movies special just lost things goes exactly block

Topic #2:carpenter midnight film looking red horror lots flashback train john writer work escape hollywood successful drug special science fiction effects planet called author york like

Topic #3:movie character like film big way time thing fine best performance life end john director good work scene stupid films characters actor cast action makes

Topic #4:john film scene fight best doesn hour removed films work better director carpenter die day brings taken white script big try lot boring flashback performance

Topic #5:film thing best time like fashion just end director action white face way shot movie scenes cast nearly tell race night interesting period actor

## NMF <a name="nmf"></a>

In [60]:
from sklearn.decomposition import NMF

In [61]:
# create instance of model, input number of topics to output
nmf = NMF(n_components=10)

In [62]:
# fit model to vectorized data
nmf.fit(tf_df)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=10, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [63]:
tf_terms

['10',
 '13',
 '1999',
 '20',
 '30',
 'ability',
 'able',
 'absolutely',
 'accent',
 'act',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adaptation',
 'add',
 'addition',
 'adds',
 'adult',
 'adults',
 'african',
 'age',
 'agent',
 'ago',
 'al',
 'alan',
 'alien',
 'allen',
 'allowing',
 'amazing',
 'america',
 'american',
 'amusing',
 'angry',
 'anthony',
 'antics',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'approach',
 'appropriate',
 'appropriately',
 'aren',
 'army',
 'art',
 'artist',
 'ask',
 'asked',
 'asks',
 'assured',
 'atmosphere',
 'attempt',
 'attempts',
 'attention',
 'audience',
 'author',
 'award',
 'away',
 'background',
 'bad',
 'band',
 'based',
 'basically',
 'batman',
 'battle',
 'beat',
 'beats',
 'beautiful',
 'begin',
 'beginning',
 'begins',
 'believable',
 'believe',
 'ben',
 'best',
 'better',
 'big',
 'biggest',
 'billy',
 'bit',
 'bizarre',
 'black',
 'bland',
 'block',
 'blood',
 'blowing',


In [64]:
nmf.components_

array([[0.        , 0.01589998, 0.00386707, ..., 0.00817008, 0.08498023,
        0.        ],
       [0.        , 0.02288594, 0.01792256, ..., 0.        , 0.04592642,
        0.        ],
       [0.76235773, 0.00441619, 0.        , ..., 0.        , 0.07681473,
        0.        ],
       ...,
       [0.        , 0.        , 0.03690171, ..., 0.        , 0.        ,
        0.03637616],
       [0.        , 0.        , 0.01642367, ..., 0.0330699 , 0.03136671,
        0.        ],
       [0.        , 0.        , 0.00242761, ..., 0.00829846, 0.04989281,
        0.        ]])

In [65]:
# function to print top words of topic model
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        print(message)
        print("="*70)

In [66]:
print_top_words(nmf, tf_terms, 25)


Topic #0:film story people character city like men life history man black political shot begins matter makes great different country american look excellent just scenes make

Topic #1:movie like effects does star guy bad really didn special ve know think worst just computer seen wars doesn planet course people generated earth audience

Topic #2:10 film music band pretty movie just teen nice critique thing real cool dreams coming watching make simply biggest loves definitely don mind certain work

Topic #3:carpenter flashback john horror film red looking planet alien 13 fiction effects science train lots humans williams society escape special police somewhat control menace work

Topic #4:school comedy film high gets drive kids end like significant popular mr race funny going hand party fun watch crazy sex really teens boys teen

Topic #5:drug scene john midnight world friends eyes day living elizabeth art successful lives mother big expect new sex hopkins time play playing life michael

# word2vec <a name="w2v"></a>
Find similarity between words based on given corpus

In [67]:
import gensim

## Preprocess

In [68]:
from nltk.corpus import stopwords
from nltk import tokenize

In [69]:
stoplist = stopwords.words('english')
stoplist += ['.', ',', '(', ')', "'", '"']
stoplist = set(stop)

In [70]:
doc_words = [[word for word in tokenize.word_tokenize(document.lower()) if word not in stoplist]
         for document in documents]
doc_words[0:5]

[['films',
  'adapted',
  'comic',
  'books',
  'plenty',
  'success',
  'whether',
  'superheroes',
  'batman',
  'superman',
  'spawn',
  'geared',
  'toward',
  'kids',
  'casper',
  'arthouse',
  'crowd',
  'ghost',
  'world',
  'never',
  'really',
  'comic',
  'book',
  'like',
  'hell',
  'starters',
  'created',
  'alan',
  'moore',
  'eddie',
  'campbell',
  'brought',
  'medium',
  'whole',
  'new',
  'level',
  'mid',
  '80s',
  '12',
  '-',
  'part',
  'series',
  'called',
  'watchmen',
  'say',
  'moore',
  'campbell',
  'thoroughly',
  'researched',
  'subject',
  'jack',
  'ripper',
  'would',
  'like',
  'saying',
  'michael',
  'jackson',
  'starting',
  'look',
  'little',
  'odd',
  'book',
  '``',
  'graphic',
  'novel',
  '``',
  '500',
  'pages',
  'long',
  'includes',
  'nearly',
  '30',
  'consist',
  'nothing',
  'footnotes',
  'words',
  'dismiss',
  'film',
  'source',
  'get',
  'past',
  'whole',
  'comic',
  'book',
  'thing',
  'might',
  'find',
  'ano

## Model

### Continuous Bag of Words (CBOW)
Predicts word from context

In [71]:
# initialize model
model = gensim.models.Word2Vec(doc_words, size=100, window=5, min_count=1, workers=2)

In [72]:
# find similar words in corpus
model.wv.most_similar(positive='director')

  if np.issubdtype(vec.dtype, np.int):


[('movie', 0.9913743734359741),
 ('one', 0.9911948442459106),
 ('characters', 0.9911342859268188),
 ('film', 0.9910649657249451),
 ('much', 0.9906944036483765),
 ('-', 0.9906554222106934),
 ('``', 0.9904359579086304),
 ('--', 0.9903361797332764),
 ('like', 0.990320086479187),
 ('get', 0.9903019666671753)]

In [73]:
# rate similarity between two words in corpus
model.wv.similarity(w1='director', w2='actor')

  if np.issubdtype(vec.dtype, np.int):


0.9819221

### skip-gram
Predicts context from word

In [74]:
# initialize model
model = gensim.models.Word2Vec(doc_words, size=100, window=10, min_count=1, workers=2, sg=1)

In [75]:
# find similar words in corpus
model.wv.most_similar(positive='director')

  if np.issubdtype(vec.dtype, np.int):


[('writer', 0.9993840456008911),
 ('replacement', 0.9992160201072693),
 ('hidden', 0.9992149472236633),
 ('4', 0.9991885423660278),
 ('crouching', 0.9991792440414429),
 ('dragon', 0.9991413950920105),
 ('detroit', 0.9991388320922852),
 ('others', 0.9990844130516052),
 ('tiger', 0.9990746378898621),
 ('austin', 0.999071478843689)]

In [76]:
# rate similarity between two words in corpus
model.wv.similarity(w1='director', w2='actor')

  if np.issubdtype(vec.dtype, np.int):


0.99388874