In [7]:
import nltk
import pandas as pd

In [8]:
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Joe\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [9]:


rows = []
for fileid in nltk.corpus.movie_reviews.fileids():
    rows.append({"text": nltk.corpus.movie_reviews.raw(fileid)})
df = pd.DataFrame(rows)
print(len(df))



2000


In [10]:
df.head()

Unnamed: 0,text
0,"plot : two teen couples go to a church party ,..."
1,the happy bastard's quick movie review \ndamn ...
2,it is movies like these that make a jaded movi...
3,""" quest for camelot "" is warner bros . ' firs..."
4,synopsis : a mentally unstable man undergoing ...


In [11]:


from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_df=.5,
    min_df=10,
    max_features=None,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False
)
vectorizer = vectorizer.fit(df['text'])
tfidf = vectorizer.transform(df['text'])
vocab = vectorizer.get_feature_names()
print(len(vocab))



21886


In [12]:
from corextopic import corextopic as ct

In [13]:


anchors = []
model = ct.Corex(n_hidden=8, seed=42)
model = model.fit(
    tfidf,
    words=vocab
)



In [14]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: see, me, had, really, don, know, think, my, because, how
Topic #2: life, he is, both, never, it is, of his, that he, world, performance, to his
Topic #3: the first, the most, films, from the, many, by the, since, such, at the, while
Topic #4: opening, music, cinematography, art, follow, portrayal, mars, aspect, shown, herself
Topic #5: comedy, funny, humor, jokes, laughs, funniest, the funniest, hilarious, the jokes, joke
Topic #6: murder, crime, thriller, police, killer, the police, he has, turns, dead, prison
Topic #7: plot, action, case, critique, the plot, suspense, none, seem, blair witch, cool
Topic #8: horror, horror film, scream, slasher, did last, horror films, scary, you did, williamson


In [18]:
# Anchors designed to nudge the model towards measuring specific genres
anchors = [
    ["action", "adventure"],
    ["drama"],
    ["comedy", "funny"],
    ["horror", "suspense"],
    ["animated", "animation"],
    ["sci fi", "alien"],
    ["romance", "romantic"],
    ["fantasy"]
]
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchors
]

model = ct.Corex(n_hidden=8, seed=42)
model = model.fit(
    tfidf,
    words=vocab,
    anchors=anchors, # Pass the anchors in here
    anchor_strength=3 # Tell the model how much it should rely on the anchors
)

In [19]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))




Topic #1: action, adventure, the action, scenes, action sequences, where, action scenes, an action, action film, sequences
Topic #2: drama, performance, mother, director, both, while, and his, to his, role, performances
Topic #3: comedy, funny, jokes, laughs, humor, funny and, hilarious, very funny, gags, laugh
Topic #4: horror, really, think, had, me, did, how, see, because, were
Topic #5: animated, animation, disney, children, the animation, computer, adults, years, voice of, voice
Topic #6: alien, sci fi, effects, special effects, fi, aliens, sci, planet, special, earth
Topic #7: romantic, romance, she, love, with her, of her, that she, relationship, woman, romantic comedy
Topic #8: life, he is, fantasy, world, it is, that the, perhaps, point, does, through


In [20]:
topic_df = pd.DataFrame(
    model.transform(tfidf), 
    columns=["topic_{}".format(i+1) for i in range(8)]
).astype(float)
topic_df.index = df.index
df = pd.concat([df, topic_df], axis=1)

In [21]:
df.sample(5, random_state=42)

Unnamed: 0,text,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_1.1,topic_2.1,topic_3.1,topic_4.1,topic_5.1,topic_6.1,topic_7.1,topic_8.1
1860,the verdict : spine-chilling drama from horror...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
353,""" the 44 caliber killer has struck again . "" ...",0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1333,in the company of men made a splash at the sun...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
905,"in the year 2029 , captain leo davidson ( mark...",0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1289,[note that followups are directed to rec . art...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
