In [1]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK "movie_reviews" corpus that the following cells read from
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/pvankessel/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
# Load the raw text of every review in the corpus into a one-column DataFrame
corpus = nltk.corpus.movie_reviews
rows = [{"text": corpus.raw(fileid)} for fileid in corpus.fileids()]
df = pd.DataFrame(rows)
print(len(df))

2000


In [4]:

# Build a binary document/n-gram matrix for the topic model.
# NOTE: although TfidfVectorizer is used, the settings below (binary=True,
# use_idf=False, norm=None) switch off every TF-IDF weighting step, so each
# entry is simply 1 if the n-gram occurs in the document and 0 otherwise.
vectorizer = TfidfVectorizer(
    max_df=0.5,          # Ignore terms that appear in more than 50% of the documents
    min_df=10,           # Ignore terms that appear in fewer than 10 documents
    max_features=None,   # Keep every term that passes the document-frequency filters
    ngram_range=(1, 2),  # Consider both unigrams (single words) and bigrams (pairs of words)
    norm=None,           # No normalization applied to the term vectors
    binary=True,         # Use binary occurrence instead of term frequency
    use_idf=False,       # Do not use Inverse Document Frequency (IDF) weighting
    sublinear_tf=False   # Do not apply sublinear term frequency scaling
)

# Learn the vocabulary and build the document-term matrix in one pass
# (avoids tokenizing the corpus twice). The name `tfidf` is kept because
# later cells refer to it.
tfidf = vectorizer.fit_transform(df['text'])

# Get the feature names (vocabulary) as a plain Python list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

# Print the number of features (vocabulary size)
print(len(vocab))


21886


In [5]:
from corextopic import corextopic as ct

In [6]:
# Baseline run: fit a CorEx topic model with 8 hidden topics and a fixed seed.
# No anchors are passed to fit() here; the empty list is only a placeholder.
anchors = []
model = ct.Corex(n_hidden=8, seed=42).fit(tfidf, words=vocab)

In [7]:
# Print the top 10 n-grams of every topic learned by the CorEx model, keeping
# only the n-grams that are positively associated with their topic.
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    # Each entry is a tuple whose first field is the n-gram text.
    # NOTE(review): older corextopic releases appear to return
    # (ngram, signed_weight) while newer ones return (ngram, weight, sign)
    # with a non-negative weight -- confirm against the installed version.
    # Reading the sign from the third field when it exists keeps the filter
    # meaningful in both cases; otherwise fall back to the second field.
    topic_ngrams = [
        ngram[0]
        for ngram in topic_ngrams
        if (ngram[2] if len(ngram) > 2 else ngram[1]) > 0
    ]
    print("Topic #{}: {}".format(i + 1, ", ".join(topic_ngrams)))


Topic #1: see, me, had, really, don, know, think, my, because, how
Topic #2: life, he is, both, never, it is, of his, that he, world, performance, to his
Topic #3: the first, the most, films, from the, many, by the, since, such, at the, while
Topic #4: comedy, funny, jokes, humor, laughs, funniest, the funniest, hilarious, the jokes, joke
Topic #5: young, opening, music, follow, portrayal, cinematography, mars, aspect, art, shown
Topic #6: murder, crime, thriller, police, killer, dead, the police, he has, turns, prison
Topic #7: plot, action, case, critique, the plot, suspense, none, blair witch, seem, cool
Topic #8: horror, horror film, scream, slasher, did last, horror films, scary, you did, williamson


In [8]:
# Anchors designed to nudge the model towards measuring specific genres
anchors = [
    ["action", "adventure"],
    ["drama"],
    ["comedy", "funny"],
    ["horror", "suspense"],
    ["animated", "animation"],
    ["sci fi", "alien"],
    ["romance", "romantic"],
    ["fantasy"]
]
# Filter the anchors to include only words present in the vocabulary
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchors
]

# Instantiate the Corex model with 8 hidden topics and a random seed for reproducibility
model = ct.Corex(n_hidden=8, seed=42)

# Fit the Corex model to the TF-IDF matrix, using the vocabulary and filtered anchors
model = model.fit(
    tfidf,        # The TF-IDF matrix generated from the text data
    words=vocab,  # The vocabulary list corresponding to the TF-IDF features
    anchors=anchors,  # Pass the filtered anchors to guide topic discovery
    anchor_strength=3 # Set the anchor strength to 3 to control reliance on anchors
)



In [9]:
# Print the top 10 n-grams of every topic from the anchored CorEx model,
# keeping only the n-grams that are positively associated with their topic.
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    # Each entry is a tuple whose first field is the n-gram text.
    # NOTE(review): older corextopic releases appear to return
    # (ngram, signed_weight) while newer ones return (ngram, weight, sign)
    # with a non-negative weight -- confirm against the installed version.
    # Use the explicit sign field when present, else the second field.
    topic_ngrams = [
        ngram[0]
        for ngram in topic_ngrams
        if (ngram[2] if len(ngram) > 2 else ngram[1]) > 0
    ]
    print("Topic #{}: {}".format(i + 1, ", ".join(topic_ngrams)))

Topic #1: action, adventure, the action, scenes, action sequences, where, action scenes, an action, action film, sequences
Topic #2: drama, performance, mother, director, both, while, and his, to his, role, performances
Topic #3: comedy, funny, jokes, laughs, humor, funny and, hilarious, very funny, gags, laugh
Topic #4: horror, really, think, had, me, did, how, see, because, were
Topic #5: animated, animation, disney, children, the animation, computer, adults, years, voice of, voice
Topic #6: alien, sci fi, effects, special effects, fi, aliens, sci, planet, special, earth
Topic #7: romantic, romance, she, love, with her, of her, that she, relationship, woman, romantic comedy
Topic #8: life, he is, fantasy, world, it is, that the, perhaps, point, does, through


## Add labels back to the DataFrame for topic attribution

In [10]:
# Create a DataFrame from the Corex model's topic assignments for the documents
topic_df = pd.DataFrame(
    model.transform(tfidf),  # Transform the TF-IDF matrix into topic associations
    columns=["topic_{}".format(i + 1) for i in range(8)]  # Name the columns for each of the 8 topics
).astype(float)  # Ensure the data type is float

# Set the index of topic_df to match the index of the original DataFrame df
topic_df.index = df.index

# Concatenate the original DataFrame df with the new topic_df along the columns
df = pd.concat([df, topic_df], axis=1)


In [11]:
# Preview 5 randomly chosen reviews with their topic columns;
# random_state is fixed so the same rows are shown on every run.
df.sample(5, random_state=42)

Unnamed: 0,text,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8
1860,the verdict : spine-chilling drama from horror...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
353,""" the 44 caliber killer has struck again . "" ...",0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1333,in the company of men made a splash at the sun...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
905,"in the year 2029 , captain leo davidson ( mark...",0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1289,[note that followups are directed to rec . art...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
