# 1. Import Libraries

In [None]:
!pip uninstall -y gensim numpy scipy

In [1]:
!pip install gensim==4.3.1 numpy==1.23.5 scipy==1.10.1



In [16]:
!pip install -q spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import math
import random

import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
from nltk.corpus import stopwords

import spacy
nlp = spacy.load("en_core_web_sm")

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold, cross_val_score

from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Load the Dataset

In [2]:
path = '/content/movie_genre_classification_final.csv'
dataset = pd.read_csv(path)
print("Dataset Loaded:\n", dataset.head())

Dataset Loaded:
              Title  Year  Director  Duration  Rating   Votes  \
0  Winds of Fate 4  1980    R. Lee       167     4.1  182425   
1     Firestorm 11  2014   S. Chen       166     4.1  449351   
2    Silent Echo 2  2016   A. Khan       170     4.1  363328   
3    City Lights 4  1982  L. Zhang       170     9.9   62371   
4   Broken Truth 1  1990  L. Zhang        91     5.3    4600   

                                         Description  Language Country  \
0   A touching love story with heartwarming moments.   Spanish   China   
1  A fast-paced thriller with intense action scenes.    Korean   China   
2  A fast-paced thriller with intense action scenes.    Korean   Japan   
3  An emotional journey exploring complex charact...  Japanese   Japan   
4  An imaginative world filled with magic and won...    Korean     USA   

   Budget_USD  BoxOffice_USD    Genre Production_Company Content_Rating  \
0    39979615      179936008  Romance         DreamWorks              R   
1  

In [3]:
print(dataset.shape)

(50000, 17)


In [4]:
print(dataset.isnull().sum())

Title                 0
Year                  0
Director              0
Duration              0
Rating                0
Votes                 0
Description           0
Language              0
Country               0
Budget_USD            0
BoxOffice_USD         0
Genre                 0
Production_Company    0
Content_Rating        0
Lead_Actor            0
Num_Awards            0
Critic_Reviews        0
dtype: int64


In [5]:
print(dataset.dtypes)

Title                  object
Year                    int64
Director               object
Duration                int64
Rating                float64
Votes                   int64
Description            object
Language               object
Country                object
Budget_USD              int64
BoxOffice_USD           int64
Genre                  object
Production_Company     object
Content_Rating         object
Lead_Actor             object
Num_Awards              int64
Critic_Reviews          int64
dtype: object


# 3. Preprocess the Dataset

In [24]:
# Step 1: Preprocess descriptions
stop_words = set(stopwords.words('english'))

def clean_text(text):
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc
        if token.is_alpha and token.lemma_ not in stop_words and len(token) > 2
    ]
    return " ".join(tokens)

dataset['Cleaned_Description'] = dataset['Description'].astype(str).apply(clean_text)

# Step 2: Split into features and labels
X = dataset['Cleaned_Description']
y = dataset['Genre']

In [25]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_df=0.99, min_df=2)

# Fit on training data only
X = vectorizer.fit_transform(X)

In [26]:
print(y.unique())

['Romance' 'Action' 'Drama' 'Fantasy' 'Comedy' 'Thriller' 'Horror']


In [27]:
# Tokenized docs for coherence
tokenized_docs = [doc.split() for doc in dataset.loc[y.index, 'Cleaned_Description']]

# Gensim dictionary and corpus
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(text) for text in tokenized_docs]

# 4. Reference Latent Dirichlet Allocation (LDA)

In [77]:
n_topics = 7
lda_ref = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=10,
    learning_method='online',
    random_state=42
)

start_time = datetime.now()
lda_ref.fit(X, y)
end_time = datetime.now()

execution_time = (end_time - start_time).microseconds
print(f"Execution Time: {execution_time} mcs")

Execution Time: 97765 mcs


In [78]:
def display_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic #{topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

display_topics(lda_ref, vectorizer.get_feature_names_out())


Topic #1:
love touching moment heartwarming story fill guarantee laughter comedy hearted

Topic #2:
journey character explore complex emotional fill light laughter hearted comedy

Topic #3:
action thriller intense pace scene fast fill guarantee light laughter

Topic #4:
evoke fear tale spine chilling dread fill light comedy laughter

Topic #5:
unexpected twist suspenseful plot light hearted comedy laughter guarantee fill

Topic #6:
fill suspenseful twist unexpected plot guarantee hearted laughter comedy light

Topic #7:
world wonder magic imaginative fill twist unexpected plot suspenseful light


In [79]:
# Extract topics from scikit-learn LDA model
def get_sklearn_topics(model, vectorizer, n_top_words=10):
    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for topic_weights in model.components_:
        top_features = [feature_names[i] for i in topic_weights.argsort()[:-n_top_words - 1:-1]]
        topics.append(top_features)
    return topics

topics = get_sklearn_topics(lda_ref, vectorizer)

In [80]:
# Compute coherence score
coherence_model = CoherenceModel(
    topics=topics,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print(f"Topic Coherence (c_v): {coherence_score:.4f}")

Topic Coherence (c_v): 0.2307


# 5. Custom Latent Dirichlet Allocation (LDA)

In [81]:
class CustomLatentDirichletAllocation:
    def __init__(self, n_topics=7, alpha=0.1, beta=0.01, n_iter=100):
        self.n_topics = n_topics
        self.alpha = alpha
        self.beta = beta
        self.n_iter = n_iter

    def fit(self, corpus, dictionary):
        self.dictionary = dictionary
        self.V = len(dictionary)
        self.D = len(corpus)
        self.Z = []  # topic assignments
        self.n_dk = np.zeros((self.D, self.n_topics)) + self.alpha  # doc-topic
        self.n_kw = np.zeros((self.n_topics, self.V)) + self.beta   # topic-word
        self.n_k = np.zeros(self.n_topics) + self.V * self.beta     # total topic counts

        # Randomly initialize topic assignments
        for d, doc in enumerate(corpus):
            z_current = []
            for (word_id, count) in doc:
                for _ in range(count):
                    topic = random.randint(0, self.n_topics - 1)
                    z_current.append(topic)
                    self.n_dk[d, topic] += 1
                    self.n_kw[topic, word_id] += 1
                    self.n_k[topic] += 1
            self.Z.append(z_current)

        # Begin Gibbs sampling
        for it in tqdm(range(self.n_iter), desc="Training Custom LDA"):
            for d, doc in enumerate(corpus):
                word_pos = 0
                for word_id, count in doc:
                    for _ in range(count):
                        topic = self.Z[d][word_pos]

                        # Decrease counts
                        self.n_dk[d, topic] -= 1
                        self.n_kw[topic, word_id] -= 1
                        self.n_k[topic] -= 1

                        # Sample new topic
                        p_z = (self.n_kw[:, word_id] / self.n_k) * (self.n_dk[d])
                        p_z /= np.sum(p_z)
                        new_topic = np.random.choice(self.n_topics, p=p_z)

                        # Update
                        self.Z[d][word_pos] = new_topic
                        self.n_dk[d, new_topic] += 1
                        self.n_kw[new_topic, word_id] += 1
                        self.n_k[new_topic] += 1

                        word_pos += 1

    def get_topics(self, top_n=10):
        topics = []
        for k in range(self.n_topics):
            top_word_ids = self.n_kw[k].argsort()[::-1][:top_n]
            topic_words = [self.dictionary[i] for i in top_word_ids]
            topics.append(topic_words)
        return topics

In [88]:
n_topics = 7
lda_cus = CustomLatentDirichletAllocation(n_topics=n_topics, n_iter=100)

start_time = datetime.now()
lda_cus.fit(corpus, dictionary)
end_time = datetime.now()

execution_time = (end_time - start_time).microseconds
print(f"\nExecution Time: {execution_time} mcs")

Training Custom LDA: 100%|██████████| 100/100 [15:25<00:00,  9.25s/it]


Execution Time: 847394 mcs





In [89]:
custom_topics = lda_cus.get_topics(top_n=10)

# Display topics
for i, topic in enumerate(custom_topics, 1):
    print(f"\nTopic #{i}: {' '.join(topic)}")


Topic #1: laughter light comedy guarantee hearted fill magic wonder world imaginative

Topic #2: fast intense scene thriller pace action fill touching story spine

Topic #3: tale spine fear evoke dread chilling love moment story touching

Topic #4: journey explore emotional complex character heartwarming love moment story touching

Topic #5: fill unexpected twist suspenseful plot intense emotional complex character thriller

Topic #6: action pace thriller scene intense fast fill touching story spine

Topic #7: imaginative world fill magic wonder spine tale chilling dread evoke


In [91]:
coherence_custom = CoherenceModel(
    topics=custom_topics,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'
).get_coherence()

print(f"\nCustom LDA Topic Coherence (c_v): {coherence_custom:.4f}")


Custom LDA Topic Coherence (c_v): 0.2269
