In [61]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [62]:
df = pd.read_csv("movie_genre.csv")   # change filename if needed
df.head()


Unnamed: 0,Title,Year,Director,Duration,Rating,Votes,Description,Language,Country,Budget_USD,BoxOffice_USD,Genre,Production_Company,Content_Rating,Lead_Actor,Num_Awards,Critic_Reviews
0,Winds of Fate 4,1980,R. Lee,167,4.1,182425,A touching love story with heartwarming moments.,Spanish,China,39979615,179936008,Romance,DreamWorks,R,Kangana Ranaut,8,229
1,Firestorm 11,2014,S. Chen,166,4.1,449351,A fast-paced thriller with intense action scenes.,Korean,China,116404774,802121619,Action,Netflix,R,Kangana Ranaut,20,466
2,Silent Echo 2,2016,A. Khan,170,4.1,363328,A fast-paced thriller with intense action scenes.,Korean,Japan,166261330,225526871,Action,Pixar,PG,Amitabh Bachchan,16,539
3,City Lights 4,1982,L. Zhang,170,9.9,62371,An emotional journey exploring complex charact...,Japanese,Japan,28861315,69813738,Drama,Netflix,NC-17,Natalie Portman,15,606
4,Broken Truth 1,1990,L. Zhang,91,5.3,4600,An imaginative world filled with magic and won...,Korean,USA,43890403,375136716,Fantasy,Studio Ghibli,PG,Chris Evans,6,330


In [63]:
print(df.shape)
print(df['Genre'].value_counts())


(50000, 17)
Genre
Horror      7260
Drama       7187
Romance     7169
Thriller    7118
Action      7107
Fantasy     7100
Comedy      7059
Name: count, dtype: int64


In [64]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)


In [65]:
df['clean_plot'] = df['Description'].apply(preprocess_text)
df[['Description', 'clean_plot']].head()


Unnamed: 0,Description,clean_plot
0,A touching love story with heartwarming moments.,touching love story heartwarming moment
1,A fast-paced thriller with intense action scenes.,fastpaced thriller intense action scene
2,A fast-paced thriller with intense action scenes.,fastpaced thriller intense action scene
3,An emotional journey exploring complex charact...,emotional journey exploring complex character
4,An imaginative world filled with magic and won...,imaginative world filled magic wonder


In [66]:
X = df['clean_plot']
y = df['Genre']


In [67]:
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [68]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)


In [69]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [70]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

      Action       1.00      1.00      1.00      1421
      Comedy       1.00      1.00      1.00      1412
       Drama       1.00      1.00      1.00      1437
     Fantasy       1.00      1.00      1.00      1420
      Horror       1.00      1.00      1.00      1452
     Romance       1.00      1.00      1.00      1434
    Thriller       1.00      1.00      1.00      1424

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



In [71]:
sample = "A fast-paced thriller with intense action scenes"
sample_clean = preprocess_text(sample)
sample_vec = vectorizer.transform([sample_clean])

print("Predicted Genre:", model.predict(sample_vec)[0])


Predicted Genre: Action
