In [1]:
import pandas as pd
import numpy as np


import string                                                             #all requried libraries

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [2]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df=pd.read_csv('movie_genre.csv')
df.head()

Unnamed: 0,Title,Year,Director,Duration,Rating,Votes,Description,Language,Country,Budget_USD,BoxOffice_USD,Genre,Production_Company,Content_Rating,Lead_Actor,Num_Awards,Critic_Reviews
0,Winds of Fate 4,1980,R. Lee,167,4.1,182425,A touching love story with heartwarming moments.,Spanish,China,39979615,179936008,Romance,DreamWorks,R,Kangana Ranaut,8,229
1,Firestorm 11,2014,S. Chen,166,4.1,449351,A fast-paced thriller with intense action scenes.,Korean,China,116404774,802121619,Action,Netflix,R,Kangana Ranaut,20,466
2,Silent Echo 2,2016,A. Khan,170,4.1,363328,A fast-paced thriller with intense action scenes.,Korean,Japan,166261330,225526871,Action,Pixar,PG,Amitabh Bachchan,16,539
3,City Lights 4,1982,L. Zhang,170,9.9,62371,An emotional journey exploring complex charact...,Japanese,Japan,28861315,69813738,Drama,Netflix,NC-17,Natalie Portman,15,606
4,Broken Truth 1,1990,L. Zhang,91,5.3,4600,An imaginative world filled with magic and won...,Korean,USA,43890403,375136716,Fantasy,Studio Ghibli,PG,Chris Evans,6,330


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               50000 non-null  object 
 1   Year                50000 non-null  int64  
 2   Director            50000 non-null  object 
 3   Duration            50000 non-null  int64  
 4   Rating              50000 non-null  float64
 5   Votes               50000 non-null  int64  
 6   Description         50000 non-null  object 
 7   Language            50000 non-null  object 
 8   Country             50000 non-null  object 
 9   Budget_USD          50000 non-null  int64  
 10  BoxOffice_USD       50000 non-null  int64  
 11  Genre               50000 non-null  object 
 12  Production_Company  50000 non-null  object 
 13  Content_Rating      50000 non-null  object 
 14  Lead_Actor          50000 non-null  object 
 15  Num_Awards          50000 non-null  int64  
 16  Crit

In [5]:
df.isnull().sum

In [6]:
df['Genre'].nunique()

7

In [7]:
df['Genre'].value_counts()

Unnamed: 0_level_0,count
Genre,Unnamed: 1_level_1
Horror,7260
Drama,7187
Romance,7169
Thriller,7118
Action,7107
Fantasy,7100
Comedy,7059


In [8]:
from nltk.tokenize import wordpunct_tokenize

stop_words = set(stopwords.words('english'))

def preprocess_text(text):

  text = text.lower()
  text = text.translate(str.maketrans('','', string.punctuation))
  tokens=wordpunct_tokenize(text)
  tokens=[word for word in tokens if word not in stop_words]
  return " ".join(tokens)

In [9]:
df['clean_text'] = df['Description']

df[['Description', 'clean_text']].head()

Unnamed: 0,Description,clean_text
0,A touching love story with heartwarming moments.,A touching love story with heartwarming moments.
1,A fast-paced thriller with intense action scenes.,A fast-paced thriller with intense action scenes.
2,A fast-paced thriller with intense action scenes.,A fast-paced thriller with intense action scenes.
3,An emotional journey exploring complex charact...,An emotional journey exploring complex charact...
4,An imaginative world filled with magic and won...,An imaginative world filled with magic and won...


In [10]:
X=df['clean_text']
y =df['Genre']

tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
X_tfidf, y, test_size=0.2, random_state=42)



In [12]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [13]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))



Accuracy: 1.0
              precision    recall  f1-score   support

      Action       1.00      1.00      1.00      1399
      Comedy       1.00      1.00      1.00      1447
       Drama       1.00      1.00      1.00      1448
     Fantasy       1.00      1.00      1.00      1430
      Horror       1.00      1.00      1.00      1485
     Romance       1.00      1.00      1.00      1429
    Thriller       1.00      1.00      1.00      1362

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



In [14]:
def predict_genre(text):
    text = preprocess_text(text)
    text_vector = tfidf.transform([text])
    prediction = model.predict(text_vector)
    return prediction[0]


In [15]:
sample_plot = "A group of friends go on a thrilling adventure filled with danger and excitement."
print("Predicted Genre:", predict_genre(sample_plot))


Predicted Genre: Thriller
