**NAME : ABHINAV KRISHNA B**


**LOAD LIBRARIES AND READ THE TEXT DATASET INTO PANDAS DATAFRAMES**

In [None]:
import pandas as pd

# Raw dataset files on the mounted Google Drive
train_file_path = '/content/drive/MyDrive/Genre Classification Dataset/train_data.txt'
test_file_path = '/content/drive/MyDrive/Genre Classification Dataset/test_data.txt'
test_solution_file_path = '/content/drive/MyDrive/Genre Classification Dataset/test_data_solution.txt'

# Every file uses ' ::: ' between fields and has no header row, so the column
# names are supplied explicitly; the reader options are shared by all three loads
read_options = dict(sep=' ::: ', engine='python')
labelled_columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
unlabelled_columns = ['ID', 'TITLE', 'DESCRIPTION']

train_data = pd.read_csv(train_file_path, names=labelled_columns, **read_options)
test_data = pd.read_csv(test_file_path, names=unlabelled_columns, **read_options)
test_data_solution = pd.read_csv(test_solution_file_path, names=labelled_columns, **read_options)

# Preview the first rows of each frame, in load order
for frame in (train_data, test_data, test_data_solution):
    print(frame.head())

   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
   ID                        TITLE  \
0   1         Edgar's Lunch (1998)   
1   2     La guerra de papá (1977)   
2   3  Off the Beaten Track (2010)   
3   4       Meu Amigo Hindu (2015)   
4   5            Er nu zhai (1955)   

                                         DESCRIPTION  
0  L.R. Brane loves his life - his car, his apart...  
1  Spain, Ma

**IMPORT NLTK LIBRARY FOR PREPROCESSING OF THE MOVIE SYNOPSIS**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download the NLTK resources used by the preprocessing below: the tokenizer
# models for word_tokenize and the stopword lists.
# NLTK >= 3.8.2 loads the tokenizer from 'punkt_tab' rather than 'punkt', so
# both are requested to keep the notebook runnable on a fresh kernel with either
# version. NOTE(review): on older NLTK releases the unknown 'punkt_tab' id is
# expected to only log an error message and continue - confirm on your version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# English stopword set, filled on first use by _english_stopwords().
# Re-running this cell resets it, which only costs one extra load.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() was previously called once per token; loading it
        # a single time and using a set makes each membership test O(1).
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


# Function to preprocess text
def preprocess_text(text):
    """Normalize a synopsis into a space-separated string of content tokens.

    Steps, in order: tokenize with ``word_tokenize``, lower-case every token,
    keep only tokens for which ``str.isalnum()`` is true (this drops
    punctuation as well as tokens mixing letters with symbols), and remove
    English stopwords.

    Args:
        text: The raw text to clean. ``None`` and float NaN (how pandas
            represents a missing value) are treated as empty text.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        remains or the input is missing.
    """
    # Missing values would otherwise crash inside the tokenizer.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words = _english_stopwords()
    lowered = (token.lower() for token in word_tokenize(text))
    kept = [token for token in lowered if token.isalnum() and token not in stop_words]
    # Join tokens back into a single string
    return ' '.join(kept)

# Store a cleaned version of every synopsis next to the raw text, handling the
# train, test and test-solution frames in that order
for frame in (train_data, test_data, test_data_solution):
    frame['DESCRIPTION_CLEANED'] = frame['DESCRIPTION'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**TF-IDF VECTORIZER IS APPLIED FOR FEATURE EXTRACTION FROM THE TEXT**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer with a capped vocabulary size
max_vocabulary_size = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=max_vocabulary_size)

# The vectorizer is fitted on the training synopses only; the test synopses
# are then transformed with that same fitted vectorizer
train_corpus = train_data['DESCRIPTION_CLEANED']
test_corpus = test_data['DESCRIPTION_CLEANED']
X_train_tfidf = tfidf_vectorizer.fit_transform(train_corpus)
X_test_tfidf = tfidf_vectorizer.transform(test_corpus)

# Genre labels used as the training target
y_train = train_data['GENRE']


**MODEL TRAINING PHASE (LOGISTIC REGRESSION, SVM, NAIVE BAYES)**

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# One estimator per model family compared in this notebook
logistic_regression = LogisticRegression(max_iter=1000)
naive_bayes = MultinomialNB()
svm = SVC(kernel='linear')

# Fit every classifier on the same TF-IDF features and genre labels
for classifier in (logistic_regression, naive_bayes):
    classifier.fit(X_train_tfidf, y_train)
# Kept as the final bare expression so the cell still echoes the fitted SVM
svm.fit(X_train_tfidf, y_train)


**MODEL TESTING AND HYPERPARAMETER TUNING PHASE**

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predictions are computed from test_data but scored against the labels in
# test_data_solution, so the two frames must list the same IDs in the same order.
assert test_data['ID'].tolist() == test_data_solution['ID'].tolist(), \
    "test_data and test_data_solution rows are not aligned by ID"

# Predict the genre of every test synopsis with each trained classifier
y_pred_lr = logistic_regression.predict(X_test_tfidf)
y_pred_nb = naive_bayes.predict(X_test_tfidf)
y_pred_svm = svm.predict(X_test_tfidf)

# Display name -> (column that stores the predictions, predicted labels)
predictions = {
    'Logistic Regression': ('PREDICTED_GENRE_LR', y_pred_lr),
    'Naive Bayes': ('PREDICTED_GENRE_NB', y_pred_nb),
    'SVM': ('PREDICTED_GENRE_SVM', y_pred_svm),
}

# Evaluate the models
y_true = test_data_solution['GENRE']
for model_name, (column, y_pred) in predictions.items():
    # Keep the predictions next to the ground truth for later inspection
    test_data_solution[column] = y_pred
    print(f"{model_name} Accuracy:", accuracy_score(y_true, y_pred))
    # Some genres are never predicted; zero_division=0 reports their
    # precision as 0.0 (the same value the default produces) without emitting
    # an UndefinedMetricWarning for each report.
    print(f"{model_name} Classification Report:\n",
          classification_report(y_true, y_pred, zero_division=0))


Logistic Regression Accuracy: 0.5844464944649447


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression Classification Report:
               precision    recall  f1-score   support

      action       0.48      0.28      0.36      1314
       adult       0.58      0.23      0.33       590
   adventure       0.56      0.15      0.24       775
   animation       0.49      0.06      0.11       498
   biography       0.00      0.00      0.00       264
      comedy       0.52      0.58      0.55      7446
       crime       0.31      0.03      0.06       505
 documentary       0.68      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.47      0.08      0.14       783
     fantasy       0.53      0.05      0.10       322
   game-show       0.90      0.52      0.66       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.56      0.60      2204
       music       0.64      0.43      0.51       731
     musical       0.24      0.02      0.03       276
     mystery       0.38      0.01    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes Classification Report:
               precision    recall  f1-score   support

      action       0.57      0.10      0.17      1314
       adult       0.45      0.05      0.09       590
   adventure       0.75      0.06      0.12       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.51      0.42      0.46      7446
       crime       0.00      0.00      0.00       505
 documentary       0.57      0.87      0.69     13096
       drama       0.46      0.83      0.59     13612
      family       1.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.97      0.31      0.46       193
     history       0.00      0.00      0.00       243
      horror       0.70      0.34      0.46      2204
       music       0.75      0.15      0.25       731
     musical       0.00      0.00      0.00       276
     mystery       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Classification Report:
               precision    recall  f1-score   support

      action       0.43      0.35      0.39      1314
       adult       0.57      0.34      0.43       590
   adventure       0.48      0.20      0.28       775
   animation       0.43      0.14      0.21       498
   biography       0.00      0.00      0.00       264
      comedy       0.52      0.58      0.55      7446
       crime       0.24      0.05      0.08       505
 documentary       0.68      0.84      0.75     13096
       drama       0.55      0.75      0.64     13612
      family       0.47      0.10      0.16       783
     fantasy       0.42      0.09      0.15       322
   game-show       0.81      0.62      0.70       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.58      0.61      2204
       music       0.64      0.50      0.56       731
     musical       0.26      0.03      0.06       276
     mystery       0.35      0.02      0.04       318

  _warn_prf(average, modifier, msg_start, len(result))


**REALTIME PREDICTION USING THE BEST FIT MODEL**

In [None]:
def predict_genre(plot, model, vectorizer):
    """Predict the genre of a single plot synopsis.

    Args:
        plot: Raw synopsis text.
        model: Fitted classifier; its ``predict`` method is called on the features.
        vectorizer: Fitted vectorizer; its ``transform`` method is called on a
            one-element list holding the cleaned synopsis.

    Returns:
        The first (and only) element of the model's prediction for the synopsis.
    """
    # Clean the text the same way as the training data, then vectorize it as a
    # one-document batch
    features = vectorizer.transform([preprocess_text(plot)])
    return model.predict(features)[0]

# Example synopsis to classify
plot = "Things start to take an awry turn for a mild-mannered cafe owner, who gets caught in the crosshairs of a drug cartel."

# Classify it with the trained logistic regression model and the fitted
# TF-IDF vectorizer, then show the result
predicted_genre_lr = predict_genre(
    plot=plot,
    model=logistic_regression,
    vectorizer=tfidf_vectorizer,
)
print("Predicted Genre (Logistic Regression):", predicted_genre_lr)

Predicted Genre (Logistic Regression): drama


In [None]:
# Second example synopsis to classify
plot = "A group of astronauts embark on a journey to the outer reaches of the solar system to find a new habitable planet after Earth becomes uninhabitable."

# Classify it with the trained logistic regression model and the fitted
# TF-IDF vectorizer, then show the result
predicted_genre_lr = predict_genre(
    plot=plot,
    model=logistic_regression,
    vectorizer=tfidf_vectorizer,
)
print("Predicted Genre (Logistic Regression):", predicted_genre_lr)

Predicted Genre (Logistic Regression): sci-fi
