In [4]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from imblearn.over_sampling import RandomOverSampler
from scipy.sparse import vstack
import numpy as np
import warnings

# Silence every warning category (UserWarnings included) for this session
warnings.simplefilter(action="ignore", category=Warning)

# Step 1: Load the Dataset
# The CSV location is kept in one named constant so it is easy to spot.
DATASET_PATH = r"D:\CODSOFT\extracted_data\tmdb_5000_movies.csv"
data = pd.read_csv(DATASET_PATH)

# Step 2: Preprocess the 'genres' column
def extract_genres(genre_str):
    """Return the genre names encoded in one cell of the 'genres' column.

    The cell is parsed with ``ast.literal_eval`` and is expected to evaluate
    to a non-empty list whose items can be indexed with ``'name'``.

    Args:
        genre_str: Raw cell value; normally a string holding a Python-literal
            list of dicts, but any value is tolerated.

    Returns:
        A list with the ``'name'`` entry of every item, or ``["Unknown"]``
        when the cell cannot be parsed, is empty, is not a list, or contains
        an item without a ``'name'`` entry.
    """
    # Only the parsing step is guarded here, and only against the errors
    # literal_eval raises for malformed or non-string input, so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        genres_list = ast.literal_eval(genre_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return ["Unknown"]

    if not genres_list or not isinstance(genres_list, list):
        return ["Unknown"]

    # An item that is not a mapping (TypeError) or lacks 'name' (KeyError)
    # marks the whole cell as unusable, matching the previous fallback.
    try:
        return [genre['name'] for genre in genres_list]
    except (KeyError, TypeError):
        return ["Unknown"]

# Parse every raw 'genres' cell into a list of genre names
genre_lists = data['genres'].apply(extract_genres)
data['processed_genres'] = genre_lists

# Step 3: Prepare Features (X) and Multi-Label Targets (y)
# Missing overviews get a placeholder sentence before lower-casing.
overviews = data['overview'].fillna("No overview available")
X = overviews.str.lower()

# MultiLabelBinarizer turns the per-movie genre lists into a binary
# indicator matrix with one column per entry of mlb.classes_.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['processed_genres'])

# Step 4: Split the Data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Convert Text to TF-IDF Features
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that fitted vectorizer.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 6: Fit one Logistic Regression classifier per genre column
models, reports = {}, {}

for col, genre in enumerate(mlb.classes_):
    print(f"Training model for genre: {genre}")

    # Randomly oversample the training rows against this genre's 0/1 target
    genre_train_labels = y_train[:, col]
    X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_resample(
        X_train_tfidf, genre_train_labels
    )

    # Fit the classifier on the resampled data and keep it under its genre
    models[genre] = LogisticRegression(max_iter=1000).fit(X_resampled, y_resampled)

    # Score the untouched test split; the report is stored as a dict, and
    # zero_division=1 is passed to avoid zero-division warnings.
    reports[genre] = classification_report(
        y_test[:, col],
        models[genre].predict(X_test_tfidf),
        output_dict=True,
        zero_division=1,
    )

# Step 7: Display Combined Classification Report for All Genres
# Map each genre name to its column in the binarized label matrix so the
# ground truth is sliced as a 1-D vector by integer index rather than as an
# (n, 1) column through a boolean mask.
genre_columns = {name: idx for idx, name in enumerate(mlb.classes_)}

# Only the genre names are needed here; the text report is rebuilt from the
# stored model's predictions on the test split.
for genre in reports:
    print(f"--- Classification report for genre: {genre} ---")
    y_true_genre = y_test[:, genre_columns[genre]]
    y_pred_genre = models[genre].predict(X_test_tfidf)
    print(classification_report(y_true_genre, y_pred_genre, zero_division=1))

# Step 8: Predict Genres for a New Plot
new_plot = ["A young wizard embarks on an adventure to defeat a dark lord."]
new_plot_tfidf = tfidf.transform(new_plot)

# Keep every genre whose classifier labels the plot as class 1
predicted_genres = [
    name
    for name, clf in models.items()
    if clf.predict(new_plot_tfidf)[0] == 1
]

print(f"Predicted Genre(s): {predicted_genres}")


Training model for genre: Action
Training model for genre: Adventure
Training model for genre: Animation
Training model for genre: Comedy
Training model for genre: Crime
Training model for genre: Documentary
Training model for genre: Drama
Training model for genre: Family
Training model for genre: Fantasy
Training model for genre: Foreign
Training model for genre: History
Training model for genre: Horror
Training model for genre: Music
Training model for genre: Mystery
Training model for genre: Romance
Training model for genre: Science Fiction
Training model for genre: TV Movie
Training model for genre: Thriller
Training model for genre: Unknown
Training model for genre: War
Training model for genre: Western
--- Classification report for genre: Action ---
              precision    recall  f1-score   support

           0       0.88      0.82      0.85       709
           1       0.57      0.69      0.63       252

    accuracy                           0.78       961
   macro avg    