In [1]:
import ast

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

In [2]:
import os
import pandas as pd

# Prepared movie dataset, addressed relative to the notebook's working directory.
csv_path = os.path.join("..", "data", "movies_prepared.csv")

# Guard first: report a missing file; otherwise load the CSV and show it.
# Note that `df` is only bound when the file exists.
if not os.path.exists(csv_path):
    print(f"File not found: {csv_path}")
else:
    df = pd.read_csv(csv_path)
    print(df)

        ID                           Movie Name  Rating  Runtime  \
0        1             The Shawshank Redemption     9.3  142 min   
1        2                        The Godfather     9.2  175 min   
2        3  Ramayana: The Legend of Prince Rama     9.2  135 min   
3        4                      The Chaos Class     9.2   87 min   
4        5                                Daman     9.1  121 min   
...    ...                                  ...     ...      ...   
9994  9995                            Yogi Bear     4.6   80 min   
9995  9996                                Troll     4.6   82 min   
9996  9997                Paranormal Activity 4     4.6   88 min   
9997  9998                          Dark Crimes     4.6   92 min   
9998  9999                          The Pyramid     4.6   89 min   

                                     Genre  Metascore  \
0                                ['Drama']       82.0   
1                       ['Crime', 'Drama']      100.0   
2     ['Anim

In [5]:
# --- Features ---------------------------------------------------------------
# One document per movie: title followed by plot. Missing values are replaced
# with "" first; otherwise a NaN on either side would make the concatenated
# document NaN, and TfidfVectorizer refuses NaN documents.
X_text = df["Movie Name"].fillna("") + " " + df["Plot"].fillna("")

# --- Targets ----------------------------------------------------------------
# Load the multi-label binarizer (it is used with transform() only, so it is
# presumably already fitted on the genre vocabulary).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
mlb_path = os.path.join("..", "data", "genre_mlb.pkl")
mlb = joblib.load(mlb_path)

# "Genre" is stored in the CSV as the string form of a list, e.g. "['Crime', 'Drama']".
# ast.literal_eval parses Python literals only, so unlike eval() it cannot run
# code embedded in the data file.
y = mlb.transform(df["Genre"].apply(ast.literal_eval))

# --- TF-IDF -----------------------------------------------------------------
# Vocabulary capped at 5000 terms, English stop words removed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(X_text)

# --- Train model ------------------------------------------------------------
# One independent logistic-regression classifier per genre column of y.
clf = MultiOutputClassifier(LogisticRegression(max_iter=1000))
clf.fit(X, y)

# --- Persist artifacts ------------------------------------------------------
# NOTE(review): "models" is resolved against the current working directory,
# while the inputs are read from "../data" — confirm this is the intended location.
os.makedirs("models", exist_ok=True)

# Save the classifier together with the fitted vectorizer it depends on.
joblib.dump(clf, "models/genre_mood_model.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")

print("✅ Model trained and saved.")

✅ Model trained and saved.


In [6]:
# Demo: infer genres for a movie description that is not part of the dataset.
test_movie_name = "A Space Adventure"
test_plot = "A group of astronauts embark on a journey to explore a mysterious planet beyond our solar system."

# Training documents were built as "<title> <plot>", so the query uses the same layout.
test_text = " ".join((test_movie_name, test_plot))

# Vectorize with the TF-IDF vectorizer fitted during training (transform only, no refit).
X_test = vectorizer.transform([test_text])

# Run the classifier, then map its output back to genre names via the binarizer.
y_pred = clf.predict(X_test)
predicted_genres = mlb.inverse_transform(y_pred)

print("Predicted genres:", predicted_genres)

Predicted genres: [('Adventure',)]
