In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import hamming_loss
import matplotlib

In [93]:
# Load the movie table; per the preview below it holds a title, a synopsis and
# one 0/1 indicator column per genre.
movies_df = pd.read_csv('data/movies_df.csv')

In [94]:
# Preview the first two rows to check the column layout
movies_df.head(2)

Unnamed: 0,title,synopsis,Adventure,Action,Romance,Thriller,Drama,Horror,Science Fiction,Comedy
0,Expend4bles,Armed with every weapon they can get their han...,1,1,0,1,0,0,0,0
1,Mission: Impossible - Dead Reckoning Part One,Ethan Hunt and his IMF team embark on their mo...,0,1,0,1,0,0,0,0


In [95]:
# (rows, columns) of the loaded frame
movies_df.shape

(30329, 10)

In [96]:
# NOTE(review): repeats the head(2) preview shown two cells earlier; the frame is unchanged in between
movies_df.head(2)

Unnamed: 0,title,synopsis,Adventure,Action,Romance,Thriller,Drama,Horror,Science Fiction,Comedy
0,Expend4bles,Armed with every weapon they can get their han...,1,1,0,1,0,0,0,0
1,Mission: Impossible - Dead Reckoning Part One,Ethan Hunt and his IMF team embark on their mo...,0,1,0,1,0,0,0,0


In [97]:
# Number of movies tagged with each genre.
# The text columns are excluded by name rather than by position, so the result
# does not depend on column order and stays correct if this cell is re-run
# after the derived 'labels' / 'string_labels' columns have been added.
movies_df.drop(columns=['title', 'synopsis', 'labels', 'string_labels'], errors='ignore').sum()

Adventure           6085
Action              7838
Romance             6286
Thriller            7557
Drama              11413
Horror              6605
Science Fiction     5937
Comedy              9025
dtype: int64

In [98]:
def genres_to_list(row, non_genre_cols=('title', 'synopsis')):
    """Return the names of the genre columns that are flagged with 1 in ``row``.

    Parameters
    ----------
    row : pandas.Series
        One row of the movies frame, as passed by ``DataFrame.apply(..., axis=1)``;
        its index holds the column names.
    non_genre_cols : iterable of str, optional
        Column names that are never treated as genres and are skipped.

    Returns
    -------
    list of str
        The column names, in row order, whose value equals 1.
    """
    skipped = set(non_genre_cols)
    # Take the column names from the row itself instead of the global
    # ``movies_df`` so the function only depends on its argument.
    return [col for col in row.index if col not in skipped and row[col] == 1]

# Build a per-movie list of genre names by calling genres_to_list on every row
# (axis=1) and store it in a new 'labels' column.
movies_df['labels'] = movies_df.apply(genres_to_list, axis=1)

In [99]:
# String form of each label list, e.g. "['Action', 'Thriller']".
# NOTE(review): 'string_labels' is not referenced again in the visible part of the notebook.
movies_df['string_labels'] = movies_df['labels'].astype(str)

In [100]:
# Spot-check the generated label lists for the first two movies
movies_df['labels'].head(2)

0    [Adventure, Action, Thriller]
1               [Action, Thriller]
Name: labels, dtype: object

In [101]:
# Preview the frame again, now including the 'labels' and 'string_labels' columns
movies_df.head(2)

Unnamed: 0,title,synopsis,Adventure,Action,Romance,Thriller,Drama,Horror,Science Fiction,Comedy,labels,string_labels
0,Expend4bles,Armed with every weapon they can get their han...,1,1,0,1,0,0,0,0,"[Adventure, Action, Thriller]","['Adventure', 'Action', 'Thriller']"
1,Mission: Impossible - Dead Reckoning Part One,Ethan Hunt and his IMF team embark on their mo...,0,1,0,1,0,0,0,0,"[Action, Thriller]","['Action', 'Thriller']"


# Model building

In [102]:
# Hold out 20% of the movies for evaluation; the seed keeps the split repeatable.
# Features are the raw synopsis strings, targets are the per-movie genre lists.
X_train, X_test, y_train, y_test = train_test_split(
    movies_df['synopsis'],
    movies_df['labels'],
    test_size=0.2,
    random_state=42,
)

In [103]:
# Logistic-regression pipeline: TF-IDF text features (vocabulary capped at
# 15000 terms) feeding a one-vs-rest wrapper around LogisticRegression.
pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=15000)),
    (
        'clf',
        OneVsRestClassifier(
            LogisticRegression(solver='lbfgs', max_iter=500)
        ),
    ),
])

In [104]:
# Random-forest pipeline: same TF-IDF features as the logistic-regression
# pipeline, with a one-vs-rest wrapper around a 200-tree RandomForestClassifier.
# random_state is fixed (same seed as the train/test split) so the forest, and
# therefore the reported metrics, are reproducible across runs.
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=15000)),
    ('clf', OneVsRestClassifier(
        RandomForestClassifier(n_jobs=-1, n_estimators=200, random_state=42)
    ))
])

In [105]:
# Encode the per-movie genre lists as a binary indicator matrix.
# The binarizer is fitted on the training labels only and then reused,
# unchanged, to encode the test labels.
# NOTE(review): y_train / y_test are overwritten in place, so re-running this
# cell without first re-running the split cell would feed the already
# binarized arrays back into the binarizer.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [106]:
# Fit the logistic-regression pipeline on the raw training synopses and the
# binarized genre labels
pipeline_lr.fit(X_train, y_train)

In [107]:
# Fit the random-forest pipeline on the same training data
pipeline_rf.fit(X_train, y_train)

In [108]:
# Predict on the held-out synopses; each pipeline takes the raw text and
# applies its own TF-IDF step before classifying
y_pred_lr = pipeline_lr.predict(X_test)
y_pred_rf = pipeline_rf.predict(X_test)

In [109]:
# Hamming loss of each model on the test set (lower is better)
lr_loss = hamming_loss(y_test, y_pred_lr)
print(f"Logistic Regression Hamming Loss: {lr_loss}")

rf_loss = hamming_loss(y_test, y_pred_rf)
print(f"Random Forest Hamming Loss: {rf_loss}")

Logistic Regression Hamming Loss: 0.17931915595120343
Random Forest Hamming Loss: 0.2067466205077481


In [110]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [111]:
# Weighted-average precision / recall / F1 for the logistic-regression predictions.
# Labels are left-padded to a common width so the printed scores line up.
for metric_label, metric_fn in (
    ('Precision score:', precision_score),
    ('Recall score:', recall_score),
    ('F1 score:', f1_score),
):
    metric_value = metric_fn(y_test, y_pred_lr, average="weighted")
    print(f'Logistic Regression {metric_label:<16} {metric_value:.2f}')

Logistic Regression Precision score: 0.75
Logistic Regression Recall score:    0.43
Logistic Regression F1 score:        0.54


In [112]:
# Weighted-average precision / recall / F1 for the random-forest predictions.
# Labels are left-padded to a common width so the printed scores line up.
for metric_label, metric_fn in (
    ('Precision score:', precision_score),
    ('Recall score:', recall_score),
    ('F1 score:', f1_score),
):
    metric_value = metric_fn(y_test, y_pred_rf, average="weighted")
    print(f'Random Forest {metric_label:<16} {metric_value:.2f}')

Random Forest Precision score: 0.79
Random Forest Recall score:    0.24
Random Forest F1 score:        0.36


# Logistic Regression predictions on new synopses

In [113]:
# Hand-written synopses used to sanity-check the logistic-regression model
new_synopses = [
    "John and Mary move into a new house in Mary's old family house. They don't know the horror hiding behind the door..",
    "Paris is the background to the encounter between Luca and Rossana, both recently divorced.",
    "Mordrion is a planet in galaxy 43o, an alien invasion is the occasion to fight for freedom"
]

# The pipeline runs its own TF-IDF step, so the raw text is passed in directly
predicted_classes = pipeline_lr.predict(new_synopses)

# Map each row of the binary prediction matrix back to genre names
predicted_labels = mlb.inverse_transform(predicted_classes)

# One line per synopsis. A synopsis for which no genre is predicted comes back
# as an empty tuple; show "(none)" for it instead of a blank genre list.
for synopsis_no, labels in enumerate(predicted_labels, start=1):
    genres_text = ', '.join(labels) if labels else '(none)'
    print(f"Synopsis {synopsis_no} Predicted Genres: {genres_text}")

Synopsis 1 Predicted Genres: Horror
Synopsis 2 Predicted Genres: Comedy, Drama
Synopsis 3 Predicted Genres: Action, Science Fiction


# Random Forest predictions on new synopses

In [114]:
# Hand-written synopses used to sanity-check the random-forest model
new_synopses = [
    "John and Mary move into a new house in Mary's old family house. They don't know the horror hiding behind the door..",
    "Paris is the background to the encounter between Luca and Rossana, both recently divorced.",
    "Mordrion is a planet in galaxy 43o, an alien invasion is the occasion to fight for freedom"
]

# The pipeline runs its own TF-IDF step, so the raw text is passed in directly
predicted_classes = pipeline_rf.predict(new_synopses)

# Map each row of the binary prediction matrix back to genre names
predicted_labels = mlb.inverse_transform(predicted_classes)

# One line per synopsis. A synopsis for which no genre is predicted comes back
# as an empty tuple; show "(none)" for it instead of a blank genre list.
for synopsis_no, labels in enumerate(predicted_labels, start=1):
    genres_text = ', '.join(labels) if labels else '(none)'
    print(f"Synopsis {synopsis_no} Predicted Genres: {genres_text}")

Synopsis 1 Predicted Genres: Horror
Synopsis 2 Predicted Genres: Drama
Synopsis 3 Predicted Genres: Science Fiction
