In [79]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.metrics import precision_recall_fscore_support

In [2]:
movies_df = pd.read_csv("wiki_movie_plots_deduped.csv")

In [68]:
plots = movies_df["Plot"].tolist()

titles = movies_df["Title"].tolist()


X_train, X_test, y_train, y_test = train_test_split(plots, titles, test_size=0.2, random_state=42)

In [71]:
def remove_the_punc_and_spaces(text):
    tmp = re.sub(r'[^\w\s]', '', text)
    tmp = re.sub(r'\s+', ' ', tmp)
    return tmp

def preprocess_text(text):
    text = text.lower()
    return remove_the_punc_and_spaces(text)

preprocessed_plots = [preprocess_text(plot) for plot in X_train]
preprocessed_titles = [preprocess_text(title) for title in y_train]

In [72]:
vectorizer_tf_idf = TfidfVectorizer()

plot_vectors = vectorizer_tf_idf.fit_transform(preprocessed_plots)
title_vectors = vectorizer_tf_idf.transform(preprocessed_titles)

In [73]:
cosine_similarity_matrix = cosine_similarity(plot_vectors, title_vectors)

In [86]:
def retrieve_movie_titles(input_plot, num_top=5):
    pre_processed_plot = preprocess_text(input_plot)
    plot_vector = vectorizer_tf_idf.transform([pre_processed_plot])

    # find the similarity between plot description and all movie titles.
    similarity_scores = cosine_similarity(plot_vector, title_vectors)[0]

    sorted_titles_and_scores = sorted(zip(titles, similarity_scores), key=lambda x: x[1], reverse=True)

    top_5_movies = [title for title, score in sorted_titles_and_scores[:num_top]]

    return top_5_movies

In [92]:
def evaluate_model(num_top=5):
    true_labels = y_test

    prediction_labels = []

    for plot in X_test:
        prediction_labels.append(retrieve_movie_titles(plot, num_top=num_top))

    correct_predictions = 0
    for true_label, predicted_labels in zip(true_labels, prediction_labels):
        if true_label in predicted_labels:
            correct_predictions += 1

    return correct_predictions / len(true_labels)


In [109]:
n = 5
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 5 similarity scores: 1.4330753797649758%


In [110]:
n = 10
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 10 similarity scores: 2.8661507595299516%


In [111]:
n = 15
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 15 similarity scores: 8.598452278589853%


In [112]:
n = 20
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 20 similarity scores: 8.598452278589853%


In [113]:
n = 30
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 30 similarity scores: 17.196904557179707%


In [114]:
n = 50
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 50 similarity scores: 20.063055316709658%


In [115]:
n = 75
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 75 similarity scores: 34.393809114359414%


In [116]:
n = 100
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 100 similarity scores: 40.126110633419316%


In [117]:
n = 120
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 120 similarity scores: 47.29148753224419%


In [118]:
n = 150
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 150 similarity scores: 55.88993981083406%


In [119]:
n = 200
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 200 similarity scores: 73.08684436801376%


In [120]:
n = 250
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 250 similarity scores: 88.85067354542848%


In [121]:
n = 300
print(f"Percentage of correct predictions based on top {n} similarity scores: {evaluate_model(n) * 100}%")

Percentage of correct predictions based on top 300 similarity scores: 97.44912582401835%
