In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

import json

In [None]:
# Data: TMDB 5000 Movie Dataset — https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
!wget -nc https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv

In [None]:
# Load the downloaded TMDB CSV into a DataFrame and preview the first rows.
df = pd.read_csv('tmdb_5000_movies.csv')
df.head()

We will look at information from the following columns:

In [None]:
# Keep only the text columns of interest (as an independent copy) and
# replace missing values with empty strings.
df_condensed = (
    df.loc[:, ['title', 'genres', 'keywords', 'overview']]
    .copy()
    .fillna('')
)

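First, write a function that uses `json` to parse a string-encoded list of dicts (a `str`) into an actual `list[dict]`, removing the whitespace inside each `name` so that multi-word names become single tokens: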

In [None]:
# Scratch check: splitting on whitespace and re-joining with no separator
# turns 'Science Fiction' into the single token 'ScienceFiction'.
''.join(('Science Fiction').split())

In [None]:
def convert_string_to_list(json_string):
    """Parse a JSON-encoded list of dicts and strip whitespace from each name.

    Args:
        json_string: JSON text such as
            '[{"id": 878, "name": "Science Fiction"}]'.

    Returns:
        The decoded list of dicts, with all whitespace removed from every
        'name' value (e.g. "Science Fiction" -> "ScienceFiction"). An empty
        list is returned when the input cannot be decoded, so callers can
        iterate over the result unconditionally.
    """
    try:
        # Convert the JSON string to a list of dictionaries
        list_of_dicts = json.loads(json_string)
    except (json.JSONDecodeError, TypeError) as e:
        # Not valid JSON, or not a string at all (e.g. NaN from a missing cell)
        print(f"Error decoding JSON: {e}")
        return []
    for dic in list_of_dicts:
        # Collapse multi-word names into a single token
        dic['name'] = ''.join(dic['name'].split())
    return list_of_dicts

# Try the parser on a sample list of genre entries and show the result.
json_string = (
    '[{"id": 28, "name": "Action"}, '
    '{"id": 12, "name": "Adventure"}, '
    '{"id": 14, "name": "Fantasy"}, '
    '{"id": 878, "name": "Science Fiction"}]'
)
list_of_dicts = convert_string_to_list(json_string)
print(list_of_dicts)

In [None]:
# Flatten each JSON list of entries into one space-separated string of names;
# the same transformation is applied to both columns of the raw frame.
for column in ('genres', 'keywords'):
    df_condensed[column] = df[column].apply(
        lambda raw: ' '.join(
            [entry['name'] for entry in convert_string_to_list(raw)]
        )
    )

In [None]:
# Report the row count, remove every movie whose overview is empty, then
# show the row count again. The remaining rows keep their original labels.
print(len(df_condensed))
empty_overview_mask = (df_condensed['overview'] == '').to_numpy()
indices_with_empty_overview = df_condensed.index[empty_overview_mask]
df_condensed.drop(index=indices_with_empty_overview, inplace=True)
len(df_condensed)

In [None]:
# Text passed to the vectorizer, one string per movie.
# Alternative that also includes the title and the overview:
# inputs = df_condensed['title'] + '. ' + df_condensed['genres'] + '. ' + df_condensed['keywords'] + '. ' + df_condensed['overview']
# Current choice: genres and keywords only.
inputs = df_condensed['genres'] + '. ' + df_condensed['keywords']

In [None]:
class Stemmer:
    """Callable tokenizer: splits a document into words and Porter-stems them."""

    def __init__(self) -> None:
        self.stemmer = PorterStemmer()

    def __call__(self, doc) -> list[str]:
        """Tokenize `doc` with `word_tokenize` and return the stemmed tokens."""
        stemmed = []
        for token in word_tokenize(doc):
            stemmed.append(self.stemmer.stem(token))
        return stemmed

In [None]:
# Alternative configuration (stop-word removal, Stemmer tokenizer, accent stripping):
# tfidfvetoriser = TfidfVectorizer(stop_words='english', tokenizer=Stemmer(), strip_accents='ascii', lowercase=True)
# Current configuration: default tokenization, vocabulary capped at 2000 features.
tfidfvetoriser = TfidfVectorizer(max_features=2000)
# Fit on all movie texts; X_train has one TF-IDF row per entry of `inputs`, in the same order.
X_train = tfidfvetoriser.fit_transform(inputs)

In [None]:
def recommender(movie_title: str, n_top: int = 5) -> list:
    """Return the titles of the `n_top` movies most similar to `movie_title`.

    Similarity is the cosine similarity between the TF-IDF vector of the
    query movie's text in `inputs` and every row of `X_train`.

    Args:
        movie_title: Exact title to look up in `df_condensed['title']`. If
            several rows share the title, the first one is used.
        n_top: Maximum number of recommendations to return.

    Returns:
        Titles ordered from most to least similar, excluding the query movie.

    Raises:
        ValueError: If no row of `df_condensed` has the given title.
    """
    # Work with row *positions*, not index labels: the rows of X_train and
    # `.iloc` are positional, while the labels of df_condensed have gaps once
    # rows have been dropped, so labels and positions must not be compared.
    matches = np.flatnonzero((df_condensed['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise ValueError(f"Movie title not found: {movie_title!r}")
    position = int(matches[0])
    # get tfidf representation of the query movie (one row, kept 2-D);
    # relies on `inputs`, `X_train` and `df_condensed` sharing the same row order
    X_test = tfidfvetoriser.transform(inputs.iloc[[position]])
    # compute cos similarities to the database
    cos_similarities = cosine_similarity(X_train, X_test).reshape(-1)
    # rank every movie from most to least similar
    ranked = np.argsort(cos_similarities)[::-1]
    # delete the query movie itself first, then keep at most n_top results
    best_indices = ranked[ranked != position][:max(n_top, 0)]
    print(cos_similarities[best_indices])
    # retrieve movie titles
    return df_condensed.iloc[best_indices]['title'].to_list()

In [None]:
# Example query: the five closest matches for 'Runaway Bride'.
recommender('Runaway Bride', n_top=5)

In [None]:
# Example query: the five closest matches for 'Mortal Kombat'.
recommender('Mortal Kombat', n_top=5)