In [53]:
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
from sklearn import linear_model
from sentence_transformers import SentenceTransformer, util
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Create Dataset

I couldn't found a dataset with both plot description and genres with less than 500 rows; therefore, I prepocessed and merged three datasets to create a new dataset that satisfy this challenge's requirement.

In [54]:
main = pd.read_csv('data/movies_metadata.csv')
main = main.dropna()
main = main.drop_duplicates()
main = main[['id', 'original_title', 'release_date','genres', 'vote_average' ]]
main['id'] = main['id'].astype(int)

  main = pd.read_csv('data/movies_metadata.csv')


In [55]:
keywords = pd.read_csv('data/keywords.csv')
keywords = keywords.dropna()
keywords = keywords.drop_duplicates()

In [56]:
plot = pd.read_csv('data/wiki_movie_plots_deduped.csv')
plot = plot.dropna()
plot = plot.drop_duplicates()
plot['Release Year'] = plot['Release Year'].astype(str)
plot = plot[['Title','Plot', 'Release Year']]

Convert the main dataset genres column to column of lists of genres

In [57]:
import ast
genres = []
for row in list(main['genres']):
    row = ast.literal_eval(row)
    row_genre = []
    for genre in row:
        # genre = ast.literal_eval(genre)
        row_genre.append(genre['name'])
    genres.append(row_genre)

Remove rows that genres is empty.

In [58]:
main['genres'] = genres
main = main[main['genres'].apply(lambda x: x != [])]
main = main.reset_index(drop = True)


Create a Release Year column from release_date column that only contains years.

In [59]:
main['Release Year'] = main['release_date'].apply(lambda x: x.split('-')[0])

Convert the keyword dataset keywords column to column of lists of keywords

In [60]:
keyword_col = []
for row in list(keywords['keywords']):
    row = ast.literal_eval(row)
    row_keywords = []
    for keyword in row:
        row_keywords.append(keyword['name'])
    keyword_col.append(row_keywords)

Remove rows that keywords is empty.

In [61]:
keywords['keywords'] = keyword_col
keywords = keywords[keywords['keywords'].apply(lambda x: x != [])]
keywords = keywords.reset_index(drop=True)


Merge all three dataframes.

In [65]:
full_movies = pd.merge(main, keywords, on = 'id', how = 'inner')
full_movies = full_movies.drop_duplicates(subset = ['id'])
full_movies = full_movies.rename(columns = {'original_title': 'Title'})
full_movies = pd.merge(full_movies, plot, on = ['Title', 'Release Year'], how = 'inner')
full_movies.drop(columns = ['id', 'release_date'], inplace = True)

Save the created dataset to csv file.

In [None]:
full_movies.to_csv('data/full_movies.csv', index=False)

## Load Data

In [None]:
movies = pd.read_csv('data/full_movies.csv')
movies = movies.drop_duplicates()
movies 

Unnamed: 0,Title,genres,vote_average,Release Year,keywords,Plot
0,GoldenEye,"['Adventure', 'Action', 'Thriller']",6.6,1995,"['cuba', 'falsely accused', 'secret identity',...","In 1986, at Arkhangelsk, MI6 agents James Bond..."
1,Friday,['Comedy'],7.0,1995,"['rap music', 'parent child relationship', 'ra...","Craig Jones, a young man living in South Centr..."
2,From Dusk Till Dawn,"['Horror', 'Action', 'Thriller', 'Crime']",6.9,1996,"['dancing', 'brother brother relationship', 's...",Fugitive bank robbers Seth and Richie Gecko ho...
3,Blue in the Face,['Comedy'],6.8,1995,"['smoking', 'corner shop', 'cigarette', 'tobac...",The film once again centers on the Brooklyn Ci...
4,Mighty Morphin Power Rangers: The Movie,"['Action', 'Adventure', 'Science Fiction', 'Fa...",5.2,1995,"['based on tv series', 'tokusatsu', 'superhero...",The Power Rangers participate with Bulk and Sk...
...,...,...,...,...,...,...
395,Cars 3,"['Family', 'Comedy', 'Animation', 'Adventure']",6.6,2017,"['cgi', 'anthropomorphism']","Lightning McQueen, now a seven-time Piston Cup..."
396,Despicable Me 3,"['Action', 'Animation', 'Adventure', 'Family',...",6.2,2017,['minions'],Former villain Gru is now an agent for the Ant...
397,War for the Planet of the Apes,"['Drama', 'Science Fiction', 'War']",6.7,2017,"['based on novel', 'sequel', 'talking animal',...",Two years have passed[9] since the human-hatin...
398,Goon: Last of the Enforcers,['Comedy'],6.0,2017,['ice hockey'],"During a pro hockey lockout, Doug ""The Thug"" G..."


In [49]:
def preprocess_data(df):
    """
    - Convert list-type columns (genres and keywords) to strings.
    - Create a 'combined_text' column by concatenating Plot, genres, and keywords.
    """

    df['genres'] = df['genres'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
    df['keywords'] = df['keywords'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
    df['combined_text'] = df['Plot'] + " " + df['genres'] + " " + df['keywords']
    
    return df

In [50]:
def tfidf_function(text):
    """
    Convert movie text into TF-IDF vectors.
    """
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
    tfidf_matrix = vectorizer.fit_transform(text)
    return vectorizer, tfidf_matrix

In [63]:
def compute_embeddings(text_list, model):
    """
    Compute dense sentence embeddings for a list of texts using the given model.
    """
    return model.encode(text_list, convert_to_tensor=True)


In [None]:
def recommend_movies(query, vectorizer, tfidf_matrix, model, embeddings, df, weight_tfidf=0.7, weight_st=0.3, top_n=5):
    """
    Given a user query, compute a combined similarity score using both TF-IDF and Sentence Transformer embeddings.
    
    Parameters:
      - query: the user query string.
      - vectorizer, tfidf_matrix: TF-IDF vectorizer and matrix for the combined text.
      - model, embeddings: Sentence Transformer model and precomputed embeddings for the combined text.
      - df: the dataframe containing the movie data.
      - weight_tfidf: weight for the TF-IDF similarity score.
      - weight_st: weight for the Sentence Transformer similarity score.
      - top_n: number of recommendations to return.
      
    Returns:
      A dictionary containing the top recommended movies with Title, combined similarity score, and vote_average.
    """
    # Compute TF-IDF similarity
    query_tfidf = vectorizer.transform([query])
    tfidf_sim = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    
    # Compute Sentence Transformer similarity
    query = model.encode([query], convert_to_tensor=True)
    st_sim = util.cos_sim(query, embeddings)[0]
    st_sim = st_sim.cpu().numpy() if hasattr(st_sim, "cpu") else st_sim
    st_sim = st_sim.flatten()
    
    # Combine the two similarity scores using a weighted sum
    combined_sim = weight_tfidf * tfidf_sim + weight_st * st_sim
    
    # Get indices of the top_n movies
    top_indices = combined_sim.argsort()[-top_n:][::-1]


    recommendations = []
    for idx in top_indices:
        rec = {
            "Title": df.iloc[idx]['Title'],
            "combined_similarity": float(combined_sim[idx]),
            "vote_average": df.iloc[idx]['vote_average']
        }
        recommendations.append(rec)
    
    return recommendations

In [None]:
def main():
    if len(sys.argv) < 3:
        print("Usage: python recommend_combined_no_df.py <path_to_dataset> \"<user query>\"")
        sys.exit(1)
    
    dataset_path = sys.argv[1]
    user_query = sys.argv[2]
    
    # Load and preprocess the dataset
    df = pd.read_csv(dataset_path)
    df = preprocess_data(df)
    
    # Build the TF-IDF matrix for the combined text
    vectorizer, tfidf_matrix = tfidf_function(df['combined_text'])
    
    # Load a pre-trained Sentence Transformer model
    st_model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Compute Sentence Transformer embeddings for each movie's combined text
    st_embeddings = compute_embeddings(df['combined_text'].tolist(), st_model)
    
    # Get combined recommendations
    recommendations = recommend_movies(user_query, vectorizer, tfidf_matrix, st_model, st_embeddings, df)
    
    # Print recommendations in a user-friendly format
    print("Top recommendations:")
    for rec in recommendations:
        print(f"Title: {rec['Title']} | Similarity: {rec['combined_similarity']:.4f} | Vote Average: {rec['vote_average']}")
    
if __name__ == "__main__":
    main()

In [78]:
user_query = 'I like fantasy movies that characters have magic.'
vectorizer, tfidf_matrix = tfidf_function(df['combined_text'])
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = compute_embeddings(df['combined_text'].tolist(), model)
recommend_movies(user_query, vectorizer, tfidf_matrix, model, embeddings, df)


[{'Title': 'Cinderella',
  'combined_similarity': 0.1630351987232602,
  'vote_average': np.float64(6.8)},
 {'Title': 'Sleuth',
  'combined_similarity': 0.1295458823442459,
  'vote_average': np.float64(6.4)},
 {'Title': 'Alice in Wonderland',
  'combined_similarity': 0.12459665912640813,
  'vote_average': np.float64(6.4)},
 {'Title': 'Inferno',
  'combined_similarity': 0.1241196278574556,
  'vote_average': np.float64(5.7)},
 {'Title': "Happily N'Ever After",
  'combined_similarity': 0.11763475195066454,
  'vote_average': np.float64(4.6)}]