In [98]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#### Loading and Preprocessing Data

The code below took the original dataset of 9998 rows, kept only the title and overview columns, removed rows with null values and duplicate titles, and took a random sample of 500 rows. The resulting dataframe was then written to a new CSV file (`data.csv`) so it could be easily used and uploaded to GitHub.

In [111]:
# One-time preprocessing that produced data.csv from the original dataset.
# Kept commented out for provenance only; it does not need to be re-run.
# df = pd.read_csv('top_rated_movies.csv')
# df = df[['title', 'overview']]
# #drop rows with null values
# df.isna().sum()
# df = df.dropna(subset=['title', 'overview'])
# #remove duplicate rows
# df = df.drop_duplicates(subset='title', keep='first')
# df = df.sample(n=500, random_state=42)
# df.to_csv('data.csv', index=False)

# Load the preprocessed sample so that `df` is defined on a fresh kernel
# (Restart Kernel -> Run All); the cells below depend on it.
df = pd.read_csv('data.csv')


#### Training TF-IDF Model

In [115]:
def prepare_recommender(df):
    """Fit a TF-IDF model on the movie overviews.

    Args:
        df: DataFrame with a 'title' column and an 'overview' column.

    Returns:
        A tuple ``(vectorizer, tfidf_matrix, titles, overviews)``:
        the fitted TfidfVectorizer, the sparse TF-IDF matrix (one row per
        movie, one column per vocabulary term), and the titles and overviews
        as plain lists in the same row order as ``df``.
    """
    # A missing overview (NaN) makes TfidfVectorizer raise, so treat it as
    # empty text; such a movie simply gets an all-zero vector.
    overviews = df['overview'].fillna('')

    vectorizer = TfidfVectorizer(
        # drop common English words ("the", "is", "and") that do not help
        # distinguish one document from another
        stop_words='english',
        # account for both unigrams and bigrams
        ngram_range=(1, 2),
        # limit the vocabulary size
        max_features=5000,
        # drop terms that appear in fewer than 2 documents
        min_df=2,
        # tokens are runs of ASCII letters only: digits and special
        # characters are ignored
        token_pattern=r'\b[a-zA-Z]+\b'
    )
    # fit computes the IDF of every term; transform then converts each movie
    # overview into a TF-IDF vector over that vocabulary
    tfidf_matrix = vectorizer.fit_transform(overviews)

    return (vectorizer, tfidf_matrix, df['title'].tolist(), overviews.tolist())

In [116]:
def get_recommendations(query, vectorizer, tfidf_matrix, titles, overviews, top_n=5):
    """Return the movies whose overviews are most similar to a text query.

    Args:
        query: Free-text description of what the user wants to watch.
        vectorizer: Fitted vectorizer used to build ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix with one row per movie.
        titles: Movie titles, in the same order as the matrix rows.
        overviews: Movie overviews, in the same order as the matrix rows.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A list of ``(title, overview, similarity)`` tuples sorted from most
        to least similar, with at most ``top_n`` entries.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Transform the user input into the same TF-IDF space as the movies
    query_vector = vectorizer.transform([query])

    # Cosine similarity between the query and every overview, as a 1-D array
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it and keep the first top_n indices.
    # Slicing after reversing avoids the `[-0:]` pitfall, which would select
    # every row when top_n is 0.
    top_indices = similarities.argsort()[::-1][:top_n]

    return [(titles[idx], overviews[idx], similarities[idx]) for idx in top_indices]

#### Recommend Movies

In [117]:
# Fit the TF-IDF model on the movie overviews
vectorizer, tfidf_matrix, titles, overviews = prepare_recommender(df)

# Ask the user for a free-text description and rank the movies against it
query = input('Enter a description of a movie you like: ')
recommendations = get_recommendations(query, vectorizer, tfidf_matrix, titles, overviews)

print(f"\nRecommendations for query: '{query}'\n")

# Last expression of the cell, so the table is rendered as the cell output
top_5 = pd.DataFrame(recommendations, columns=["Title", "Overview", "Similarity Score"])
top_5


Recommendations for query: 'I love thrilling action movies set in space, with a comedic twist.'



Unnamed: 0,Title,Overview,Similarity Score
0,Monsters vs Aliens,"When Susan Murphy is unwittingly clobbered by a meteor full of outer space gunk on her wedding day, she mysteriously grows to 49-feet-11-inches. The military jumps into action and captures Susan, secreting her away to a covert government compound. She is renamed Ginormica and placed in confinement with a ragtag group of Monsters...",0.223936
1,Camp Rock,"When Mitchie gets a chance to attend Camp Rock, her life takes an unpredictable twist, and she learns just how important it is to be true to yourself.",0.142727
2,Jingle All the Way,"Howard Langston, a salesman for a mattress company, is constantly kept busy at his job, disappointing his son. After he misses his son's karate exposition, Howard vows to make it up to him by buying an action figure of his son's favorite television hero for Christmas. Unfortunately for Howard, it is Christmas Eve, and every store is sold out of Turbo Man. Now, Howard must travel all over town and compete with everybody else to find a Turbo Man action figure.",0.128999
3,Pokémon: The Rise of Darkrai,"Ash and friends (this time accompanied by newcomer Dawn) arrive at an idyllic village on their way to their next Pokemon contest, where chaos will soon erupt with the prophecy of two Pokemon Gods (Dialga and Palkia) and the arrival of a mysterious, seemingly deadly Pokemon named Darkrai, which has the power to distort space and time.",0.118958
4,"Kiki, Love to Love","Through five stories, the movie addresses sex and love: Paco and Ana are a marriage looking for reactivate the passion of their sexual relations, long time unsatisfied; Jose Luis tries to recover the affections of his wife Paloma, sit down on a wheelchair after an accident which has limited her mobility; Mª Candelaria and Antonio are a marriage trying by all way to be parents, but she has the trouble that no get an orgasm when make love with him; Álex try to satisfy Natalia's fantasies, while she starts to doubt if he finally will ask her in marriage; and finally, Sandra is a single woman in a permanent searching for a man to fall in love. All them love, fear, live and explore their diverse sexual paraphilias and the different sides of sexuality, trying to find the road to happiness.",0.115024
