In [98]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#### Loading and Preprocessing Data

In [99]:
# Read the raw movie export into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='top_rated_movies.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,adult,genre_ids,original_language,original_title,overview,popularity,release_date,title,vote_average,vote_count
0,0,False,"[18, 80]",en,The Godfather,"Spanning the years 1945 to 1955, a chronicle of the fictional Italian-American Corleone crime family. When organized crime family patriarch, Vito Corleone barely survives an attempt on his life, his youngest son, Michael steps in to take care of the would-be killers, launching a campaign of bloody revenge.",114.774,1972-03-14,The Godfather,8.7,17855
1,1,False,"[18, 80]",en,The Shawshank Redemption,"Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.",90.925,1994-09-23,The Shawshank Redemption,8.7,23711
2,2,False,"[35, 14]",es,Cuando Sea Joven,"70-year-old Malena gets a second chance at life when she magically turns into her 22-year-old self. Now, posing as ""Maria"" to hide her true identity, she becomes the lead singer of her grandson's band and tries to recover her dream of singing, which she had to give up at some point.",29.101,2022-09-14,Cuando Sea Joven,8.6,214
3,3,False,"[18, 80]",en,The Godfather Part II,"In the continuing saga of the Corleone crime family, a young Vito Corleone grows up in Sicily and in 1910s New York. In the 1950s, Michael Corleone attempts to expand the family business into Las Vegas, Hollywood and Cuba.",54.944,1974-12-20,The Godfather Part II,8.6,10801
4,4,False,"[18, 36, 10752]",en,Schindler's List,The true story of how businessman Oskar Schindler saved over a thousand Jewish lives from the Nazis while they worked as slaves in his factory during World War II.,55.735,1993-12-15,Schindler's List,8.6,14026


In [100]:
# Only the title and the plot overview are needed by the recommender.
df = df.loc[:, ['title', 'overview']]
df.head()

Unnamed: 0,title,overview
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle of the fictional Italian-American Corleone crime family. When organized crime family patriarch, Vito Corleone barely survives an attempt on his life, his youngest son, Michael steps in to take care of the would-be killers, launching a campaign of bloody revenge."
1,The Shawshank Redemption,"Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope."
2,Cuando Sea Joven,"70-year-old Malena gets a second chance at life when she magically turns into her 22-year-old self. Now, posing as ""Maria"" to hide her true identity, she becomes the lead singer of her grandson's band and tries to recover her dream of singing, which she had to give up at some point."
3,The Godfather Part II,"In the continuing saga of the Corleone crime family, a young Vito Corleone grows up in Sicily and in 1910s New York. In the 1950s, Michael Corleone attempts to expand the family business into Las Vegas, Hollywood and Cuba."
4,Schindler's List,The true story of how businessman Oskar Schindler saved over a thousand Jewish lives from the Nazis while they worked as slaves in his factory during World War II.


In [101]:
# Report the number of missing values per column before removing them.
# A bare expression that is not the last line of a cell is never displayed,
# so the counts are printed explicitly.
print(df.isna().sum())

# Drop rows with a missing title or overview; both are required downstream.
df = df.dropna(subset=['title', 'overview'])

In [102]:
# Keep only the first row encountered for each movie title.
df = df[~df.duplicated(subset=['title'], keep='first')]

In [103]:
# Work on a reproducible random subset of at most 500 movies.
# Capping n at len(df) avoids the ValueError that DataFrame.sample raises
# when asked for more rows than exist (e.g. with a smaller input file).
df = df.sample(n=min(500, len(df)), random_state=42)
df.head(5)

Unnamed: 0,title,overview
3545,Heavenly Creatures,"Wealthy and precocious teenager Juliet transfers from England to New Zealand with her family, and soon befriends the quiet, brooding Pauline through their shared love of fantasy and literature. When their parents begin to suspect that their increasingly intense and obsessive bond is becoming unhealthy, the girls hatch a dark plan for those who threaten to keep them apart."
9009,The Bride,This is a story of a small town girl named Nastya. Her newly married husband Ivan drives her to his hometown to meet his family. Upon their arrival Nastya feels she made a wrong decision: she is surrounded by weird people and she sees creepy photos all around the house. Ivan disappears while his family prepares Nastya for a mysterious traditional Russian wedding ceremony. She struggles to calm herself down hoping all this will end and they will live happily ever after. Only if she stays alive...
3139,The Wife,"A wife questions her life choices as she travels to Stockholm with her husband, where he is slated to receive the Nobel Prize for Literature."
3919,Just Like Heaven,"Shortly after David Abbott moves into his new San Francisco digs, he has an unwelcome visitor on his hands: winsome Elizabeth Martinson, who asserts that the apartment is hers -- and promptly vanishes. When she starts appearing and disappearing at will, David thinks she's a ghost, while Elizabeth is convinced she's alive."
3737,Frailty,A man confesses to an FBI agent his family's story of how his religious fanatic father's visions lead to a series of murders to destroy supposed 'demons'.


In [104]:
# Persist the cleaned sample without the pandas index column.
df.to_csv(path_or_buf='data.csv', index=False)


#### Training TF-IDF Model

In [105]:
def prepare_recommender(df):
    """Fit a TF-IDF model on the movie overviews.

    Args:
        df: DataFrame with a 'title' column and an 'overview' column.

    Returns:
        A 4-tuple ``(vectorizer, tfidf_matrix, titles, overviews)`` where
        ``vectorizer`` is the fitted TfidfVectorizer, ``tfidf_matrix`` is the
        sparse matrix with one row per movie and one column per term, and
        ``titles`` / ``overviews`` are plain lists in the same row order.
    """
    tfidf_settings = dict(
        # Drop common English words ("the", "is", "and") that do not help
        # tell one overview from another.
        stop_words='english',
        # Tokens are runs of ASCII letters only; digits and punctuation are ignored.
        token_pattern=r'\b[a-zA-Z]+\b',
        # Use both single words (unigrams) and word pairs (bigrams) as terms.
        ngram_range=(1, 2),
        # Discard terms that occur in fewer than 2 overviews.
        min_df=2,
        # Cap the vocabulary size.
        max_features=5000,
    )
    vectorizer = TfidfVectorizer(**tfidf_settings)

    # fit: learn the vocabulary and the IDF weight of every term;
    # transform: turn each overview into its TF-IDF vector.
    tfidf_matrix = vectorizer.fit_transform(df['overview'])

    titles = df['title'].tolist()
    overviews = df['overview'].tolist()
    return (vectorizer, tfidf_matrix, titles, overviews)

In [106]:
def get_recommendations(query, vectorizer, tfidf_matrix, titles, overviews, top_n=5):
    """Return the movies whose overviews are most similar to a free-text query.

    Args:
        query: Free-text description of what the user wants to watch.
        vectorizer: Fitted vectorizer used to build ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix with one row per movie.
        titles: Movie titles, in the same order as the rows of ``tfidf_matrix``.
        overviews: Movie overviews, in the same order as ``titles``.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A list of ``(title, overview, similarity)`` tuples sorted by descending
        similarity. Movies with zero similarity are omitted, so the list may
        hold fewer than ``top_n`` entries and is empty when the query shares
        no vocabulary with any overview.
    """
    if top_n <= 0:
        return []

    # Project the query into the same TF-IDF space as the movie overviews.
    query_vector = vectorizer.transform([query])

    # Cosine similarity between the query and every overview, as a 1-D array.
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it before slicing. Reversing first keeps
    # the slice correct for any top_n (a `[-top_n:]` slice breaks for 0).
    ranked_indices = similarities.argsort()[::-1][:top_n]

    # A score of 0 means no shared terms at all; such a movie is not a
    # recommendation, just an arbitrary row, so it is left out.
    return [
        (titles[idx], overviews[idx], similarities[idx])
        for idx in ranked_indices
        if similarities[idx] > 0
    ]

#### Recommend Movies

In [107]:
# Fit the TF-IDF model on the sampled movies.
vectorizer, tfidf_matrix, titles, overviews = prepare_recommender(df)

# Ask the user for a free-text description and rank the movies against it.
query = input('Enter a description of a movie you like: ')
recommendations = get_recommendations(query, vectorizer, tfidf_matrix, titles, overviews)

print(f"\nRecommendations for query: '{query}'\n")

# Tell the user when nothing actually matched, instead of leaving them with
# an empty list or with results that all score 0.
if not any(score > 0 for _, _, score in recommendations):
    print("No overview shares any vocabulary with the query; try different wording.")

for title, overview, score in recommendations:
    print(f"Title: {title}")
    print(f"Overview: {overview}")
    print(f"Similarity Score: {score:.4f}")
    print("-" * 50)


Recommendations for query: 'I love thrilling action movies set in space, with a comedic twist.'

Title: Monsters vs Aliens
Overview: When Susan Murphy is unwittingly clobbered by a meteor full of outer space gunk on her wedding day, she mysteriously grows to 49-feet-11-inches. The military jumps into action and captures Susan, secreting her away to a covert government compound. She is renamed Ginormica and placed in confinement with a ragtag group of Monsters...
Similarity Score: 0.2239
--------------------------------------------------
Title: Camp Rock
Overview: When Mitchie gets a chance to attend Camp Rock, her life takes an unpredictable twist, and she learns just how important it is to be true to yourself.
Similarity Score: 0.1427
--------------------------------------------------
Title: Jingle All the Way
Overview: Howard Langston, a salesman for a mattress company, is constantly kept busy at his job, disappointing his son. After he misses his son's karate exposition, Howard vow