In [1]:
import pandas as pd

df = pd.read_csv('movies.csv', dtype='unicode')

df.dropna(subset=['Title', 'Genre', 'Language',  'imdbVotes', 'imdbRating', 'imdbID'], inplace=True)

df.head()

Unnamed: 0,_id,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,Type,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID
5,666a8cf1fb459925bc9230dd,Chinese Opium Den,1894,,17 Oct 1894,1 min,Short,William K.L. Dickson,,,...,movie,,,,,True,,,,
33,666a8cf1fb459925bc9230f9,Arrivée d'un train gare de Vincennes,1896,,,1 min,"Documentary, Short",Georges Méliès,,,...,movie,,,,,True,,,,
68,666a8cf1fb459925bc92311c,Post No Bills,1896,Not Rated,,1 min,"Short, Comedy",Georges Méliès,,,...,movie,,,,,True,,,,
173,666a8cf1fb459925bc923185,Výstavní párkar a lepic plakátù,1898,,01 Jul 1898,1 min,"Short, Comedy",Jan Krízenecký,Josef Sváb-Malostranský,"Josef Sváb-Malostranský, Ferdinand Gýra",...,movie,,,,,True,,,,
176,666a8cf1fb459925bc923188,The Burglar on the Roof,1898,Not Rated,01 Sep 1898,1 min,"Short, Crime",J. Stuart Blackton,,J. Stuart Blackton,...,movie,,,,,True,,,,


In [2]:
filter_df = df[['Title', 'Plot', 'Genre', 'Language', 'Actors', 'imdbVotes', 'imdbRating', 'imdbID']]

# Remove commas and convert 'imdbVotes' to integer
filter_df.loc[:, 'imdbVotes'] = filter_df['imdbVotes'].str.replace(',', '').astype(int)

# Convert 'imdbRating' to float
filter_df.loc[:, 'imdbRating'] = filter_df['imdbRating'].astype(float)

# Display the DataFrame
filter_df = filter_df.dropna(subset=['Plot'])

df = filter_df[filter_df['imdbVotes'] > 10000 ]

print(df.shape)

df

(4319, 8)


Unnamed: 0,Title,Plot,Genre,Language,Actors,imdbVotes,imdbRating,imdbID
416,A Trip to the Moon,An association of astronomers has convened to ...,"Short, Action, Adventure","French, English","Georges Méliès, Victor André, Bleuette Bernon",55546,8.1,tt0000417
438,The Great Train Robbery,Among the earliest existing films in American ...,"Short, Action, Adventure",English,"Gilbert M. 'Broncho Billy' Anderson, A.C. Abad...",20590,7.3,tt0000439
4964,The Birth of a Nation,"Two brothers, Phil and Ted Stoneman, visit the...","Drama, History, War",English,"Lillian Gish, Mae Marsh, Henry B. Walthall",26258,6.2,tt0004972
6855,Intolerance,Intolerance and its terrible effects are exami...,"Drama, History",English,"Lillian Gish, Robert Harron, Mae Marsh",16594,7.7,tt0006864
9953,Broken Blossoms,Cheng Huan is a missionary whose goal is to br...,"Drama, Romance",English,"Lillian Gish, Richard Barthelmess, Donald Crisp",10968,7.2,tt0009968
...,...,...,...,...,...,...,...,...
245926,Baise-moi,Manu and Nadine lose their last tenuous relati...,"Crime, Drama, Thriller",French,"Raffaëla Anderson, Karen Lancaume, Céline Beugnot",18526,4.4,tt0249380
246007,Billy Elliot,"County Durham, during the endless, violent 198...","Drama, Music",English,"Jamie Bell, Julie Walters, Jean Heywood",142566,7.7,tt0249462
246022,Domestic Disturbance,Susan Morrison is getting married to wealthy i...,"Crime, Thriller",English,"John Travolta, Nick Loren, Matt O'Leary",28680,5.6,tt0249478
246060,Foodfight!,"When the supermarket closes at night, the cont...","Animation, Action, Adventure",English,"Hilary Duff, Haylie Duff, Charlie Sheen",11723,1.3,tt0249516


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Filter the DataFrame based on imdbVotes
df = filter_df[filter_df['imdbVotes'] > 500000]

# Set the 'imdbID' column as the index
df.set_index('imdbID', inplace=True)

# Concatenate 'Plot' and 'Genre' columns
df['Plot_Genre'] = df['Plot'] + ' ' + df['Genre'] + ' ' + df['Actors'] 

# Define a TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df['Plot_Genre'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def get_recommendations(imdb_id, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the IMDb ID
    idx = df.index.get_loc(imdb_id)

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df.iloc[movie_indices]

# Example: Get recommendations for a movie by IMDb ID
movie_imdb_id = "tt0071562"  # IMDb ID for "The Godfather"
recommendations = get_recommendations(movie_imdb_id)
print("Recommendations for the movie with IMDb ID", movie_imdb_id, ":")
recommendations


Recommendations for the movie with IMDb ID tt0071562 :


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Plot_Genre'] = df['Plot'] + ' ' + df['Genre'] + ' ' + df['Actors']


Unnamed: 0_level_0,Title,Plot,Genre,Language,Actors,imdbVotes,imdbRating,Plot_Genre
imdbID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0068646,The Godfather,"The Godfather ""Don"" Vito Corleone is the head ...","Crime, Drama","English, Italian, Latin","Marlon Brando, Al Pacino, James Caan",2024175,9.2,"The Godfather ""Don"" Vito Corleone is the head ..."
tt0112641,Casino,This Martin Scorsese film depicts the Janus-li...,"Crime, Drama",English,"Robert De Niro, Sharon Stone, Joe Pesci",560760,8.2,This Martin Scorsese film depicts the Janus-li...
tt0075314,Taxi Driver,Travis Bickle is an ex-Marine and Vietnam War ...,"Crime, Drama","English, Spanish","Robert De Niro, Jodie Foster, Cybill Shepherd",926206,8.2,Travis Bickle is an ex-Marine and Vietnam War ...
tt0209144,Memento,Memento chronicles two separate stories of Leo...,"Mystery, Thriller",English,"Guy Pearce, Carrie-Anne Moss, Joe Pantoliano",1322281,8.4,Memento chronicles two separate stories of Leo...
tt0099685,Goodfellas,"Henry Hill might be a small time gangster, who...","Biography, Crime, Drama","English, Italian","Robert De Niro, Ray Liotta, Joe Pesci",1259934,8.7,"Henry Hill might be a small time gangster, who..."
tt0113277,Heat,Hunters and their prey--Neil and his professio...,"Action, Crime, Drama","English, Spanish","Al Pacino, Robert De Niro, Val Kilmer",715349,8.3,Hunters and their prey--Neil and his professio...
tt0086250,Scarface,Tony Montana manages to leave Cuba during the ...,"Crime, Drama","English, Spanish","Al Pacino, Michelle Pfeiffer, Steven Bauer",915945,8.3,Tony Montana manages to leave Cuba during the ...
tt0099785,Home Alone,It is Christmas time and the McCallister famil...,"Comedy, Family","English, French","Macaulay Culkin, Joe Pesci, Daniel Stern",651142,7.7,It is Christmas time and the McCallister famil...
tt0097165,Dead Poets Society,Painfully shy Todd Anderson has been sent to t...,"Comedy, Drama","English, Latin","Robin Williams, Robert Sean Leonard, Ethan Hawke",550445,8.1,Painfully shy Todd Anderson has been sent to t...
tt0108778,Friends,"Ross Geller, Rachel Green, Monica Geller, Joey...","Comedy, Romance","English, Spanish, Italian, French, Dutch, Hebrew","Jennifer Aniston, Courteney Cox, Lisa Kudrow",1094224,8.9,"Ross Geller, Rachel Green, Monica Geller, Joey..."


In [4]:
df = filter_df[filter_df['imdbVotes'] > 500000]

df = df[['imdbID', 'Title', 'Language', 'Genre', 'imdbVotes', 'imdbRating']]

df.set_index('imdbID', inplace=True)

df['imdbRating'] = df['imdbRating'].astype(float)

df['imdbVotes'] = df['imdbVotes'].astype(int)


df

Unnamed: 0_level_0,Title,Language,Genre,imdbVotes,imdbRating
imdbID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0034583,Casablanca,"English, French, German, Italian, Russian","Drama, Romance, War",607783,8.5
tt0038650,It's a Wonderful Life,"English, French","Drama, Family, Fantasy",500879,8.6
tt0047396,Rear Window,English,"Mystery, Thriller",522060,8.5
tt0050083,12 Angry Men,English,"Crime, Drama",869326,9.0
tt0054215,Psycho,English,"Horror, Mystery, Thriller",721707,8.5
...,...,...,...,...,...
tt0240772,Ocean's Eleven,"English, Italian, Mandarin","Crime, Thriller",618831,7.7
tt0241527,Harry Potter and the Sorcerer's Stone,"English, Latin","Adventure, Family, Fantasy",860139,7.6
tt0242653,The Matrix Revolutions,"English, French","Action, Sci-Fi",545548,6.7
tt0245429,Spirited Away,"Japanese, English","Animation, Adventure, Family",854969,8.6


In [5]:

preferred_languages = ['Spanish']
preferred_genres = ['Crime', 'Drama',]


# Function to recommend top 10 movies for each genre and preferred languages
def recommend_movies_by_genre_and_language(genre, languages):
    # Filter dataframe for the given genre and preferred languages
    genre_df = df[df['Genre'].str.contains(genre) & df['Language'].apply(lambda x: any(lang in x for lang in languages))]

    # Sort by imdbVotes and imdbRating in descending order
    sorted_df = genre_df.sort_values(by=['imdbVotes', 'imdbRating'], ascending=False)

    # Select top 10 movies
    top_movies = sorted_df.head(10)
    
    return top_movies

# Generate recommendations for each genre and preferred languages
recommendations = {}
for genre in preferred_genres:
    recommendations[genre] = recommend_movies_by_genre_and_language(genre, preferred_languages)

# Print recommendations
for genre, movies in recommendations.items():
    print(f"\nTop 10 Movies in {genre} for Preferred Languages:\n")
    print(movies)


Top 10 Movies in Crime for Preferred Languages:

                           Title                                    Language  \
imdbID                                                                         
tt0110912           Pulp Fiction                    English, Spanish, French   
tt0071562  The Godfather Part II  English, Italian, Spanish, Latin, Sicilian   
tt0114814     The Usual Suspects         English, Hungarian, Spanish, French   
tt0075314            Taxi Driver                            English, Spanish   
tt0086250               Scarface                            English, Spanish   
tt0118715       The Big Lebowski            English, German, Hebrew, Spanish   
tt0144084        American Psycho                 English, Spanish, Cantonese   
tt0113277                   Heat                            English, Spanish   

                           Genre  imdbVotes  imdbRating  
imdbID                                                   
tt0110912           Crime, Drama 