In [1]:
import pandas as pd
#%pip install sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Import necessary modules from the NLTK library for text processing
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK resources for tokenization, lemmatization, and stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\johnraesly\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\johnraesly\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\johnraesly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# Helper Functions

def clean_text(text):
    # Return an empty string if text is not a string
    if not isinstance(text, str):
        return ""
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation while retaining words and digits
    text = re.sub(r'[^\\w\\s\\d]', '', text)
    # Tokenize the text into words
    words = word_tokenize(text)
    # Define English stopwords
    stop_words = set(stopwords.words('english'))
    # Remove stopwords from the tokenized words
    words = [word for word in words if word not in stop_words]
    # Initialize the WordNet lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Lemmatize each word
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join the words back into a single string
    text = ' '.join(words)
    return text


# Define a function to recommend the top 5 similar movies for a given movie title
def recommend(movie_title, num_recommendations=5):
    # Find the index of the given movie in the DataFrame
    try:
        index = movies[movies['title'] == movie_title].index[0]
    except IndexError:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return
    
    # Calculate similarity scores, sort them, and get titles of the top similar movies
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector: vector[1])
    
    # Introduce randomness by shuffling the recommendations before selecting the top ones
    recommendations = distance[1:num_recommendations + 1]
    random.shuffle(recommendations)
    
    for i in recommendations:
        print(movies.iloc[i[0]].title + " : " + movies.iloc[i[0]].genre)

In [11]:
# import dataset
movies = pd.read_csv('C:/Users/johnraesly/Downloads/movies/top10K-TMDB-movies.csv')

In [None]:
print(movies.head(20))

In [12]:
print(movies.columns)

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')


In [4]:
movies = movies[['id', 'title', 'overview', 'genre']]


In [13]:
movies['tags'] = movies['overview'] + movies['genre']

In [14]:
new_data = movies.drop(columns=['overview'])


In [7]:
print(movies[0:5])

      id                        title  \
0    278     The Shawshank Redemption   
1  19404  Dilwale Dulhania Le Jayenge   
2    238                The Godfather   
3    424             Schindler's List   
4    240       The Godfather: Part II   

                                            overview                 genre  \
0  Framed in the 1940s for the double murder of h...           Drama,Crime   
1  Raj is a rich, carefree, happy-go-lucky second...  Comedy,Drama,Romance   
2  Spanning the years 1945 to 1955, a chronicle o...           Drama,Crime   
3  The true story of how businessman Oskar Schind...     Drama,History,War   
4  In the continuing saga of the Corleone crime f...           Drama,Crime   

                                                tags  
0  Framed in the 1940s for the double murder of h...  
1  Raj is a rich, carefree, happy-go-lucky second...  
2  Spanning the years 1945 to 1955, a chronicle o...  
3  The true story of how businessman Oskar Schind...  
4  In the

In [18]:
movies['tags'] = movies['tags'].fillna('')

# Apply the clean_text function to the 'tags' column of 'new_data' and store the result in 'tags_clean'
movies['tags_clean'] = movies['tags'].apply(clean_text)

In [22]:
movies['tags_clean'] = movies['tags_clean'].fillna('')
movies['vote_average'] = movies['vote_average'].fillna(0)
movies['vote_count'] = movies['vote_count'].fillna(0)
movies['genre'] = movies['genre'].fillna('')
movies['combined_features'] = ( movies['tags_clean'] + ' '+ movies['genre'] + ' ' + movies['vote_average'].astype(str) + ' ' + movies['vote_count'].astype(str) )

In [23]:
# Initialize CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

# Vectorize the cleaned 'tags' text
vectorized_data = cv.fit_transform(movies['combined_features']).toarray()

In [24]:
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
vectorized_data_tdif = tfidf.fit_transform(movies['tags_clean']).toarray()
similarity_tdif = cosine_similarity(vectorized_data)

In [35]:
recommend('The Godfather')

The Shawshank Redemption : Drama,Crime
To Kill a Mockingbird : Crime,Drama
Requiem for a Dream : Crime,Drama
Taxi Driver : Crime,Drama
The Godfather: Part II : Drama,Crime


In [30]:
similarity = cosine_similarity(vectorized_data)


In [None]:
# Calculate similarity scores for the third movie with all other movies, sort them, and store the result
distance = sorted(list(enumerate(similarity[4])), reverse=True, key=lambda vector: vector[1])

# Print the titles of the first five movies most similar to the third movie
for i in distance[0:10]:
    print(new_data.iloc[i[0]].title)

In [None]:
# Define a function to recommend the top 5 similar movies for a given movie title
def recommend(movies):
    # Find the index of the given movie in the DataFrame
    index = new_data[new_data['title'] == movies].index[0]
    # Calculate similarity scores, sort them, and print titles of the top 5 similar movies
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector: vector[1])
    for i in distance[1:6]:  # start from 1 to skip the movie itself
        print(new_data.iloc[i[0]].title)



In [None]:
# find if the Notebook movie is in the dataset
print(new_data[new_data['title'] == 'The Notebook'])

In [None]:
# Example usage
recommend("Dilwale Dulhania Le Jayenge")