In [53]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [54]:
df = pd.read_csv('./datasets/TMDB_movie_dataset.csv')


In [55]:
df = df.iloc[:3000]

In [56]:
df.to_csv('test_df.csv', index=False)

In [57]:
df.shape

(3000, 10)

In [58]:
df.head(1)

Unnamed: 0,id,title,backdrop_path,homepage,overview,popularity,poster_path,genres,production_companies,keywords
0,565770,Blue Beetle,/1syW9SNna38rSl9fnXwc9fP7POW.jpg,https://www.dc.com/bluebeetle,Recent college grad Jaime Reyes returns home f...,2994.357,/mXLOHHc1Zeuwsl4xYKjKh2280oL.jpg,"Action, Science Fiction, Adventure","Warner Bros. Pictures, The Safran Company, DC ...","armor, superhero, family relationships, family..."


In [59]:
df.isnull().sum()

id                         0
title                      0
backdrop_path            101
homepage                1462
overview                  36
popularity                 0
poster_path                6
genres                    67
production_companies     181
keywords                 297
dtype: int64

In [60]:
df.drop(columns=['backdrop_path', 'homepage', 'poster_path'], inplace=True)

In [61]:
df.drop(columns=['popularity'], inplace=True)

In [None]:
#jst for test
df = df[df['keywords'].notna()]
df = df[df['production_companies'].notna()]
df = df[df['genres'].notna()]
df = df[df['poster_path'].notna()]
df = df[df['overview'].notna()]

# both works same
# df.dropna(inplace=True)


In [63]:
df.shape

(2554, 6)

In [64]:
df.head(1)

Unnamed: 0,id,title,overview,genres,production_companies,keywords
0,565770,Blue Beetle,Recent college grad Jaime Reyes returns home f...,"Action, Science Fiction, Adventure","Warner Bros. Pictures, The Safran Company, DC ...","armor, superhero, family relationships, family..."


In [65]:
df['tags'] = (df['genres'] + " " * 5 + # Increase genre weight
              df['keywords'] + " " * 4 + # Increase keywords weight
              df['overview'] + " " + 
              df['production_companies'])

In [66]:
new = df.drop(columns=['genres','overview','production_companies','keywords'])

In [67]:
new.head(3)

Unnamed: 0,id,title,tags
0,565770,Blue Beetle,"Action, Science Fiction, Adventure armor, ..."
1,980489,Gran Turismo,"Action, Drama, Adventure based on true sto..."
2,754720,A Female Boss with Big Tits and Her Cherry Boy...,"Drama cheating, office, big tits, unfaithf..."


In [68]:
# new['tags'] = new['tags'].apply(lambda x: x.lower())
import re
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Replace commas with spaces for better tokenization of genres and keywords
    text = text.replace(',', ' ')
    # Remove special characters but keep hyphens for compound words
    text = re.sub(r'[^a-zA-Z0-9\s-]', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

new['tags'] = new['tags'].apply(preprocess)

In [69]:
new.head(3)


Unnamed: 0,id,title,tags
0,565770,Blue Beetle,action science fiction adventure armor superhe...
1,980489,Gran Turismo,action drama adventure based on true story rac...
2,754720,A Female Boss with Big Tits and Her Cherry Boy...,drama cheating office big tits unfaithful wife...


In [70]:
new['tags'][0]

'action science fiction adventure armor superhero family relationships family high tech job hunting mexican american aftercreditsstinger duringcreditsstinger immigrant family college graduate dc extended universe dceu alien technology brother sister relationship latino recent college grad jaime reyes returns home full of aspirations for his future only to find that home is not quite as he left it as he searches to find his purpose in the world fate intervenes when jaime unexpectedly finds himself in possession of an ancient relic of alien biotechnology the scarab warner bros pictures the safran company dc films'

In [71]:
ps = PorterStemmer()

#this function remove same words like lover loving loved and convert to love

def stem(text):
    return ' '.join([ps.stem(word) for word in text.split()])

new['tags'] = new['tags'].apply(stem)

In [72]:
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 3),  # Include up to trigrams
    min_df=2,  # Minimum document frequency
    max_df=0.85  # Maximum document frequency
)  # Include bigrams
vactor = tfidf.fit_transform(new['tags']).toarray()

In [73]:
vactor

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.13244604, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [74]:
vactor.shape

(2554, 5000)

In [75]:
# cv.get_feature_names_out()

In [76]:
similer = cosine_similarity(vactor)

In [77]:
def recommend(movie):
    try:
        idx = new[new['title'] == movie].index[0]
        movie_genre = df.iloc[idx]['genres'].lower()  # Get the genre of input movie
        
        sim_scores = list(enumerate(similer[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Get top 20 similar movies first
        sim_scores = sim_scores[1:21]
        movie_indices = [i[0] for i in sim_scores]
        
        # Get recommendations and add genre information
        recommendations = new.iloc[movie_indices][['title']].copy()
        recommendations['similarity_score'] = [i[1] for i in sim_scores]
        recommendations['genres'] = df.iloc[movie_indices]['genres']
        
        # Boost scores for movies with matching genres
        for i, row in recommendations.iterrows():
            if any(g.lower() in movie_genre for g in row['genres'].split(',')):
                recommendations.loc[i, 'similarity_score'] *= 1.2
        
        # Sort again after genre boosting and return top 10
        recommendations = recommendations.sort_values('similarity_score', ascending=False)
        return recommendations[['title', 'similarity_score']].head(10)
        
    except IndexError:
        return "Movie not found in database"

In [78]:
recommend('The Dark Knight')

Unnamed: 0,title,similarity_score
656,Shark Bait,0.50698
2109,The Requin,0.43289
476,Shark Side of the Moon,0.391614
1775,Jaws,0.384555
2824,Jaws 3-D,0.361445
2482,Deep Blue Sea,0.340304
1630,Megalodon,0.298784
2667,The Misfits,0.193291
2230,Boss Level,0.177953
2505,The Adventures of Sharkboy and Lavagirl,0.168564
