In [134]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [135]:
df = pd.read_csv('./datasets/TMDB_movie_dataset.csv')


In [136]:
df = df.iloc[:3000]

In [137]:
df.shape

(3000, 10)

In [138]:
df.head(1)

Unnamed: 0,id,title,backdrop_path,homepage,overview,popularity,poster_path,genres,production_companies,keywords
0,565770,Blue Beetle,/1syW9SNna38rSl9fnXwc9fP7POW.jpg,https://www.dc.com/bluebeetle,Recent college grad Jaime Reyes returns home f...,2994.357,/mXLOHHc1Zeuwsl4xYKjKh2280oL.jpg,"Action, Science Fiction, Adventure","Warner Bros. Pictures, The Safran Company, DC ...","armor, superhero, family relationships, family..."


In [139]:
df.isnull().sum()

id                         0
title                      0
backdrop_path            101
homepage                1462
overview                  36
popularity                 0
poster_path                6
genres                    67
production_companies     181
keywords                 297
dtype: int64

In [140]:
df.drop(columns=['backdrop_path', 'homepage', 'poster_path'], inplace=True)

In [141]:
df.drop(columns=['popularity'], inplace=True)

In [142]:
#jst for test
# df = df[df['keywords'].notna()]
# df = df[df['production_companies'].notna()]
# df = df[df['genres'].notna()]
# df = df[df['poster_path'].notna()]
# df = df[df['overview'].notna()]

# both works same
df.dropna(inplace=True)


In [143]:
df.shape

(2554, 6)

In [144]:
df.head(1)

Unnamed: 0,id,title,overview,genres,production_companies,keywords
0,565770,Blue Beetle,Recent college grad Jaime Reyes returns home f...,"Action, Science Fiction, Adventure","Warner Bros. Pictures, The Safran Company, DC ...","armor, superhero, family relationships, family..."


In [145]:
df['tags'] = df['genres'] + " " + df['overview']

In [146]:
new = df.drop(columns=['genres','overview','production_companies','keywords'])

In [147]:
new.head(3)

Unnamed: 0,id,title,tags
0,565770,Blue Beetle,"Action, Science Fiction, Adventure Recent coll..."
1,980489,Gran Turismo,"Action, Drama, Adventure The ultimate wish-ful..."
2,754720,A Female Boss with Big Tits and Her Cherry Boy...,Drama Yuzuru is this clumsy permavirgin employ...


In [148]:
# new['tags'] = new['tags'].apply(lambda x: x.lower())
import re
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

new['tags'] = new['tags'].apply(preprocess)

In [149]:
new.head(3)

Unnamed: 0,id,title,tags
0,565770,Blue Beetle,action science fiction adventure recent colleg...
1,980489,Gran Turismo,action drama adventure the ultimate wish fulfi...
2,754720,A Female Boss with Big Tits and Her Cherry Boy...,drama yuzuru is this clumsy permavirgin employ...


In [150]:
new['tags'][0]

'action science fiction adventure recent college grad jaime reyes returns home full of aspirations for his future only to find that home is not quite as he left it as he searches to find his purpose in the world fate intervenes when jaime unexpectedly finds himself in possession of an ancient relic of alien biotechnology the scarab'

In [151]:
ps = PorterStemmer()

#this function remove same words like lover loving loved and convert to love

def stem(text):
    return ' '.join([ps.stem(word) for word in text.split()])

new['tags'] = new['tags'].apply(stem)

In [152]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vactor = tfidf.fit_transform(new['tags']).toarray()

In [153]:
vactor

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.1893775, 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [154]:
vactor.shape

(2554, 5000)

In [155]:
# cv.get_feature_names_out()

In [156]:
similer = cosine_similarity(vactor)

In [157]:
def recommend(movie):
    try:
        idx = new[new['title'] == movie].index[0]
        sim_scores = list(enumerate(similer[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Get top 10 most similar movies (excluding itself)
        sim_scores = sim_scores[1:11]
        
        # Get movie indices and similarity scores
        movie_indices = [i[0] for i in sim_scores]
        scores = [i[1] for i in sim_scores]
        
        # Return recommendations with scores for debugging
        recommendations = new.iloc[movie_indices][['title']]
        recommendations['similarity_score'] = scores
        return recommendations
    except IndexError:
        return "Movie not found in database"

In [159]:
recommend('The Dark Knight')

Unnamed: 0,title,similarity_score
2467,Misery,0.24826
2777,"Paul, Apostle of Christ",0.228199
2588,Abominable,0.15018
751,The Good Dinosaur,0.146282
525,Pain & Gain,0.140857
702,The Ledge,0.133216
1569,Friday the 13th Part III,0.131715
497,Dune,0.126837
2824,Jaws 3-D,0.125246
2172,The Amityville Horror,0.12443
