## Importing important libraries

In [1]:
import sklearn as sk
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## Loading dataset into a Pandas DataFrame

In [2]:
data = pd.read_csv("Dataset/movies.csv")

# checking if data is loaded or not in variable(data)
data.head(2)

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
0,677179,Creed III,Drama-Action,en,After dominating the boxing world Adonis Creed...,9575.225,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,75000000.0,258000000.0,116.0,Released,You can't run from your past.,7.305,845.0,Michael B. Jordan-Tessa Thompson-Jonathan Majo...,philadelphia pennsylvania-husband wife relatio...,/vJU3rXSP9hwUuLeq8IpfsJShLOk.jpg,/5i6SjyDbDWqyun8klUuCxrlFbyw.jpg,965839-267805-943822-842942-1035806-823999-107...
1,76600,Avatar: The Way of Water,Science Fiction-Adventure-Action,en,Set more than a decade after the events of the...,9366.788,20th Century Studios-Lightstorm Entertainment,2022-12-14,350000000.0,2312336000.0,192.0,Released,Return to Pandora.,7.751,6748.0,Sam Worthington-Zoe Saldaña-Sigourney Weaver-S...,loss of loved one-dying and death-alien life-f...,/t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,/s16H6tpK2utvwDtzZ8Qy4qm5Emw.jpg,183392-111332-702432-505642-1064215-436270-874...


## Analyzing Data

In [3]:
#check the shape of data
print(data.shape)

(722946, 20)


__Observation__ \
We have about __729K movies__ in the dataset and about __20 features__ of each movies.

In [4]:
# check columns of data and its data-types present in dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722946 entries, 0 to 722945
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    722946 non-null  int64  
 1   title                 722942 non-null  object 
 2   genres                511964 non-null  object 
 3   original_language     722946 non-null  object 
 4   overview              604287 non-null  object 
 5   popularity            722946 non-null  float64
 6   production_companies  337168 non-null  object 
 7   release_date          670269 non-null  object 
 8   budget                722946 non-null  float64
 9   revenue               722946 non-null  float64
 10  runtime               688455 non-null  float64
 11  status                722946 non-null  object 
 12  tagline               108059 non-null  object 
 13  vote_average          722946 non-null  float64
 14  vote_count            722946 non-null  float64
 15  

In [5]:
# check null value in dataset
data.isnull().sum()

id                           0
title                        4
genres                  210982
original_language            0
overview                118659
popularity                   0
production_companies    385778
release_date             52677
budget                       0
revenue                      0
runtime                  34491
status                       0
tagline                 614887
vote_average                 0
vote_count                   0
credits                 225164
keywords                512716
poster_path             185399
backdrop_path           500534
recommendations         688256
dtype: int64

In [6]:
# check for duplicate values
data.duplicated().sum()

1

## Cleaning Data

In [7]:
# dropping unnecesary title 
df = data.drop(["production_companies", "popularity", "budget", "revenue", "status", "recommendations", "runtime", "vote_average", "backdrop_path", "tagline"], axis=1)

In [8]:
# droping duplicate values
df.drop_duplicates(inplace=True)

In [9]:
# checking duplicates in title
df.title.duplicated().sum()

86981

In [10]:
# check if duplicates titles have same release date 
df[["title", "release_date"]].duplicated().sum()

2267

In [11]:
# get rid of duplicates with same release datae
df.drop_duplicates(subset=["title","release_date"], inplace=True)

In [12]:
# get rid of vote_count lower than 350 and reseting index
df = df[df.vote_count >= 150].reset_index()

In [13]:
df.isnull().sum()

index                   0
id                      0
title                   0
genres                  3
original_language       0
overview               15
release_date            0
vote_count              0
credits                24
keywords             1001
poster_path             0
dtype: int64

In [14]:
# replacing all the null value from genres adn overview with "nothing"
df.fillna("", inplace = True)

In [15]:
# delete movies with no genres and overview
index = df[(df.genres == "") & (df.overview == "")].index
df.drop(index, inplace=True)

In [16]:
# replacing genres, credits and keywords - with " "
df.genres = df.genres.apply(lambda x: " ".join(x.split("-")))
df.keywords = df.keywords.apply(lambda x: " ".join(x.split("-")))
df.credits = df.credits.apply(lambda x: " ".join(x.replace(" ", "").split("-")[:5]))

To predict similar movies using natural language processing techniques, we will utilize text-based data as input for our machine learning model. To facilitate this process, we will create a new column called "Tags" that encompasses all the crucial text features such as __Overview, Genres, Keywords, and Original Language__. This will enable us to make accurate predictions of similar movies.

In [17]:
# making tags for prediction
df["tags"] =df.overview + " "+ df.genres + " "  +df.credits + " " +df.keywords + " " + df.original_language

In [18]:
# making new framework with important features
new_df = df[["id", "title", "tags", 'poster_path']]

In [19]:
# making all the content of tags in lower case letter for better processing
new_df.tags = new_df.tags.apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.tags = new_df.tags.apply(lambda x:x.lower())


In [20]:
new_df.tags[0]

'after dominating the boxing world adonis creed has been thriving in both his career and family life. when a childhood friend and former boxing prodigy damien anderson resurfaces after serving a long sentence in prison he is eager to prove that he deserves his shot in the ring. the face-off between former friends is more than just a fight. to settle the score adonis must put his future on the line to battle damien — a fighter who has nothing to lose. drama action michaelb.jordan tessathompson jonathanmajors woodharris phyliciarashād philadelphia pennsylvania husband wife relationship deaf sports sequel orphan former best friend ex con childhood friends juvenile detention center boxing prodigy en'

In [21]:
new_df.head(1)

Unnamed: 0,id,title,tags,poster_path
0,677179,Creed III,after dominating the boxing world adonis creed...,/vJU3rXSP9hwUuLeq8IpfsJShLOk.jpg


## Vectorizer - Stemming

In [22]:
#import nltk library and porter module
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [23]:
# stem function to take text and give output
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)    

In [24]:
# applying stem function in tags
new_df["tags"] = new_df["tags"].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(stem)


## Text Vectorization

In [25]:
# importing countvectorizer
from sklearn.feature_extraction.text import CountVectorizer
# setting for 5000 most repeated words, and exclude stop words
cv = CountVectorizer(stop_words="english",max_features=5000)

In [26]:
# fiting tags in count vector
vectors = cv.fit_transform(new_df["tags"]).toarray() #change it into array to use

In [27]:
cv.get_feature_names()[80:85]



['acquaint', 'act', 'action', 'activ', 'activist']

## Model Building

In [28]:
# Similarity vector with cosine 
from sklearn.metrics.pairwise import cosine_similarity
# calculating similarity of each movie with all movies
similarity = cosine_similarity(vectors)

In [29]:
# similarity of each movie with all the movies
similarity.shape

(13034, 13034)

## Testing Model

In [30]:
# making function to find movie and give similar movies as return
def recommend(movies):
    movie_index = new_df[new_df.title == movies].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key= lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

    

In [31]:
# checking similar movies test 1
recommend("Batman")

Batman: Mask of the Phantasm
The Dark Knight
Batman & Robin
Batman Returns
Dick Tracy


In [32]:
# test 2
recommend("Black Adam")

Zack Snyder's Justice League
Justice League
X-Men: Apocalypse
Superman vs. The Elite
Krrish


__Observation__ \
the function is returning similar 5 movies which means our Model building is Sucessfull

## Deployment

In [33]:
# import Pickle
import pickle 

In [34]:
# making portable movie pickle file to transport 
pickle.dump(new_df, open("movies.pkl", "wb"))

In [35]:
# making portable similarity file to transport
pickle.dump(similarity, open("similarity.pkl", "wb"))

## Deployed on Streamlit using the below code

In [36]:
'''
# first import streamlit and pickle
import streamlit as st
import pickle

# extract the new_df dataframe from movies.pkl
movies_list = pickle.load(open("movies.pkl", "rb"))
# extract the titles of movies
movies_list_title = movies_list["title"].values

# extract the similarity which contain our cosine similarity values
similarity = pickle.load(open("similarity.pkl", "rb"))


# make a recommend function which will take movie title and return 5 similar movies with their posters
def recommend(movie):
    movie_index = movies_list[movies_list["title"] == movie].index[0]
    distances = similarity[movie_index]
    sorted_movie_list = sorted(list(enumerate(distances)), reverse=True,
                               key=lambda x: x[1])[1:6]

    recommended_movies = []
    recommended_posters = []
    for i in sorted_movie_list:
        poster_path = movies_list["poster_path"][i[0]]
        recommended_movies.append(movies_list.iloc[i[0]].title)
        recommended_posters.append(
            "https://image.tmdb.org/t/p/original"+poster_path)

    return recommended_movies,  recommended_posters

# Create title for your stream lit page
st.set_page_config(page_title="Movie Recommendation System", page_icon=":clapper:")
st.title("Movie Recommendation System")


def add_bg_from_url():
    st.markdown(
        f"""
         <style>
         .stApp {{
             background-image: url("https://images.unsplash.com/photo-1489599849927-2ee91cede3ba?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2070&q=80");
             background-attachment: fixed;
             background-size: cover
         }}

        [data-testid="stHeader"] {{
        background: rgba(0,0,0,0);
        }}
         </style>
         """,
        unsafe_allow_html=True
    )


add_bg_from_url()

# Create a input box for movies name
selected_movie_name = st.selectbox(
    "What is the movie name?",
    movies_list_title
)

# create a recommend button with function of displaying recommended movies and movie posters
if st.button("Recommend"):
    recommendation, movie_posters = recommend(selected_movie_name)

    cols = st.columns(5)
    for i in range(5):
        with cols[i]:
            st.write(recommendation[i])
            st.image(movie_posters[i])

'''

'\n# first import streamlit and pickle\nimport streamlit as st\nimport pickle\n\n# extract the new_df dataframe from movies.pkl\nmovies_list = pickle.load(open("movies.pkl", "rb"))\n# extract the titles of movies\nmovies_list_title = movies_list["title"].values\n\n# extract the similarity which contain our cosine similarity values\nsimilarity = pickle.load(open("similarity.pkl", "rb"))\n\n\n# make a recommend function which will take movie title and return 5 similar movies with their posters\ndef recommend(movie):\n    movie_index = movies_list[movies_list["title"] == movie].index[0]\n    distances = similarity[movie_index]\n    sorted_movie_list = sorted(list(enumerate(distances)), reverse=True,\n                               key=lambda x: x[1])[1:6]\n\n    recommended_movies = []\n    recommended_posters = []\n    for i in sorted_movie_list:\n        poster_path = movies_list["poster_path"][i[0]]\n        recommended_movies.append(movies_list.iloc[i[0]].title)\n        recommended_p