In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the movie catalogue from CSV and preview the first rows
# (the preview below shows the movieId, title and genres columns).
movies_data= pd.read_csv('movies.csv')
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


The titles contain parentheses, e.g. `(1995)`, which make searching harder, so we use a regex to strip the parentheses (and any other special characters) from each title.

In [6]:
import re

In [8]:
def title_cleaning(text):
    """Return ``text`` with everything except ASCII letters, digits and spaces removed.

    Removed characters are deleted outright (not replaced by a space), so
    "Toy Story (1995)" becomes "Toy Story 1995".
    """
    return re.sub("[^A-Za-z0-9 ]", "", text)

The function above cleans a single title; let us add a new column that holds the cleaned version of every title.

In [11]:
# Store the cleaned form of every title in a new column next to the original
# (this adds the column to movies_data in place).
movies_data['cleaned_title']= movies_data['title'].apply(title_cleaning)

In [13]:
movies_data.head()

Unnamed: 0,movieId,title,genres,cleaned_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


Let us convert the titles into numeric vectors using TF-IDF.

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# TF-IDF over single words and two-word phrases (ngram_range=(1,2)).
Tf_id= TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and encode each title as one row.
title_matrix=Tf_id.fit_transform(movies_data['cleaned_title'])

In [20]:
title_matrix

<62423x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 446566 stored elements in Compressed Sparse Row format>

Now, to find the movies most similar to a search query, we need a similarity score between the query and every title. We can use cosine similarity for that.

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
def search_engine(title, top_n=5):
    """Return the rows of ``movies_data`` whose titles best match ``title``.

    The query is cleaned with ``title_cleaning``, encoded with the fitted
    ``Tf_id`` vectorizer and compared against every row of ``title_matrix``
    by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to search for.
    top_n : int, default 5
        Number of rows to return, ordered from most to least similar.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies_data``, best match first.

    Notes
    -----
    Rows are returned even when every similarity is zero (for example when
    the query shares no term with any title). In that case the rows are not
    meaningful matches, so callers that need a "not found" signal must check
    the query separately.
    """
    query = title_cleaning(title)
    query_vector = Tf_id.transform([query])
    similarity_score = cosine_similarity(query_vector, title_matrix).flatten()
    # argsort is ascending: reverse it so the best match comes first, then
    # keep the leading top_n (slicing after the reversal also makes top_n=0
    # yield an empty result instead of every row).
    best_indices = np.argsort(similarity_score)[::-1][:top_n]
    return movies_data.iloc[best_indices]
    
    
    
    

In [42]:
len(search_engine("Toy Story (1995)"))

5

Create an interactive search engine

In [34]:
import ipywidgets as widgets
from IPython.display import display




In [60]:
# Text box for the search query, pre-filled with "Avengers".
input_movie_name= widgets.Text(value="Avengers", description="Search for the movie",disabled=False)

# Output area that the on_input callback renders its results into.
movies_output= widgets.Output()

In [62]:
def on_input(text):
    """Widget callback: show the best title matches for the text just entered.

    Parameters
    ----------
    text : dict
        Change notification from the text widget; ``text['new']`` is the
        value that was just entered.

    The results (or a "not found" message) are rendered into
    ``movies_output``, replacing whatever it showed before.
    """
    query = text['new']
    with movies_output:
        movies_output.clear_output()
        # search_engine always returns its top-scoring rows, even when every
        # score is zero, so the number of rows cannot signal "not found".
        # A query with no term from the fitted vocabulary encodes to an
        # all-zero vector, which is the case where nothing can match.
        if Tf_id.transform([title_cleaning(query)]).nnz == 0:
            display("Movie name not found")
        else:
            display(search_engine(query))

# Call on_input every time the value of the text box changes.
input_movie_name.observe(on_input, names="value")

# Render the text box together with the results area.
display(input_movie_name,movies_output)
            
          
        

Text(value='Avengers', description='Search for the movie')

Output()

In [64]:
# Load the user ratings (userId, movieId, rating, timestamp); the shape cell
# further down reports about 25 million rows, so this read takes a while.
ratings_data=pd.read_csv('ratings.csv')

In [74]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [72]:
ratings_data.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [66]:
movies_data.shape

(62423, 4)

In [68]:
ratings_data.shape

(25000095, 4)

In [187]:
# Example id for experimenting: movieId 1 is "Toy Story (1995)" in the preview
# at the top. The functions below take movie_id as a parameter, so they do not
# read this global.
movie_id=1

Our goal has two parts. First, find the users who liked the given movie, and then the other movies that more than 10% of those users also liked. Second, measure how much those same movies are liked by all users, so that the two can be compared.

In [335]:
def movie_recommendation(movie_id, min_rating=4, min_share=0.10):
    """Recommend movies that fans of ``movie_id`` like more than users in general.

    A rating counts as a "like" when it is at least ``min_rating``. The same
    threshold is applied to the similar users and to the overall population,
    so the two shares that form ``rec_score`` are measured the same way.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        Minimum rating that counts as a like.
    min_share : float, default 0.10
        A candidate is kept only if more than this fraction of the similar
        users liked it.

    Returns
    -------
    pandas.DataFrame
        Columns ``rec_score``, ``title`` and ``genres``, sorted with the
        highest ``rec_score`` first. Empty when nobody liked ``movie_id``.
    """
    liked = ratings_data['rating'] >= min_rating

    # Users who liked the given movie.
    similar_users = ratings_data[(ratings_data['movieId'] == movie_id) & liked]['userId'].unique()

    # Share of those users who liked each other movie.
    similar_user_records = ratings_data[ratings_data['userId'].isin(similar_users) & liked]['movieId']
    similar_user_records = similar_user_records.value_counts() / len(similar_users)
    # Keep only the movies liked by more than min_share of the similar users.
    similar_user_records = similar_user_records[similar_user_records > min_share]

    # Share of all users (those who liked at least one candidate) who liked
    # each candidate movie.
    all_user_recs = ratings_data[ratings_data['movieId'].isin(similar_user_records.index) & liked]
    all_user = all_user_recs['movieId'].value_counts() / len(all_user_recs['userId'].unique())

    combined_data = pd.concat([similar_user_records, all_user], axis=1)
    combined_data.columns = ['similar', 'all']
    # A score above 1 means the movie is liked more by the similar users than
    # by users in general.
    combined_data['rec_score'] = combined_data['similar'] / combined_data['all']

    recommendations = combined_data.merge(movies_data, left_index=True, right_on='movieId')
    return recommendations[["rec_score", "title", "genres"]].sort_values("rec_score", ascending=False)




    

I created this fallback function so that, even if the given movie is rated below 2 by every user, I still get some recommendations.

In [337]:
def movie_recommendation_2(movie_id):
    """Fallback recommender that accepts any rating of 1 or more as a signal.

    Works like ``movie_recommendation`` but treats every user who rated
    ``movie_id`` at least 1 as a similar user, and counts every movie those
    users rated at least 1, so that movies nobody rated highly still produce
    recommendations.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.

    Returns
    -------
    pandas.DataFrame
        Columns ``rec_score``, ``title`` and ``genres``, sorted with the
        highest ``rec_score`` first.
    """
    # Users who rated the given movie 1 or higher.
    similar_users = ratings_data[(ratings_data['movieId'] == movie_id) & (ratings_data['rating'] >= 1)]['userId'].unique()

    # Share of those users who rated each other movie 1 or higher.
    similar_user_records = ratings_data[(ratings_data['userId'].isin(similar_users)) & (ratings_data['rating'] >= 1)]['movieId']
    similar_user_records = similar_user_records.value_counts() / len(similar_users)
    # Keep only the movies rated by more than 10% of the similar users.
    similar_user_records = similar_user_records[similar_user_records > 0.10]

    # NOTE(review): the overall share still counts only ratings above 4, while
    # the similar-user share counts ratings of 1 or more, so the two sides of
    # rec_score are measured differently. Confirm this is intended.
    all_user_recs = ratings_data[(ratings_data['movieId'].isin(similar_user_records.index)) & (ratings_data['rating'] > 4)]
    all_user = all_user_recs['movieId'].value_counts() / len(all_user_recs['userId'].unique())

    combined_data = pd.concat([similar_user_records, all_user], axis=1)
    combined_data.columns = ['similar', 'all']
    combined_data['rec_score'] = combined_data['similar'] / combined_data['all']

    recommendations = combined_data.merge(movies_data, left_index=True, right_on='movieId')
    return recommendations[["rec_score", "title", "genres"]].sort_values("rec_score", ascending=False)

In [339]:
import ipywidgets as widgets
from IPython.display import display 

In [343]:
# Text box for the movie title, pre-filled with "Avengers".
movie_input_name=widgets.Text(value="Avengers",description="Please Enter the Movie Title",disabled=False)
# Output area that the search callback renders the recommendations into.
output_list=widgets.Output()

def search(movie_name):
    """Widget callback: show recommendations for the best match of the typed title.

    Parameters
    ----------
    movie_name : dict
        Change notification from the text widget; ``movie_name['new']`` is
        the value that was just entered.

    The best title match from ``search_engine`` supplies the ``movieId``.
    ``movie_recommendation`` is tried first; if it yields at most one row,
    ``movie_recommendation_2`` is used instead. The result is rendered into
    ``output_list``, replacing whatever it showed before.
    """
    with output_list:
        output_list.clear_output()
        query = movie_name['new']
        best_match_id = search_engine(query).iloc[0]['movieId']
        # Compute the recommendations once and reuse them: each call scans
        # the whole ratings table.
        recommendations = movie_recommendation(best_match_id)
        if len(recommendations) <= 1:
            recommendations = movie_recommendation_2(best_match_id)
        display(recommendations)


# Call search every time the value of the text box changes.
movie_input_name.observe(search,names="value")
# Render the text box together with the recommendations area.
display(movie_input_name,output_list)
        
        
        



Text(value='Avengers', description='Please Enter the Movie Title')

Output()