## **MOVIE RECOMMENDATION SYSTEM** 

### *USING K-NEAREST NEIGHBOURS*

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import difflib
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD
import warnings
! pip install fuzzywuzzy



In [2]:
#) Reading dataset
moviesList1 = pd.read_csv("movies.csv") 
# moviesList1.head() 


In [3]:
#)Reading ratings file and using only three columns
ratingOfMovie = pd.read_csv("ratings.csv", usecols = ['userId', 'movieId','rating']) 
rating1 = ratingOfMovie.head(2000000)
# rating1.head()

In [4]:
pivoted = rating1.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0) #changing the orientation of the dataframe for easy processing)
# pivoted.head() 

In [5]:
#) Importing csr matrix
from scipy.sparse import csr_matrix 

In [6]:
#) Converting dataframe into sparse matrix
sparseMovies = csr_matrix(pivoted.values) 
# sparseMovies

In [7]:
from sklearn.neighbors import NearestNeighbors #importing KNN

In [8]:
#) Brute forcing the data into the model and using cosine algorithm for finding the nearest neighbors.
knnModel = NearestNeighbors(metric = 'cosine' , algorithm = 'brute', n_neighbors = 30) 
knnModel.fit(sparseMovies)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=30)

In [9]:
#) Makes the working over the model fast
from fuzzywuzzy import process 



In [10]:
#creating a recommender system function which will recommend the movies to the user using index which are nearer to the selected movie indexes.
def recommend(movieName,dataMovies,points):
    index = process.extractOne(movieName,moviesList1['title'])[2]
    print('Movie you have selected:', moviesList1['title'][index], 'Index:', index)
    clusterDistance,indice = knnModel.kneighbors(dataMovies[index],n_neighbors=points)
    print("THE MOVIES WE RECOMMEND FOR YOU IS:")
    for i in indice:
        print(moviesList1['title'][i].where(i!=index)) 

In [14]:
%timeit
recommend('man of the year',sparseMovies,20)

Movie you have selected: Man of the Year (1995) Index: 135
THE MOVIES WE RECOMMEND FOR YOU IS:
135                                                   NaN
771                                 Cable Guy, The (1996)
89                     Journey of August King, The (1995)
818                               High School High (1996)
845           Maybe, Maybe Not (Bewegte Mann, Der) (1994)
1081                         Rebel Without a Cause (1955)
797                                         Kazaam (1996)
831                                          Flirt (1995)
1322                              Albino Alligator (1996)
857          Shadow of Angels (Schatten der Engel) (1976)
1043                                         Johns (1996)
770                   Hunchback of Notre Dame, The (1996)
622                                           Rude (1995)
832                               Big Squeeze, The (1996)
794     Rendezvous in Paris (Rendez-vous de Paris, Les...
792                                

[1;31mDocstring:[0m
Time execution of a Python statement or expression

Usage, in line mode:
  %timeit [-n<N> -r<R> [-t|-c] -q -p<P> -o] statement
or in cell mode:
  %%timeit [-n<N> -r<R> [-t|-c] -q -p<P> -o] setup_code
  code
  code...

Time execution of a Python statement or expression using the timeit
module.  This function can be used both as a line and cell magic:

- In line mode you can time a single-line statement (though multiple
  ones can be chained with using semicolons).

- In cell mode, the statement in the first line is used as setup code
  (executed but not timed) and the body of the cell is timed.  The cell
  body has access to any variables created in the setup code.

Options:
-n<N>: execute the given statement <N> times in a loop. If <N> is not
provided, <N> is determined so as to get sufficient accuracy.

-r<R>: number of repeats <R>, each consisting of <N> loops, and take the
best result.
Default: 7

-t: use time.time to measure the time, which is the default on U

In [None]:
### *USING COSINE SIMILARITY*

In [None]:
#) Importing datas
movie1=pd.read_csv("movies.csv")
movie=movie1
# print(movie.head(5))

In [None]:
#) Relevent columns for predictions
relevant_columns=['title','genres','movieId']
# print(relevant_columns)

In [None]:
for index1 in relevant_columns:
    movie[index1]=movie[index1].fillna('')

In [None]:
#) Combine columns
combine_columns=movie['title']+' '+movie['genres']
# print(combine_columns)

In [None]:
#) Converting text to feature vector
vector = TfidfVectorizer()
feature_vector=vector.fit_transform(combine_columns)

In [None]:
#) Similarity score using cosine similarity
similarity_score=cosine_similarity(feature_vector)
# print(similarity_score)

In [None]:
#) Input user 
ask_movie=input('Enter your favorite movie')

In [None]:
#) list of all movies
list_all_movies=movie['title'].tolist()

In [None]:
#) Match movies
matched_movie=difflib.get_close_matches(ask_movie,list_all_movies)
print(matched_movie)

In [None]:
precise_match=matched_movie[0]
print(precise_match)

In [None]:
#) list of similar movies
movieid = movie.index[movie.title == precise_match].values[0]
print(movieid)

In [None]:
similar_movie=list(enumerate(similarity_score[movieid]))

In [None]:
#) Sort movies according to similarity score
sort_movie=sorted(similar_movie,key=lambda x:x[1],reverse=True)


In [None]:
#) Final output
print('Suggested Movies \n')
k=1

for m in sort_movie:
    index = m[0]
    titleindex=movie[movie.index == index]['title'].values[0]
    if(k<10):
        print(k,titleindex)
        k+=1

In [None]:
### *USING MATRIX FACTORIZATION*

In [None]:
#) Importing datas
movie=pd.read_csv('movies.csv')
# print(movie.head(10))
movie_rating1=pd.read_csv('ratings.csv')
movie_rating=movie_rating1.head(1000000)

In [None]:
#) Combining the dataset on the basis of movieId
combine=pd.merge(movie,movie_rating,on='movieId')

In [None]:
#) Removing timestamp and genres column
column=['timestamp','genres']
combine=combine.drop(column,axis=1)
# print(combine.head(10))

In [None]:
combine=combine.dropna(axis=0,subset=['title'])
rate_movie = (combine.
              groupby(by = ['title'])['rating'].
              count().
              reset_index().
              rename(columns = {'rating' : 'totalRating'})
              [['title' , 'totalRating']]
             )
# rate_movie.head(10)

In [None]:
total_rating=combine.merge(rate_movie,right_on='title',left_on='title',how='left')
# print(total_rating.head(10))

In [None]:
u_rate=total_rating.drop_duplicates(['userId','title'])
# print(u_rate.head(10))

In [None]:
#) Matrix Factorization

In [None]:
#) Create matrix and fill the matrix with value 0
movie_pivot=u_rate.pivot(index='userId',columns='title',values='rating').fillna(0)
# print(movie_pivot.head(10))

In [None]:
kp=movie_pivot.values.T

In [None]:
svd=TruncatedSVD(n_components=10,random_state=18)
m=svd.fit_transform(kp)

In [None]:

warnings.filterwarnings("ignore",category=RuntimeWarning)
correct=np.corrcoef(m)

In [None]:
input_movie=movie_pivot.columns
list_movie=list(input_movie)
user_movie=list_movie.index("Toy Story (1995)")

In [None]:
#) Final output
predicted_movie=correct[user_movie]
list(input_movie[(predicted_movie >= 0.92)])