In [3]:
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the movie dataset from a machine-specific absolute path and display it as the cell output.
# NOTE(review): get_data() reads this same file again, and `ff` is not referenced by any later
# visible cell — confirm whether this preview cell is still needed.
ff = pd.read_csv(r'C:\Users\Laptop\PycharmProjects\DataScienceProject2\data sets\final_data.csv')
ff

Unnamed: 0,cast,genres,id,original_title,plot
0,"['Sam Worthington', 'Zoe Saldana', 'Sigourney ...","['Action', 'Adventure', 'Fantasy']",19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...","['Adventure', 'Fantasy', 'Action']",285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,"['Daniel Craig', 'Christoph Waltz', 'Léa Seydo...","['Action', 'Adventure', 'Crime']",206647,Spectre,A cryptic message from Bond’s past sends him o...
3,"['Christian Bale', 'Michael Caine', 'Gary Oldm...","['Action', 'Crime', 'Drama']",49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,"['Taylor Kitsch', 'Lynn Collins', 'Samantha Mo...","['Action', 'Adventure', 'Science Fiction']",49529,John Carter,"John Carter is a war-weary, former military ca..."
...,...,...,...,...,...
6472,"['Hugh Jackman', 'Patrick Stewart', 'Dafne Keen']","['Action', 'Drama', 'Sci-Fi']",,Logan,"In a future where mutants are nearly extinct, ..."
6473,"['Benedict Cumberbatch', 'Chiwetel Ejiofor', '...","['Action', 'Adventure', 'Fantasy']",,Doctor Strange,While on a journey of physical and spiritual h...
6474,"['Manoj Bajpayee', 'Tigmanshu Dhulia', 'Richa ...","['Action', 'Thriller', 'Crime']",,Gangs of Wasseypur - Part 1,Shahid Khan is exiled after impersonating the ...
6475,"['Nawazuddin Siddiqui', 'Tigmanshu Dhulia', 'H...","['Action', 'Thriller', 'Crime']",,Gangs of Wasseypur - Part 2,Sardar Khan’s sons are at war with Ramadhir Si...


In [4]:
'''
(I) get_data()
get_data() fetches the data about the movies and
returns the dataset with its attributes for further preprocessing.
''' 
def get_data(path=r'C:\Users\Laptop\PycharmProjects\DataScienceProject2\data sets\final_data.csv'):
    """Load the movie dataset from a CSV file and lowercase its titles.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the original machine-specific
        absolute path so existing zero-argument calls keep working; pass a
        different path to run the notebook on another machine.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the 'original_title' column converted to
        lowercase.

    Raises
    ------
    KeyError
        If the file has no 'original_title' column.
    """
    movie_data = pd.read_csv(path)
    # Titles are lowercased once here so that lookups can be case-insensitive
    # by lowercasing the query as well.
    movie_data['original_title'] = movie_data['original_title'].str.lower()
    return movie_data


In [5]:

def combine_data(data):
    """Build the combined cast+genres text column used for feature extraction.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'id', 'original_title', 'plot', 'cast' and
        'genres'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'id', 'original_title', 'plot', 'cast' and
        'genres', plus a 'combine' column holding the row's non-null cast and
        genres values joined with ','. Any other columns are kept as-is.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Select the two source columns by name. Selecting by position
    # (columns[0:2]) silently picks up whatever column happens to come first,
    # e.g. a leading 'Unnamed: 0' index column written by to_csv, which pushes
    # 'genres' out of the combined text.
    combined = data[['cast', 'genres']].apply(
        lambda row: ','.join(row.dropna().astype(str)), axis=1)

    # drop() returns a new frame, so adding 'combine' does not touch `data`.
    data_recommend = data.drop(
        columns=['id', 'original_title', 'plot', 'cast', 'genres'])
    data_recommend['combine'] = combined
    return data_recommend

In [6]:
'''
(III) transform_data()
transform_data() takes the value returned by combine_data()
and the plot column from get_data() and 
applies CountVectorizer and TfidfVectorizer respectively and calculates the Cosine values.
'''
def transform_data(data_combine, data_plot):
    """Compute the pairwise cosine-similarity matrix over all rows.

    Parameters
    ----------
    data_combine : frame-like with a 'combine' column (output of combine_data()).
    data_plot : frame-like with a 'plot' column (output of get_data()).

    Returns
    -------
    The result of cosine_similarity() applied to the stacked feature matrix
    against itself, i.e. one row and one column per input row.
    """
    # Term counts for the combined text, with English stop words removed.
    combine_counts = CountVectorizer(stop_words='english').fit_transform(
        data_combine['combine'])

    # TF-IDF weights for the plot text, with English stop words removed.
    plot_tfidf = TfidfVectorizer(stop_words='english').fit_transform(
        data_plot['plot'])

    # Place both sparse blocks side by side so each row carries both feature sets.
    features = sp.hstack([combine_counts, plot_tfidf], format='csr')

    return cosine_similarity(features, features)

In [7]:
'''
title : Name of the movie
data : Return value of get_data()
combine : Return value of combine_data()
transform : Return value of transform_data()
'''
def recommend_movies(title, data, combine, transform, top_n=20):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up; compared for exact equality against
        data['original_title'].
    data : pandas.DataFrame
        Frame with 'id', 'original_title' and 'genres' columns. Row position
        i must correspond to row/column i of ``transform``.
    combine : object
        Unused; kept so existing four-argument calls keep working.
    transform : 2-D indexable of similarity scores
        ``transform[i]`` must yield the scores of row i against every row.
    top_n : int, optional
        Maximum number of recommendations to return (default 20).

    Returns
    -------
    pandas.DataFrame
        Columns 'Id', 'Name', 'Genres', ordered from most to least similar,
        indexed by the original labels of the selected rows.

    Raises
    ------
    KeyError
        If ``title`` is not present in data['original_title'].
    """
    # Resolve the title to a row *position*. Positions are used on purpose:
    # the similarity matrix and the .iloc lookup below are both positional,
    # so index labels must not leak in. When a title occurs more than once,
    # the first occurrence is used so a single score row is always selected.
    titles = data['original_title'].tolist()
    try:
        index = titles.index(title)
    except ValueError:
        raise KeyError(title) from None

    # Exclude the query movie explicitly instead of assuming it sorts first;
    # another row with an equal score could otherwise take its place.
    scored = [
        (position, score)
        for position, score in enumerate(transform[index])
        if position != index
    ]
    # list.sort is stable (also with reverse=True), so ties keep dataset order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in scored[:top_n]]

    selected = data.iloc[movie_indices][['id', 'original_title', 'genres']]
    return selected.rename(
        columns={'id': 'Id', 'original_title': 'Name', 'genres': 'Genres'})

In [8]:
# (V) results()
# results() takes a movie’s title as input and returns the top 20 recommendations.
def results(movie_name):
    """Return recommendations for a movie title.

    Parameters
    ----------
    movie_name : str
        Title typed by the user; matched case-insensitively.

    Returns
    -------
    list of dict or str
        The recommendations as a list of records (one dict per movie, as
        produced by ``DataFrame.to_dict('records')``), or the string
        'Movie not in Database' when the title is not found.
    """
    # get_data() stores titles in lowercase, so lowercase the query to make
    # the lookup independent of how the user capitalised it.
    movie_name = movie_name.lower()

    find_movie = get_data()

    # Check for the title before doing any feature extraction: building the
    # similarity matrix is the expensive step and is pointless for a title
    # that is not in the dataset.
    if not find_movie['original_title'].eq(movie_name).any():
        return 'Movie not in Database'

    combine_result = combine_data(find_movie)
    transform_result = transform_data(combine_result, find_movie)
    recommendations = recommend_movies(
        movie_name, find_movie, combine_result, transform_result)
    return recommendations.to_dict('records')


In [10]:
# Ask the user for a title, then display what results() returns for it: either the list of
# recommendation records or the 'Movie not in Database' message.
name = input('What is the name of movie?\n') 
results(name)

[{'Id': '272',
  'Name': 'batman begins',
  'Genres': "['Action', 'Crime', 'Drama']"},
 {'Id': '155',
  'Name': 'the dark knight',
  'Genres': "['Drama', 'Action', 'Crime']"},
 {'Id': '1124',
  'Name': 'the prestige',
  'Genres': "['Drama', 'Mystery', 'Thriller']"},
 {'Id': '2088',
  'Name': 'romeo is bleeding',
  'Genres': "['Action', 'Crime', 'Drama']"},
 {'Id': '378237',
  'Name': "amidst the devil's wings",
  'Genres': "['Drama', 'Action', 'Crime']"},
 {'Id': '168672', 'Name': 'american hustle', 'Genres': "['Drama', 'Crime']"},
 {'Id': '479', 'Name': 'shaft', 'Genres': "['Action', 'Adventure', 'Crime']"},
 {'Id': '1493',
  'Name': 'miss congeniality',
  'Genres': "['Comedy', 'Crime', 'Action']"},
 {'Id': '97020', 'Name': 'robocop', 'Genres': "['Action', 'Science Fiction']"},
 {'Id': '147441',
  'Name': 'exodus: gods and kings',
  'Genres': "['Adventure', 'Drama', 'Action']"},
 {'Id': '11322',
  'Name': 'public enemies',
  'Genres': "['History', 'Crime', 'Drama']"},
 {'Id': '1359',


In [11]:
import pickle
# Persist the movie table to movie_list.pkl in the current working directory.
# NOTE(review): `final_data` is not defined in any visible cell of this notebook — confirm
# where it comes from before relying on a fresh "Restart & Run All".
# The context manager makes sure the file is flushed and closed once the dump finishes.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(final_data, movie_list_file)


In [16]:
# Rebuild the dataset, the combined text column and the similarity matrix at notebook scope,
# then persist the similarity matrix to similarity.pkl in the current working directory.
find_movie = get_data()
combine_result = combine_data(find_movie)
transform_result = transform_data(combine_result, find_movie)
# The context manager makes sure the file is flushed and closed once the dump finishes.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(transform_result, similarity_file)

In [20]:
# Persist the movie table, converted to a plain dict, to movie_dic.pkl.
# NOTE(review): `final_data` is not defined in any visible cell of this notebook — confirm
# where it comes from.
# The context manager makes sure the file is flushed and closed once the dump finishes.
with open('movie_dic.pkl', 'wb') as movie_dict_file:
    pickle.dump(final_data.to_dict(), movie_dict_file)

In [18]:
# Show the values of the 'title' column of final_data.
# NOTE(review): `final_data` is not defined in any visible cell, and it uses a 'title' column
# whereas the functions above use 'original_title' — confirm it is the intended frame.
final_data['title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [19]:
# Display final_data as a nested dict ({column: {row label: value}}).
# NOTE(review): this prints every row of every column; consider clearing this output or
# previewing a small slice before sharing the notebook.
final_data.to_dict()

{'movie_id': {0: 19995,
  1: 285,
  2: 206647,
  3: 49026,
  4: 49529,
  5: 559,
  6: 38757,
  7: 99861,
  8: 767,
  9: 209112,
  10: 1452,
  11: 10764,
  12: 58,
  13: 57201,
  14: 49521,
  15: 2454,
  16: 24428,
  17: 1865,
  18: 41154,
  19: 122917,
  20: 1930,
  21: 20662,
  22: 57158,
  23: 2268,
  24: 254,
  25: 597,
  26: 271110,
  27: 44833,
  28: 135397,
  29: 37724,
  30: 558,
  31: 68721,
  32: 12155,
  33: 36668,
  34: 62211,
  35: 8373,
  36: 91314,
  37: 68728,
  38: 102382,
  39: 20526,
  40: 49013,
  41: 44912,
  42: 10193,
  43: 534,
  44: 168259,
  45: 72190,
  46: 127585,
  47: 54138,
  48: 81005,
  49: 64682,
  50: 9543,
  51: 68726,
  52: 38356,
  53: 217,
  54: 105864,
  55: 62177,
  56: 188927,
  57: 10681,
  58: 5174,
  59: 14161,
  60: 17979,
  61: 76757,
  62: 258489,
  63: 411,
  64: 246655,
  65: 155,
  66: 14160,
  67: 15512,
  68: 1726,
  69: 44826,
  70: 8487,
  71: 1735,
  72: 297761,
  73: 2698,
  74: 137113,
  75: 9804,
  76: 14869,
  77: 150540,
  78: