In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import difflib
import pickle
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from surprise import Reader, Dataset, SVD 
from surprise.model_selection import cross_validate
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [13]:
# Load the cleaned movie catalogue and preview its first ten rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine with this exact directory layout.
cleaned_csv_path = "C:/Users/AL RHO/Desktop/Old/DS_Practice/Recommendation-Project/data/cleaned_data.csv"
data = pd.read_csv(cleaned_csv_path, sep=',')
data.head(10)

Unnamed: 0,MovieID,Title,Tags
0,4574334,Stranger Things,"\r\nWhen a young boy disappears, his mother, a..."
1,10648342,Thor: Love and Thunder,"\r\nThor enlists the help of Valkyrie, Korg an..."
2,1190634,The Boys,\r\nA group of vigilantes set out to take down...
3,1312171,The Umbrella Academy,"\r\nA family of former child heroes, now grown..."
4,5113044,Minions: The Rise of Gru,\r\nThe untold story of one twelve-year-old's ...
5,475784,Westworld,\r\nAt the intersection of the near future and...
6,9419884,Doctor Strange in the Multiverse of Madness,\r\nDoctor Strange teams up with a mysterious ...
7,8041270,Jurassic World Dominion,\r\nFour years after the destruction of Isla N...
8,12327578,Star Trek: Strange New Worlds,\r\nA prequel to Star Trek: The Original Serie...
9,6710474,Everything Everywhere All at Once,\r\nAn aging Chinese immigrant is swept up in ...


In [14]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3577 entries, 0 to 3576
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3577 non-null   int64 
 1   Title    3577 non-null   object
 2   Tags     3577 non-null   object
dtypes: int64(1), object(2)
memory usage: 84.0+ KB


In [15]:
# Cast MovieID from int64 (see the info() output) to the generic object dtype.
# NOTE(review): presumably so the id is treated as a label rather than a number
# before it is turned into a string below — confirm this cast is still needed.
data['MovieID'] = data['MovieID'].astype('object')

In [16]:
#Function to check for incomplete movie id
def movie_id(movie_ids=None, width=7):
    """Return the zero prefix each movie id needs to reach ``width`` digits.

    The original implementation only handled ids that were 5 or 6 characters
    long (prefixes ``'00'`` and ``'0'``), i.e. it padded to 7 characters but
    silently left anything shorter than 5 unpadded. This version pads every
    id shorter than ``width`` and leaves longer ids alone.

    Parameters
    ----------
    movie_ids : iterable, optional
        Ids to inspect; each one is converted with ``str()``. When omitted,
        the global ``data['MovieID']`` column is used, as before.
    width : int, optional
        Target number of characters (default 7).

    Returns
    -------
    list of str
        One prefix per id, in input order: ``'0' * (width - len(str(id)))``,
        or ``''`` when the id is already ``width`` characters or longer.
    """
    if movie_ids is None:
        movie_ids = data['MovieID']
    # max(..., 0) keeps ids that are already long enough unprefixed.
    return ['0' * max(width - len(str(raw_id)), 0) for raw_id in movie_ids]

In [17]:
# Build the standard imdb id column: zero prefix from movie_id() followed by the raw MovieID digits
zero_prefix = movie_id()
data['ids'] = zero_prefix
data['id'] = data['ids'] + data['MovieID'].astype(str)
# The raw id and the temporary prefix column are not needed any more
data = data.drop(columns=['MovieID', 'ids'])
data.head(15)

Unnamed: 0,Title,Tags,id
0,Stranger Things,"\r\nWhen a young boy disappears, his mother, a...",4574334
1,Thor: Love and Thunder,"\r\nThor enlists the help of Valkyrie, Korg an...",10648342
2,The Boys,\r\nA group of vigilantes set out to take down...,1190634
3,The Umbrella Academy,"\r\nA family of former child heroes, now grown...",1312171
4,Minions: The Rise of Gru,\r\nThe untold story of one twelve-year-old's ...,5113044
5,Westworld,\r\nAt the intersection of the near future and...,475784
6,Doctor Strange in the Multiverse of Madness,\r\nDoctor Strange teams up with a mysterious ...,9419884
7,Jurassic World Dominion,\r\nFour years after the destruction of Isla N...,8041270
8,Star Trek: Strange New Worlds,\r\nA prequel to Star Trek: The Original Serie...,12327578
9,Everything Everywhere All at Once,\r\nAn aging Chinese immigrant is swept up in ...,6710474


In [18]:
# Function for removing NonAscii characters
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

# Function for converting into lower case
def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Function for removing stop words
_ENGLISH_STOP_WORDS = None  # filled on first use with NLTK's English stop-word list


def remove_stop_words(text, stops=None):
    """Return ``text`` with stop words removed and whitespace normalised.

    ``text`` is split on whitespace, every token found in ``stops`` is
    dropped, and the remaining tokens are joined with single spaces. Matching
    is exact: case and attached punctuation are not normalised here.

    Parameters
    ----------
    text : str
        Text to filter.
    stops : collection of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used;
        it is loaded once and cached, instead of being rebuilt on every call
        (the function is applied once per row of the data).

    Returns
    -------
    str
        The surviving tokens joined by a single space.
    """
    global _ENGLISH_STOP_WORDS
    if stops is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stops = _ENGLISH_STOP_WORDS
    return " ".join(word for word in text.split() if word not in stops)

# Function for removing punctuation
_WORD_PATTERN = re.compile(r'\w+')  # compiled once; previously a tokenizer was built on every call


def remove_punctuation(text):
    """Return the ``\\w+`` runs of ``text`` joined by single spaces.

    Every maximal run of word characters (letters, digits, underscore) is
    kept; everything between runs (punctuation, whitespace) collapses to one
    space. This is the same token list that
    ``RegexpTokenizer(r'\\w+').tokenize(text)`` produces, obtained from a
    single precompiled pattern.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        Word tokens separated by single spaces; ``''`` if there are none.
    """
    return " ".join(_WORD_PATTERN.findall(text))

# Function for removing the html tags
def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy and ``.`` does not cross newlines, so a tag that
    spans several lines is left in place.
    """
    return re.sub('<.*?>', '', text)

In [19]:
# Clean the Tags text. The order of the steps matters:
#  - HTML tags are stripped first, while '<' and '>' are still in the text
#    (remove_punctuation deletes them, after which remove_html can never match);
#  - stop words are removed last, once punctuation is gone, so a token such as
#    "the," has become "the" before it is compared with the stop-word list.
data['Tags'] = (
    data['Tags']
    .apply(remove_html)
    .apply(_removeNonAscii)
    .apply(make_lower_case)
    .apply(remove_punctuation)
    .apply(remove_stop_words)
)

In [20]:
#Turning the cleaned Tags text into TF-IDF vectors over unigrams and bigrams
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
vector = vectorizer.fit_transform(data['Tags'])
#Pairwise dot products of the TF-IDF rows, used as the movie-to-movie similarity matrix
similarity = linear_kernel(vector, vector)
print(similarity.shape)

(3577, 3577)


In [22]:
#Creating a list of the movies
movies = data['Title'].tolist()

#Finding close matches for each movie
def get_index(movie=None):
    """Return the index of the catalogue title closest to ``movie``.

    Parameters
    ----------
    movie : str, optional
        Title to look up. When omitted the user is prompted on stdin, as
        before; passing it explicitly lets the lookup run without a prompt.

    Returns
    -------
    The index label (from ``data.index``) of the first row whose ``Title``
    equals the best ``difflib`` match in ``movies``.

    Raises
    ------
    IndexError
        If ``difflib.get_close_matches`` finds no title close enough. The
        type is unchanged from the bare ``matches[0]`` failure this replaces,
        but the message now names the title that was searched for.
    """
    if movie is None:
        movie = input("Enter the name of the movie: ")
    matches = difflib.get_close_matches(movie, movies)
    if not matches:
        raise IndexError("No title close to {!r} was found".format(movie))
    close_match = matches[0]
    # Several rows may share a title; the first one is used.
    movie_index = data[data['Title'] == close_match].index.values[0]
    return movie_index

In [23]:
#Getting similar movies to the user input
def similar_movies(data, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to a chosen movie.

    The movie is chosen through ``get_index()`` (which prompts for a title),
    and candidates are ranked by their row in the global ``similarity``
    matrix, highest score first; ties keep their original row order.

    The chosen movie is excluded by its index. The previous ``[1:6]`` slice
    assumed it always sorted first, which does not hold when an earlier row
    ties with it at the top score or when its own similarity row is all zeros.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Title`` column, positionally aligned with ``similarity``.
    top_n : int, optional
        Number of titles to print (default 5, as before).
    """
    movie_index = get_index()
    scores = similarity[movie_index]
    candidates = (i for i in range(len(scores)) if i != movie_index)
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)[:top_n]
    for i in ranked:
        print(data.iloc[i].Title)

In [24]:
# Prompts for a movie title on stdin, then prints the titles judged most similar to it.
similar_movies(data)

Enter the name of the movie: Spider-Man
The Amazing Spider-Man
Spider-Man: The Animated Series
Spider-Man: Homecoming
Superhero Movie
Ben 10: Ultimate Alien


In [25]:
# Surprise reader built with its default settings; used when the Dataset is created
reader = Reader()
# NOTE(review): absolute, machine-specific path; only resolves on a machine with
# this exact directory layout.
svd_csv_path = "C:/Users/AL RHO/Desktop/Old/DS_Practice/Recommendation-Project/data/svd_data.csv"
movie = pd.read_csv(svd_csv_path, sep=',')
movie.head()

Unnamed: 0,Title,MovieID,Overview,YearOfRelease,Runtime,Genre,Rating,Votes,Certificate,Images,Tags
0,Stranger Things,4574334,"\r\nWhen a young boy disappears, his mother, a...",2016,51,"\r\nDrama, Fantasy, Horror",8.7,1086598,TV-14,https://m.media-amazon.com/images/M/MV5BMDZkYm...,"\r\nWhen a young boy disappears, his mother, a..."
1,Thor: Love and Thunder,10648342,"\r\nThor enlists the help of Valkyrie, Korg an...",2022,118,"\r\nAction, Adventure, Comedy",6.9,86889,PG-13,https://m.media-amazon.com/images/M/MV5BYmMxZW...,"\r\nThor enlists the help of Valkyrie, Korg an..."
2,The Boys,1190634,\r\nA group of vigilantes set out to take down...,2019,60,"\r\nAction, Crime, Drama",8.7,414076,TV-MA,https://m.media-amazon.com/images/S/sash/4Fyxw...,\r\nA group of vigilantes set out to take down...
3,The Umbrella Academy,1312171,"\r\nA family of former child heroes, now grown...",2019,60,"\r\nAction, Adventure, Comedy",8.0,225040,TV-14,https://m.media-amazon.com/images/S/sash/4Fyxw...,"\r\nA family of former child heroes, now grown..."
4,Minions: The Rise of Gru,5113044,\r\nThe untold story of one twelve-year-old's ...,2022,87,"\r\nAnimation, Adventure, Comedy",7.0,17360,PG,https://m.media-amazon.com/images/S/sash/4Fyxw...,\r\nThe untold story of one twelve-year-old's ...


In [26]:
# NOTE(review): Surprise's Dataset.load_from_df reads the three columns, in order,
# as (user id, item id, rating). As written, MovieID is therefore taken as the
# user, Rating as the item and Runtime as the rating. The `movie` frame shown in
# the preview has no user column, so this does not look like a meaningful
# collaborative-filtering setup — confirm the intent before relying on it.
# `reader` is a Reader() created with default arguments, while Runtime values in
# the preview (51, 118, 60, ...) are minutes; the recorded RMSE of ~107 is in
# those units.
# No random seed is set before the split, so fold scores may differ between
# runs — TODO confirm.
df = Dataset.load_from_df(movie[['MovieID', 'Rating', 'Runtime']], reader)
svd = SVD()
# Run 5-fold cross-validation and then print results
cross_validate(svd, df, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    113.4507111.227498.6844 109.8038101.6447106.96225.7475  
MAE (testset)     93.2263 89.9525 86.7552 88.7776 88.4406 89.4304 2.1561  
Fit time          0.20    0.20    0.19    0.19    0.19    0.20    0.00    
Test time         0.01    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([113.45073747, 111.22743321,  98.68436652, 109.80375374,
        101.64466421]),
 'test_mae': array([93.22625698, 89.95251397, 86.75524476, 88.77762238, 88.44055944]),
 'fit_time': (0.1990034580230713,
  0.19698500633239746,
  0.1940147876739502,
  0.1919996738433838,
  0.19499993324279785),
 'test_time': (0.0050144195556640625,
  0.0030205249786376953,
  0.004001617431640625,
  0.003985404968261719,
  0.003000497817993164)}

In [27]:
# Display the Title column as a NumPy array (the recorded output is the truncated repr).
data['Title'].values

array(['Stranger Things', 'Thor: Love and Thunder', 'The Boys', ...,
       'Profilage', 'The Doctor Blake Mysteries',
       'Sister Boniface Mysteries'], dtype=object)

In [446]:
# Persist the cleaned catalogue as a plain dict. The context manager guarantees
# the file is flushed and closed; the bare open() call it replaces never closed it.
with open('movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(data.to_dict(), movie_dict_file)

In [447]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed; the bare open() call it replaces never closed it.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)