### Name: Haris Sumra
#### Assignment: Content-Based Recommender

Importing the Dependencies

In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

%matplotlib inline
import urllib.request, urllib.parse, urllib.error
import json

Get the API key from a JSON file, stored in the same folder

In [181]:
serviceurl = 'http://www.omdbapi.com/?'
apikey = '&apikey='+omdbapi

In [182]:
with open('APIkeys.json') as f:
    keys = json.load(f)
    omdbapi = keys['OMDBapi']

This is how our JSON dataset will print

In [183]:
#Creating a function to print our JSON dataset

def print_json(json_data):
    list_keys=['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer', 
               'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Ratings', 
               'Metascore', 'imdbRating', 'imdbVotes', 'imdbID']
    print("-"*50)
    for k in list_keys:
        if k in list(json_data.keys()):
            print(f"{k}: {json_data[k]}")
    print("-"*50)

This is what will be storing in out local database, and we will use this function to create/update the local movie database with the data retreived from the web

In [184]:
#Creating the function that will take the jason data and will store that data in local database
def save_in_database(json_data):
    
    filename = input("Please enter a name for the database (extension not needed, it will be added automatically): ")
    filename = filename+'.sqlite'
    
    import sqlite3
    conn = sqlite3.connect(str(filename))
    cur=conn.cursor()
    
    title = json_data['Title']
    # Goes through the json dataset and extracts information if it is available
    if json_data['Year']!='N/A':
        year = int(json_data['Year'])
    if json_data['Runtime']!='N/A':
        runtime = int(json_data['Runtime'].split()[0])
    if json_data['Country']!='N/A':
        country = json_data['Country']
    if json_data['Metascore']!='N/A':
        metascore = float(json_data['Metascore'])
    ##if json_data['Plot']!='N/A':
      ##  plot = json_data['Plot']
    else:
        metascore=-1
    if json_data['imdbRating']!='N/A':
        imdb_rating = float(json_data['imdbRating'])
    else:
        imdb_rating=-1
    
    # SQL commands
    cur.execute('''CREATE TABLE IF NOT EXISTS MovieInfo 
    (Title TEXT, Year INTEGER, Runtime INTEGER, Country TEXT, Metascore REAL, IMDBRating REAL)''')
    
    cur.execute('SELECT Title FROM MovieInfo WHERE Title = ? ', (title,))
    row = cur.fetchone()
    
    if row is None:
        cur.execute('''INSERT INTO MovieInfo (Title, Year, Runtime, Country, Metascore, IMDBRating)
                VALUES (?,?,?,?,?,?)''', (title,year,runtime,country,metascore,imdb_rating))
    else:
        print("Record already found. No update made.")
    
    # Commits the change and close the connection to the database
    conn.commit()
    conn.close()

Function to print contents of the local database

In [185]:
#Creating a function that will print the database stored
def print_database(database):
    
    import sqlite3
    conn = sqlite3.connect(str(database))
    cur=conn.cursor()
    
    for row in cur.execute('SELECT * FROM MovieInfo'):
        print(row)
    conn.close()

Function to save the database content in an Excel file

In [186]:
#Creating a function that will save ourr database in an excel file for us to import later
def save_in_excel(filename, database):
    
    if filename.split('.')[-1]!='xls' and filename.split('.')[-1]!='xlsx':
        print ("Filename does not have correct extension. Please try again")
        return None
    
    import pandas as pd
    import sqlite3
    
    #df=pd.DataFrame(columns=['Title','Year', 'Runtime', 'Country', 'Metascore', 'IMDB_Rating'])
    
    conn = sqlite3.connect(str(database))
    #cur=conn.cursor()
    
    df=pd.read_sql_query("SELECT * FROM MovieInfo", conn)
    conn.close()
    
    df.to_excel(filename,sheet_name='Movie Info')

Function to search for information about a movie

In [187]:
#Here is a function that will take the input of the movie that we want to store or lookup, once we have looked up
## a movie, then we will ask if we want to save the movie in ourr database, and if the response is true
## then we can store the movie in "movies" database


def search_movie(title):
    if len(title) < 1 or title=='quit': 
        print("Goodbye now...")
        return None

    try:
        url = serviceurl + urllib.parse.urlencode({'t': title})+apikey
        print(f'Retrieving the data of "{title}" now... ')
        uh = urllib.request.urlopen(url)
        data = uh.read()
        json_data=json.loads(data)
        
        if json_data['Response']=='True':
            print_json(json_data)
            
            save_database_yes_no=input ('Save the movie info in a local database? Enter "yes" or "no": ').lower()
            if save_database_yes_no=='yes':
                save_in_database(json_data)
        else:
            print("Error encountered: ",json_data['Error'])
    
    except urllib.error.URLError as e:
        print(f"ERROR: {e.reason}")

Search for any movie

In [188]:
title = input('\nEnter the name of a movie (enter \'quit\' or hit ENTER to quit): ')
if len(title) < 1 or title=='quit': 
    print("Goodbye now...")
else:
    search_movie(title)


Enter the name of a movie (enter 'quit' or hit ENTER to quit): Mario
Retrieving the data of "Mario" now... 
--------------------------------------------------
Title: Mario
Year: 2018
Rated: N/A
Released: 01 Jun 2018
Runtime: 124 min
Genre: Drama, Romance, Sport
Director: Marcel Gisler
Writer: Thomas Hess, Marcel Gisler, Frederic Moriette
Actors: Max Hubacher, Aaron Altaras, Jessy Moravec
Plot: Two young football players get caught up between the politics of the game and the politics of love.
Language: Swiss German, German
Country: Switzerland
Awards: 4 wins & 4 nominations
Ratings: [{'Source': 'Internet Movie Database', 'Value': '7.4/10'}, {'Source': 'Rotten Tomatoes', 'Value': '100%'}]
Metascore: N/A
imdbRating: 7.4
imdbVotes: 4,483
imdbID: tt6999052
--------------------------------------------------
Save the movie info in a local database? Enter "yes" or "no": yes
Please enter a name for the database (extension not needed, it will be added automatically): movies


Print the content of the local database

In [189]:
print_database('movies.sqlite')

('Titanic', 1997, 194, 'United States, Mexico', 75.0, 7.9)
('Jumanji', 1995, 104, 'United States', 39.0, 7.0)
('To Kill a Mockingbird', 1962, 129, 'United States', 88.0, 8.3)
('The Whale', 2011, 85, 'Canada', 64.0, 8.1)
('Top Gun', 1986, 110, 'United States', 50.0, 6.9)
('I Came By', 2022, 110, 'United Kingdom', 57.0, 6.1)
('The Godfather', 1972, 175, 'United States', 100.0, 9.2)
('Top Gun: Maverick', 2022, 130, 'United States', 78.0, 8.4)
('The Notebook', 2004, 123, 'United States', 53.0, 7.8)
('Halloween', 1978, 91, 'United States', 87.0, 7.7)
('Mario', 2018, 124, 'Switzerland', -1.0, 7.4)


In [190]:
#title = input('\nEnter the name of a movie (enter \'quit\' or hit ENTER to quit): ')
#if len(title) < 1 or title=='quit': 
#    print("Goodbye now...")
#else:
#    search_movie(title)

Save the database content into an Excel file

In [191]:
save_in_excel('test.xlsx','movies.sqlite')

We can reade the excel file using pandas, we can we start to explore our data

In [192]:
df=pd.read_excel('test.xlsx')
df

Unnamed: 0.1,Unnamed: 0,Title,Year,Runtime,Country,Metascore,IMDBRating
0,0,Titanic,1997,194,"United States, Mexico",75,7.9
1,1,Jumanji,1995,104,United States,39,7.0
2,2,To Kill a Mockingbird,1962,129,United States,88,8.3
3,3,The Whale,2011,85,Canada,64,8.1
4,4,Top Gun,1986,110,United States,50,6.9
5,5,I Came By,2022,110,United Kingdom,57,6.1
6,6,The Godfather,1972,175,United States,100,9.2
7,7,Top Gun: Maverick,2022,130,United States,78,8.4
8,8,The Notebook,2004,123,United States,53,7.8
9,9,Halloween,1978,91,United States,87,7.7


In [193]:
df.shape

(11, 7)

In [194]:
df_01 = df.copy()
df_01.index = np.arange(1, len(df) +1)
df_01

Unnamed: 0.1,Unnamed: 0,Title,Year,Runtime,Country,Metascore,IMDBRating
1,0,Titanic,1997,194,"United States, Mexico",75,7.9
2,1,Jumanji,1995,104,United States,39,7.0
3,2,To Kill a Mockingbird,1962,129,United States,88,8.3
4,3,The Whale,2011,85,Canada,64,8.1
5,4,Top Gun,1986,110,United States,50,6.9
6,5,I Came By,2022,110,United Kingdom,57,6.1
7,6,The Godfather,1972,175,United States,100,9.2
8,7,Top Gun: Maverick,2022,130,United States,78,8.4
9,8,The Notebook,2004,123,United States,53,7.8
10,9,Halloween,1978,91,United States,87,7.7


In [237]:
df_03 = df_01.rename(columns={'Unnamed: 0' : "User"})
df_03

##df.rename(columns={"A": "a", "B": "c"})

Unnamed: 0,User,Title,Year,Runtime,Country,Metascore,IMDBRating
1,0,Titanic,1997,194,"United States, Mexico",75,7.9
2,1,Jumanji,1995,104,United States,39,7.0
3,2,To Kill a Mockingbird,1962,129,United States,88,8.3
4,3,The Whale,2011,85,Canada,64,8.1
5,4,Top Gun,1986,110,United States,50,6.9
6,5,I Came By,2022,110,United Kingdom,57,6.1
7,6,The Godfather,1972,175,United States,100,9.2
8,7,Top Gun: Maverick,2022,130,United States,78,8.4
9,8,The Notebook,2004,123,United States,53,7.8
10,9,Halloween,1978,91,United States,87,7.7


In [238]:
##df_01 = df.copy()
df_03.User = np.arange(1, len(df) +1)
df_03

Unnamed: 0,User,Title,Year,Runtime,Country,Metascore,IMDBRating
1,1,Titanic,1997,194,"United States, Mexico",75,7.9
2,2,Jumanji,1995,104,United States,39,7.0
3,3,To Kill a Mockingbird,1962,129,United States,88,8.3
4,4,The Whale,2011,85,Canada,64,8.1
5,5,Top Gun,1986,110,United States,50,6.9
6,6,I Came By,2022,110,United Kingdom,57,6.1
7,7,The Godfather,1972,175,United States,100,9.2
8,8,Top Gun: Maverick,2022,130,United States,78,8.4
9,9,The Notebook,2004,123,United States,53,7.8
10,10,Halloween,1978,91,United States,87,7.7


In [239]:
df_03["IMDBRating"]

1     7.9
2     7.0
3     8.3
4     8.1
5     6.9
6     6.1
7     9.2
8     8.4
9     7.8
10    7.7
11    7.4
Name: IMDBRating, dtype: float64

In [240]:
movies_average_rating=df_03.groupby('Title')['IMDBRating'].mean().sort_values(ascending=False).reset_index().rename(columns={'IMDBRating':'Average IMDB Rating'})
movies_average_rating.head(10)

Unnamed: 0,Title,Average IMDB Rating
0,The Godfather,9.2
1,Top Gun: Maverick,8.4
2,To Kill a Mockingbird,8.3
3,The Whale,8.1
4,Titanic,7.9
5,The Notebook,7.8
6,Halloween,7.7
7,Mario,7.4
8,Jumanji,7.0
9,Top Gun,6.9


In [241]:
df_03["Title"]

1                   Titanic
2                   Jumanji
3     To Kill a Mockingbird
4                 The Whale
5                   Top Gun
6                 I Came By
7             The Godfather
8         Top Gun: Maverick
9              The Notebook
10                Halloween
11                    Mario
Name: Title, dtype: object

### TF-IDF Model: 

In [242]:
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
##from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

Cosine Similarity: 
I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two movies. Mathematically, it is defined as follows: cosine(x,y)=x.y⊺ / ||x||.||y||

In [243]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df_03['Title'])

In [244]:
tfidf_matrix.shape

(11, 14)

Since we have used the TF-IDF Vectorizer, calculating the Dot Product will directly give us the Cosine Similarity Score. Therefore, we will use sklearn's linear_kernel instead of cosine_similarities since it is much faster.

In [245]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [246]:
cosine_sim[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [247]:
titles = df_03['Title']
indices = pd.Series(df_03.index, index=df_03['Title'])

We now have a pairwise cosine similarity matrix for all the movies in our dataset. The next step is to write a function that returns the most similar movies based on the cosine similarity score.

In [248]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

Let's see the top recommendation for the movie that we want to select

In [249]:
get_recommendations('Titanic').head(30)

1                   Titanic
3     To Kill a Mockingbird
4                 The Whale
5                   Top Gun
6                 I Came By
7             The Godfather
8         Top Gun: Maverick
9              The Notebook
10                Halloween
11                    Mario
Name: Title, dtype: object

### Item-Item collaborative filtering

In [259]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
from lightfm.cross_validation import random_train_test_split
import scipy.sparse as sp

In [260]:
ratings = np.array(df_03['IMDBRating'])
users = np.array(df_03['User'])
items = np.array(df_03['Title'])

In [261]:
from sklearn.preprocessing import LabelEncoder

# heavy lifting encoders
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

In [262]:
# preparation for the csr matrix
u = user_encoder.fit_transform(users)
i = item_encoder.fit_transform(items)
lu = len(np.unique(u))
li = len(np.unique(i))

In [263]:
matrix = sp.coo_matrix((ratings, (u, i)), shape=(lu, li))

In [264]:
matrix.todense()

matrix([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 7.9, 0. , 0. , 0. ],
        [0. , 0. , 7. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 8.3, 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 8.1, 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 6.9, 0. ],
        [0. , 6.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 9.2, 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 8.4],
        [0. , 0. , 0. , 0. , 0. , 7.8, 0. , 0. , 0. , 0. , 0. ],
        [7.7, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 7.4, 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])

In [266]:
item_encoder.classes_[:15]

array(['Halloween', 'I Came By', 'Jumanji', 'Mario', 'The Godfather',
       'The Notebook', 'The Whale', 'Titanic', 'To Kill a Mockingbird',
       'Top Gun', 'Top Gun: Maverick'], dtype=object)

In [267]:
user_encoder.classes_[:15]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [268]:
from lightfm.cross_validation import random_train_test_split

train, test = random_train_test_split(matrix, test_percentage=0.2)

In [269]:
model = LightFM()
model.fit(train)

<lightfm.lightfm.LightFM at 0x7fdc557bea20>

In [270]:
from lightfm.evaluation import precision_at_k

precision_at_k(model, test, k=10).mean()

0.06666667

In [271]:
user = 1
user_id = int(user_encoder.transform([user])[0])

In [272]:
user_encoder.inverse_transform([user_id])

array([1])

In [273]:
all_movie_ids = np.array(list(range(len(item_encoder.classes_))),dtype=np.int32)
all_movie_ids.dtype

dtype('int32')

In [274]:
preds = model.predict(user_id, all_movie_ids)

In [275]:
all_movies = pd.DataFrame(
    zip(item_encoder.classes_, preds),
    columns=['Movie', 'Prediction']
).sort_values(
    'Prediction', ascending=False
)

all_movies.head(15)

Unnamed: 0,Movie,Prediction
7,Titanic,0.055487
2,Jumanji,0.052734
8,To Kill a Mockingbird,0.051984
4,The Godfather,0.050081
1,I Came By,0.049555
6,The Whale,0.049455
0,Halloween,0.049258
5,The Notebook,0.04611
3,Mario,0.023788
10,Top Gun: Maverick,0.023071


Sourrces: 
https://www.kaggle.com/code/rounakbanik/movie-recommender-systems

https://towardsdatascience.com/step-by-step-guide-to-build-your-own-mini-imdb-database-fc39af27d21b
