In [136]:
import urllib.request, urllib.parse, urllib.error
import json

In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

%matplotlib inline

In [138]:
serviceurl = 'http://www.omdbapi.com/?'
apikey = '&apikey='+omdbapi

In [139]:
with open('APIkeys.json') as f:
    keys = json.load(f)
    omdbapi = keys['OMDBapi']

In [140]:
def print_json(json_data):
    list_keys=['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer', 
               'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Ratings', 
               'Metascore', 'imdbRating', 'imdbVotes', 'imdbID']
    print("-"*50)
    for k in list_keys:
        if k in list(json_data.keys()):
            print(f"{k}: {json_data[k]}")
    print("-"*50)

In [141]:
def save_poster(json_data):
    import os
    title = json_data['Title']
    poster_url = json_data['Poster']
    # Splits the poster url by '.' and picks up the last string as file extension
    poster_file_extension=poster_url.split('.')[-1]
    # Reads the image file from web
    poster_data = urllib.request.urlopen(poster_url).read()
        
    savelocation=os.getcwd()+'\\'+'Posters'+'\\'
    # Creates new directory if the directory does not exist. Otherwise, just use the existing path.
    if not os.path.isdir(savelocation):
        os.mkdir(savelocation)
    
    filename=savelocation+str(title)+'.'+poster_file_extension
    f=open(filename,'wb')
    f.write(poster_data)
    f.close()

In [142]:
def save_in_database(json_data):
    
    filename = input("Please enter a name for the database (extension not needed, it will be added automatically): ")
    filename = filename+'.sqlite'
    
    import sqlite3
    conn = sqlite3.connect(str(filename))
    cur=conn.cursor()
    
    title = json_data['Title']
    # Goes through the json dataset and extracts information if it is available
    if json_data['Year']!='N/A':
        year = int(json_data['Year'])
    if json_data['Runtime']!='N/A':
        runtime = int(json_data['Runtime'].split()[0])
    if json_data['Country']!='N/A':
        country = json_data['Country']
    if json_data['Metascore']!='N/A':
        metascore = float(json_data['Metascore'])
    ##if json_data['Plot']!='N/A':
      ##  plot = json_data['Plot']
    else:
        metascore=-1
    if json_data['imdbRating']!='N/A':
        imdb_rating = float(json_data['imdbRating'])
    else:
        imdb_rating=-1
    
    # SQL commands
    cur.execute('''CREATE TABLE IF NOT EXISTS MovieInfo 
    (Title TEXT, Year INTEGER, Runtime INTEGER, Country TEXT, Metascore REAL, IMDBRating REAL)''')
    
    cur.execute('SELECT Title FROM MovieInfo WHERE Title = ? ', (title,))
    row = cur.fetchone()
    
    if row is None:
        cur.execute('''INSERT INTO MovieInfo (Title, Year, Runtime, Country, Metascore, IMDBRating)
                VALUES (?,?,?,?,?,?)''', (title,year,runtime,country,metascore,imdb_rating))
    else:
        print("Record already found. No update made.")
    
    # Commits the change and close the connection to the database
    conn.commit()
    conn.close()

In [143]:
def print_database(database):
    
    import sqlite3
    conn = sqlite3.connect(str(database))
    cur=conn.cursor()
    
    for row in cur.execute('SELECT * FROM MovieInfo'):
        print(row)
    conn.close()

In [144]:
def save_in_excel(filename, database):
    
    if filename.split('.')[-1]!='xls' and filename.split('.')[-1]!='xlsx':
        print ("Filename does not have correct extension. Please try again")
        return None
    
    import pandas as pd
    import sqlite3
    
    #df=pd.DataFrame(columns=['Title','Year', 'Runtime', 'Country', 'Metascore', 'IMDB_Rating'])
    
    conn = sqlite3.connect(str(database))
    #cur=conn.cursor()
    
    df=pd.read_sql_query("SELECT * FROM MovieInfo", conn)
    conn.close()
    
    df.to_excel(filename,sheet_name='Movie Info')

In [145]:
def search_movie(title):
    if len(title) < 1 or title=='quit': 
        print("Goodbye now...")
        return None

    try:
        url = serviceurl + urllib.parse.urlencode({'t': title})+apikey
        print(f'Retrieving the data of "{title}" now... ')
        uh = urllib.request.urlopen(url)
        data = uh.read()
        json_data=json.loads(data)
        
        if json_data['Response']=='True':
            print_json(json_data)
            
            # Asks user whether to download the poster of the movie
            if json_data['Poster']!='N/A':
                poster_yes_no=input ('Poster of this movie can be downloaded. Enter "yes" or "no": ').lower()
                if poster_yes_no=='yes':
                    save_poster(json_data)
            # Asks user whether to save the movie information in a local database
            save_database_yes_no=input ('Save the movie info in a local database? Enter "yes" or "no": ').lower()
            if save_database_yes_no=='yes':
                save_in_database(json_data)
        else:
            print("Error encountered: ",json_data['Error'])
    
    except urllib.error.URLError as e:
        print(f"ERROR: {e.reason}")

In [147]:
title = input('\nEnter the name of a movie (enter \'quit\' or hit ENTER to quit): ')
if len(title) < 1 or title=='quit': 
    print("Goodbye now...")
else:
    search_movie(title)


Enter the name of a movie (enter 'quit' or hit ENTER to quit): The Notebook
Retrieving the data of "The Notebook" now... 
--------------------------------------------------
Title: The Notebook
Year: 2004
Rated: PG-13
Released: 25 Jun 2004
Runtime: 123 min
Genre: Drama, Romance
Director: Nick Cassavetes
Writer: Jeremy Leven, Jan Sardi, Nicholas Sparks
Actors: Gena Rowlands, James Garner, Rachel McAdams
Plot: A poor yet passionate young man (Ryan Gosling) falls in love with a rich young woman (Rachel McAdams), giving her a sense of freedom, but they are soon separated because of their social differences.
Language: English
Country: United States
Awards: 12 wins & 10 nominations
Ratings: [{'Source': 'Internet Movie Database', 'Value': '7.8/10'}, {'Source': 'Rotten Tomatoes', 'Value': '53%'}, {'Source': 'Metacritic', 'Value': '53/100'}]
Metascore: 53
imdbRating: 7.8
imdbVotes: 571,092
imdbID: tt0332280
--------------------------------------------------
Poster of this movie can be downloade

In [148]:
print_database('movies.sqlite')

('Titanic', 1997, 194, 'United States, Mexico', 75.0, 7.9)
('Jumanji', 1995, 104, 'United States', 39.0, 7.0)
('To Kill a Mockingbird', 1962, 129, 'United States', 88.0, 8.3)
('The Whale', 2011, 85, 'Canada', 64.0, 8.1)
('Top Gun', 1986, 110, 'United States', 50.0, 6.9)
('I Came By', 2022, 110, 'United Kingdom', 57.0, 6.1)
('The Godfather', 1972, 175, 'United States', 100.0, 9.2)
('Top Gun: Maverick', 2022, 130, 'United States', 78.0, 8.4)
('The Notebook', 2004, 123, 'United States', 53.0, 7.8)


In [149]:
title = input('\nEnter the name of a movie (enter \'quit\' or hit ENTER to quit): ')
if len(title) < 1 or title=='quit': 
    print("Goodbye now...")
else:
    search_movie(title)


Enter the name of a movie (enter 'quit' or hit ENTER to quit): 
Goodbye now...


In [150]:
title = input('\nEnter the name of a movie (enter \'quit\' or hit ENTER to quit): ')
if len(title) < 1 or title=='quit': 
    print("Goodbye now...")
else:
    search_movie(title)


Enter the name of a movie (enter 'quit' or hit ENTER to quit): 
Goodbye now...


In [151]:
title = input('\nEnter the name of a movie (enter \'quit\' or hit ENTER to quit): ')
if len(title) < 1 or title=='quit': 
    print("Goodbye now...")
else:
    search_movie(title)


Enter the name of a movie (enter 'quit' or hit ENTER to quit): 
Goodbye now...


In [152]:
print_database('movies.sqlite')

('Titanic', 1997, 194, 'United States, Mexico', 75.0, 7.9)
('Jumanji', 1995, 104, 'United States', 39.0, 7.0)
('To Kill a Mockingbird', 1962, 129, 'United States', 88.0, 8.3)
('The Whale', 2011, 85, 'Canada', 64.0, 8.1)
('Top Gun', 1986, 110, 'United States', 50.0, 6.9)
('I Came By', 2022, 110, 'United Kingdom', 57.0, 6.1)
('The Godfather', 1972, 175, 'United States', 100.0, 9.2)
('Top Gun: Maverick', 2022, 130, 'United States', 78.0, 8.4)
('The Notebook', 2004, 123, 'United States', 53.0, 7.8)


In [153]:
save_in_excel('test.xlsx','movies.sqlite')

In [154]:
import pandas as pd
df=pd.read_excel('test.xlsx')
df

Unnamed: 0.1,Unnamed: 0,Title,Year,Runtime,Country,Metascore,IMDBRating
0,0,Titanic,1997,194,"United States, Mexico",75,7.9
1,1,Jumanji,1995,104,United States,39,7.0
2,2,To Kill a Mockingbird,1962,129,United States,88,8.3
3,3,The Whale,2011,85,Canada,64,8.1
4,4,Top Gun,1986,110,United States,50,6.9
5,5,I Came By,2022,110,United Kingdom,57,6.1
6,6,The Godfather,1972,175,United States,100,9.2
7,7,Top Gun: Maverick,2022,130,United States,78,8.4
8,8,The Notebook,2004,123,United States,53,7.8


In [113]:
df.shape

(8, 7)

In [114]:
df_01 = df.copy()
df_01.index = np.arange(1, len(df) +1)
df_01

Unnamed: 0.1,Unnamed: 0,Title,Year,Runtime,Country,Metascore,IMDBRating
1,0,Titanic,1997,194,"United States, Mexico",75,7.9
2,1,Jumanji,1995,104,United States,39,7.0
3,2,To Kill a Mockingbird,1962,129,United States,88,8.3
4,3,The Whale,2011,85,Canada,64,8.1
5,4,Top Gun,1986,110,United States,50,6.9
6,5,I Came By,2022,110,United Kingdom,57,6.1
7,6,The Godfather,1972,175,United States,100,9.2
8,7,Top Gun: Maverick,2022,130,United States,78,8.4


In [73]:
df_03 = df_01.drop(['Unnamed: 0'], axis=1)
df_03

Unnamed: 0,Title,Year,Runtime,Country,Metascore,IMDBRating
1,Titanic,1997,194,"United States, Mexico",75,7.9
2,Jumanji,1995,104,United States,39,7.0
3,To Kill a Mockingbird,1962,129,United States,88,8.3
4,The Whale,2011,85,Canada,64,8.1
5,Top Gun,1986,110,United States,50,6.9
6,I Came By,2022,110,United Kingdom,57,6.1
7,The Godfather,1972,175,United States,100,9.2
8,Top Gun: Maverick,2022,130,United States,78,8.4


In [74]:
df_03["IMDBRating"]

1    7.9
2    7.0
3    8.3
4    8.1
5    6.9
6    6.1
7    9.2
8    8.4
Name: IMDBRating, dtype: float64

In [77]:
movies_average_rating=df_03.groupby('Title')['IMDBRating'].mean().sort_values(ascending=False).reset_index().rename(columns={'IMDBRating':'Average IMDB Rating'})
movies_average_rating.head(10)

Unnamed: 0,Title,Average IMDB Rating
0,The Godfather,9.2
1,Top Gun: Maverick,8.4
2,To Kill a Mockingbird,8.3
3,The Whale,8.1
4,Titanic,7.9
5,Jumanji,7.0
6,Top Gun,6.9
7,I Came By,6.1


In [80]:
df_03["Title"]

1                  Titanic
2                  Jumanji
3    To Kill a Mockingbird
4                The Whale
5                  Top Gun
6                I Came By
7            The Godfather
8        Top Gun: Maverick
Name: Title, dtype: object

In [160]:
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
##from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [161]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df_03['Title'])

In [162]:
tfidf_matrix.shape

(8, 11)

In [163]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [164]:
cosine_sim[0]

array([1., 0., 0., 0., 0., 0., 0., 0.])

In [165]:
titles = df_03['Title']
indices = pd.Series(df_03.index, index=df_03['Title'])

In [166]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [169]:
get_recommendations('Titanic').head(10)

1                  Titanic
3    To Kill a Mockingbird
4                The Whale
5                  Top Gun
6                I Came By
7            The Godfather
8        Top Gun: Maverick
Name: Title, dtype: object

Sourrces: 
https://www.kaggle.com/code/rounakbanik/movie-recommender-systems
https://towardsdatascience.com/step-by-step-guide-to-build-your-own-mini-imdb-database-fc39af27d21b
