# Machine Learning

### Essential Libraries

In [106]:
#import necessary libraries 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


### Importing dataset

In [107]:
# Load the cleaned movie table; the CSV's first column becomes the row index.
moviedataset = pd.read_csv(
    "datasets/cleaned-movie-dataset-index.csv",
    index_col=0,
)
# Preview the first rows as the cell's displayed output.
moviedataset.head()

Unnamed: 0,index,id,title,adult,popularity,budget,revenue,vote_count,vote_average,release_date,release_year,original_language,genre,casts,keywords,recommendations,similar_movies,title_duplicate
0,0,19995,Avatar,False,432.199,237000000,2920357254,28759,7.569,2009-12-15,2009,en,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['Sam Worthington', 'Zoe Saldaña', 'Sigourney ...","['culture clash', 'future', 'space war', 'spac...","['Capturing Avatar', 'Avatar: Creating the Wor...","['The Reckless Hour', 'MicroPlanet 3D', 'If I ...",AVATAR
1,1,299534,Avengers: Endgame,False,152.323,356000000,2799439100,22908,8.267,2019-04-24,2019,en,"['Adventure', 'Science Fiction', 'Action']","['Robert Downey Jr.', 'Chris Evans', 'Mark Ruf...","['space travel', 'time travel', 'time machine'...","['Avengers: Infinity War', 'Captain Marvel', '...","['Blankman', 'Santos', 'The Masters of Time', ...",AVENGERSENDGAME
2,2,76600,Avatar: The Way of Water,False,10255.685,460000000,2309660236,6285,7.74,2022-12-14,2022,en,"['Science Fiction', 'Adventure', 'Action']","['Sam Worthington', 'Zoe Saldaña', 'Sigourney ...","['loss of loved one', 'dying and death', 'alie...","['Capturing Avatar', 'Avatar: Creating the Wor...","['Cyber Ninja', 'Spenser: A Savage Place', 'Br...",AVATARTHEWAYOFWATER
3,3,597,Titanic,False,137.265,200000000,2187463944,22650,7.892,1997-11-18,1997,en,"['Drama', 'Romance']","['Leonardo DiCaprio', 'Kate Winslet', 'Billy Z...","['drowning', 'evacuation', 'shipwreck', 'icebe...","['The Lion King', 'Pirates of the Caribbean: T...","['The Reckless Hour', 'The Naked Flame', 'Bell...",TITANIC
4,4,140607,Star Wars: The Force Awakens,False,62.371,245000000,2068223624,17860,7.302,2015-12-15,2015,en,"['Adventure', 'Action', 'Science Fiction', 'Fa...","['Harrison Ford', 'Mark Hamill', 'Carrie Fishe...","['android', 'spacecraft', 'space opera']","['Star Wars: The Last Jedi', 'Star Wars: Episo...","['MicroPlanet 3D', 'Geography of the Universe'...",STARWARSTHEFORCEAWAKENS


### Helper Functions

In [108]:
#helper functions
def get_title_from_index(index):
    """Return the ``title`` of the first row of ``moviedataset`` whose index label equals ``index``.

    Raises:
        IndexError: if no row has that index label (there is no first element to take).
    """
    label_matches = moviedataset.index == index
    matching_titles = moviedataset.loc[label_matches, "title"]
    return matching_titles.values[0]

def get_index_from_title(title):
    """Look up a movie by its normalised title.

    Args:
        title: value to compare against the ``title_duplicate`` column
            (the sample rows show upper-case titles without spaces or colons).

    Returns:
        The ``index`` column value of the first matching row, or ``-1`` when
        no row's ``title_duplicate`` equals ``title``.
    """
    known_titles = moviedataset['title_duplicate'].values
    if title in known_titles:
        matching_rows = moviedataset[moviedataset.title_duplicate == title]
        return matching_rows["index"].values[0]
    # Sentinel checked by the caller to trigger a re-prompt.
    return -1

### Getting the count matrix

In [109]:
features = ['keywords', 'casts', 'genre', 'title_duplicate']
intfeatures = ['vote_average']

# The vectorizer needs strings, so missing text values become empty strings.
for feature in features:
    moviedataset[feature] = moviedataset[feature].fillna('')

cv = CountVectorizer()

# Per text feature: refit the vectorizer to get a token-count matrix, then
# compute the pairwise cosine similarity between every pair of rows.
count_matrix = {}
simval = {}
for feature in features:
    count_matrix[feature] = cv.fit_transform(moviedataset[feature])
    simval[feature] = cosine_similarity(count_matrix[feature])

# Combined score: unweighted sum of the four per-feature similarity matrices.
simval['final'] = (
    simval['keywords']
    + simval['genre']
    + simval['casts']
    + simval['title_duplicate']
)




### Time to recommend a movie

In [110]:
#get a user input on a movie

def _normalize_title(raw_title):
    """Upper-case ``raw_title`` and remove spaces and colons (the form looked up in ``title_duplicate``)."""
    # NOTE(review): only spaces and colons are stripped; whether other punctuation
    # (e.g. hyphens) is kept in title_duplicate is not visible here -- confirm.
    return raw_title.upper().replace(" ", "").replace(":", "")


#tunable recommendation settings
threshhold = 7.5            ##cutoff for a 'good' movie (minimum vote_average)
MODERN_CUTOFF_YEAR = 2005   #earliest release_year that counts as a 'recent' movie
MAX_RECOMMENDATIONS = 10    #number of titles to print

#get index of movie; keep prompting until the title is found
#(get_index_from_title returns -1 for an unknown title)
fav_movie = _normalize_title(input("Enter Your Favourite Movie"))
movieindex = get_index_from_title(fav_movie)
while movieindex == -1:
    print("Movie not in database, please select another movie.\n(Try fullname of movie e.g. 'The Avengers' instead of 'Avengers')\n")
    fav_movie = _normalize_title(input("Enter Your Favourite Movie"))
    movieindex = get_index_from_title(fav_movie)

#asking users if want a recent movie
#every answer (including re-prompts) is normalised, and we loop until it is valid,
#so a lower-case 'y' on the second attempt is no longer silently treated as 'N'
modern = input("Do you want a recent movie? Y/N").strip().upper()
while modern not in ('Y', 'N'):
    modern = input("Please enter Y or N").strip().upper()

#build similarity list: (row position, combined similarity score) for the selected movie
similar_movies = {}
similar_movies['final'] = list(enumerate(simval['final'][movieindex]))

#sort by similarity score, highest first
sorted_similar_movies = {}
sorted_similar_movies['final'] = sorted(similar_movies['final'], key=lambda x: x[1], reverse=True)

#print the header up front instead of waiting for the selected movie to come up in the
#sorted list: sorted() is stable, so a tied score from an earlier row would otherwise
#be printed above the header
print(f'Selected Movie: {get_title_from_index(movieindex)}')
print("----------------------------------------------")
print('Recommended Movies:')

## Print the titles of the first MAX_RECOMMENDATIONS movies that pass the filters
i = 0
for movie_index, _score in sorted_similar_movies['final']:
    if movie_index == movieindex:
        continue  #never recommend the selected movie itself

    movie_row = moviedataset.iloc[movie_index]  #fetch the row once for both fields
    movie_rating = movie_row['vote_average']
    movie_year = movie_row['release_year']

    #comparisons are kept in positive form so a NaN rating/year fails the filter
    is_good = movie_rating >= threshhold  ##only recommend 'good movies'
    is_recent_enough = modern != 'Y' or movie_year >= MODERN_CUTOFF_YEAR
    if is_good and is_recent_enough:
        print(f"{i+1}. {get_title_from_index(movie_index)}")
        i += 1
        if i >= MAX_RECOMMENDATIONS:
            break

Movie not in database, please select another movie.
(Try fullname of movie e.g. 'The Avengers' instead of 'Avengers')

Selected Movie: The Avengers
----------------------------------------------
Recommended Movies:
1. Captain America: The Winter Soldier
2. Guardians of the Galaxy Vol. 2
3. Avengers: Endgame
4. Avengers: Infinity War
5. Iron Man
6. Guardians of the Galaxy
7. Spider-Man: Into the Spider-Verse
8. Spider-Man: No Way Home
9. Thor: Ragnarok
10. X-Men: Days of Future Past
