# Markov Movie Recommender Program


In [1]:
!wget -O moviedataset.zip https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
print('unziping ...')
!unzip -o -j moviedataset.zip

--2020-12-08 09:43:07--  https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
Resolving s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)... 67.228.254.196
Connecting to s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 160301210 (153M) [application/zip]
Saving to: ‘moviedataset.zip’


2020-12-08 09:43:48 (3.78 MB/s) - ‘moviedataset.zip’ saved [160301210/160301210]

unziping ...
Archive:  moviedataset.zip
  inflating: links.csv               
  inflating: movies.csv              
  inflating: ratings.csv             
  inflating: README.txt              
  inflating: tags.csv                


In [2]:
import pandas as pd
from math import sqrt
import numpy as np

In [4]:
#Putting movie data and ratings data from csv into a Pandas DataFrame
moviesDataFrame = pd.read_csv("movies.csv")
ratingsDataFrame = pd.read_csv("ratings.csv")
#Displaying first 5 rows of the movies dataframe
moviesDataFrame.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
#Start preproccessing and changing the dataframe to fit the program's specifications
moviesDataFrame['year'] = moviesDataFrame.title.str.extract('(\(\d\d\d\d\))',expand=False)
moviesDataFrame['year'] = moviesDataFrame.year.str.extract('(\d\d\d\d)',expand=False)
moviesDataFrame['title'] = moviesDataFrame.title.str.replace('(\(\d\d\d\d\))', '')
moviesDataFrame['title'] = moviesDataFrame['title'].apply(lambda x: x.strip())
moviesDataFrame.head()
#Removed year from title and made into a separate coulumn

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [7]:
#Make the Genres into a list in the dataframe
moviesDataFrame["genres"] = moviesDataFrame.genres.str.split('|')
moviesDataFrame.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [8]:
#Start making the dataframe that relates movies to the genres
#Find all the genres that apply to each movie with a 1 and those that don't with a 0
moviesGenresDataFrame = moviesDataFrame.copy()

for i, r in moviesDataFrame.iterrows():
    for g in r["genres"]:
        moviesGenresDataFrame.at[i, g] = 1

moviesGenresDataFrame = moviesGenresDataFrame.fillna(0)
moviesGenresDataFrame.head()


Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Start the second characteristic of movie recommending: ratings
ratingsDataFrame.head()
#Take out time from the dataframe
ratingsDataFrame = ratingsDataFrame.drop("timestamp", 1)
ratingsDataFrame.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [13]:
#Take in a user's input of movies that he or she has watched or likes with the ratings of the movies to act as keys
#Add as many movies as the user wants
userInput = [{'title':'Transformers', 'rating':3.5}, {'title':'Toy Story', 'rating':3.5}, {'title':'Jumanji', 'rating':2}, {'title':"Pirates of the Caribbean: On Stranger Tides", 'rating':3.3}, {'title':'Interstellar', 'rating':4.3}]
#Put the input into a dataframe 
inputMovies = pd.DataFrame(userInput)

#Find movie ID from the movieDataframe that correlates to the User's movies
inputId = moviesDataFrame[moviesDataFrame['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
#Exclude the genres and year from this list
inputMovies = inputMovies.drop('genres', 1).drop('year', 1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,53996,Transformers,3.5
3,86880,Pirates of the Caribbean: On Stranger Tides,3.3
4,109487,Interstellar,4.3


In [15]:
#Begin creating the transition matrix which will include a combination of ratings and genres for the movies 

#Start with genres
moviesUserDataFrame = moviesGenresDataFrame[moviesGenresDataFrame["movieId"].isin(inputMovies["movieId"].tolist())]
#Clean up to only include the genres for the movies
moviesDataFrame = moviesUserDataFrame.reset_index(drop=True)
genreUserDataFrame = moviesUserDataFrame.drop("movieId", 1).drop("title", 1).drop("genres", 1).drop("year", 1)
genreUserDataFrame

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17223,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
