In [24]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SQLContext
import seaborn as sns
import pandas as pd
import random
import numpy
%matplotlib inline
# SQLContext handle built on the notebook's SparkContext; used later to turn
# pandas frames into Spark DataFrames.
sqlContext = SQLContext(sc)

# Movie metadata: the CSV has a header row and only the columns needed
# downstream are kept.
movies = (
    spark.read
    .csv("movies_metadata.csv", header=True)
    .select("title", "id", "popularity", "imdb_id")
)

# pandas copy of the full movie list.
allmoviedf = movies.toPandas()

# Ratings: the file also carries a timestamp, which is not needed here, so only
# the user, movie and rating columns are read and kept.
ratings = (
    spark.read
    .csv(
        "ratings_small.csv",
        header=True,
        schema="userId integer, movieId integer, rating float",
    )
    .select("userId", "movieId", "rating")
)

# Attach movie details to every rating, then drop rows without a title because
# a recommendation without a title could not be displayed.
# NOTE(review): this assumes ratings.movieId and movies.id share the same id
# space -- TODO confirm against the dataset documentation.
movieratings = (
    ratings
    .join(movies, ratings["movieId"] == movies["id"])
    .dropna(subset=["title"])
)

# pandas copy of the joined data, used later for title / IMDb lookups.
movieRatingsDf = movieratings.toPandas()


TypeError: 'RDD' object is not subscriptable

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 16096)
Traceback (most recent call last):
  File "D:\Users\Hamza\Anaconda3\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "D:\Users\Hamza\Anaconda3\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "D:\Users\Hamza\Anaconda3\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "D:\Users\Hamza\Anaconda3\lib\socketserver.py", line 720, in __init__
    self.handle()
  File "C:\Spark\Spark\python\pyspark\accumulators.py", line 269, in handle
    poll(accum_updates)
  File "C:\Spark\Spark\python\pyspark\accumulators.py", line 241, in poll
    if func():
  File "C:\Spark\Spark\python\pyspark\accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "C:\Spark\Spark\pyt

In [17]:
# Separate dataframe holding only the better-known movies, used to build the
# user's profile. Restricting the questions to popular titles makes it more
# likely that the user has actually seen the movie they are asked about.
#
# Steps: parse popularity as a number (unparseable values become NaN), drop the
# rows where that failed, keep rows with popularity above 10, then store the
# popularity back as text.
moviesdf = (
    movieratings.toPandas()
    .assign(popularity=lambda frame: pd.to_numeric(frame['popularity'], errors='coerce'))
    .dropna(subset=['popularity'])
    .astype({'popularity': float})
    .loc[lambda frame: frame['popularity'] > 10]
    .assign(popularity=lambda frame: frame['popularity'].astype(str))
)


In [18]:
### State for the profile-building step
# userratings collects one row per rating the user enters;
# movieIDList remembers which movie ids have already been shown to the user.
userratings = pd.DataFrame(
    columns=[
        'userID',
        'movieID',
        'rating',
        'popularity',
        'title',
        'id',
        'imdb_id',
    ]
)
movieIDList = list()
# Show the starting number of collected ratings.
print(len(userratings.index))
def buildProfile(sample, userid=99999999):
    """Ask the user to rate random movies until ``sample`` ratings are stored.

    Each round draws a random row of the global ``moviesdf`` frame among the
    movies that have not been shown yet and prompts for a rating. A valid
    rating is appended as a new row to the global ``userratings`` frame, and
    the movie id is recorded in the global ``movieIDList``. The more ratings
    that are collected, the more data the recommender has for this user.

    Parameters
    ----------
    sample : int
        Number of valid ratings to collect in ``userratings``.
    userid : int, optional
        Id stored with every rating. The default is a high value so the
        user's recommendations can be identified later.

    Returns
    -------
    None
        Results are written to ``userratings`` and ``movieIDList``.

    Notes
    -----
    * Input that is not a number (e.g. "NA") marks the movie as asked and moves
      on to another movie without counting a rating.
    * A number outside 1..5 is rejected; the movie is not marked as asked, so
      it may be offered again.
    * If every movie has been asked about before ``sample`` ratings were
      collected, a message is printed and the function stops.
    """
    # A loop (rather than recursion) keeps the stack flat no matter how many
    # invalid answers are given.
    while len(userratings) < sample:
        # Only consider movies the user has not been asked about yet.
        candidates = moviesdf[~moviesdf['id'].isin(movieIDList)]
        if candidates.empty:
            print('No more movies left to ask about.')
            return

        # randrange excludes the upper bound, so the position is always valid.
        chosen = candidates.iloc[random.randrange(len(candidates))]
        movid = chosen['id']
        movie = chosen['title']

        # Ask for the user's opinion.
        userInput = input('What would you give this movie: ' + str(movie) + "?")
        try:
            rating = float(userInput)
        except ValueError:
            # Not a number: treat as "not seen" and never ask about it again.
            print('Movie Not Seen/Invalid input')
            movieIDList.append(movid)
            continue

        # Rating must be between 1 and 5 (this also rejects NaN).
        if not (1 <= rating <= 5):
            print('Rating must be between 1 and 5!')
            continue

        # Save the valid rating; the value order matches the userratings columns.
        row = [int(userid), int(movid), rating, str('0'), str(movie), int(movid), chosen['imdb_id']]
        userratings.loc[len(userratings)] = row
        movieIDList.append(movid)

        # Progress indicator while more ratings are still needed.
        if len(userratings) < sample:
            print(len(userratings))
        
print('Rate movies on a scale of 1 to 5 (1 is very bad, 5 is very good). If you have not seen any, please enter "NA" and you will be asked about another movie')
#Ask user their opinion on 15 movies
buildProfile(15)
#Join the user's ratings with the dataset ratings.
#DataFrame.union matches columns by POSITION, not by name, and userratings uses
#a different column order (and 'userID'/'movieID' spellings) than movieratings.
#So the user's frame is renamed and reordered to the movieratings schema first,
#otherwise title/id/popularity values would land in the wrong columns.
userRatingsSpark = sqlContext.createDataFrame(
    userratings
    .rename(columns={'userID': 'userId', 'movieID': 'movieId'})
    #userratings stores 'id' as an int; cast it to text so the union does not
    #depend on implicit int-to-string coercion.
    #NOTE(review): movieratings' id column is presumably string-typed because
    #the movies CSV is read without a schema -- confirm.
    .assign(id=lambda frame: frame['id'].astype(str))
)
movieratings = movieratings.union(userRatingsSpark.select(*movieratings.columns))

0
Rate movies on a scale of 1 to 5 (1 is very bad, 5 is very good). If you have not seen any, please enter "NA" and you will be asked about another movie
What would you give this movie: Madagascar?5
1
What would you give this movie: Solaris?4
2
What would you give this movie: Saw?3
3
What would you give this movie: Heathers?4
4
What would you give this movie: The Devil Wears Prada?5
5
What would you give this movie: Star Trek IV: The Voyage Home?2
6
What would you give this movie: Rocky Balboa?4
7
What would you give this movie: Alien?2
8
What would you give this movie: The Conversation?3
9
What would you give this movie: School of Rock?4
10
What would you give this movie: Barry Lyndon?0
Rating must be between 1 and 5!
What would you give this movie: Snatch?3
11
What would you give this movie: Reign Over Me?4
12
What would you give this movie: Die Hard 2?2
13
What would you give this movie: Ghost Rider?3
14
What would you give this movie: Donnie Darko?4


In [19]:
#### Splitting data between training/testing
#Fixed seed for the split and for ALS so reruns on the same data give the same
#recommendations instead of changing on every execution.
ALS_SEED = 42
#NOTE(review): the profile user's own ratings are part of movieratings, so
#roughly a fifth of them end up in the test split and are not used for training.
(training, test) = movieratings.randomSplit([0.8, 0.2], seed=ALS_SEED)
#initialise ALS model (Alternating Least Square)
#This uses collaborative filtering to come up with recommendations for both the user
#as well as each individual user in the dataset who added ratings
als = ALS(userCol='userId', itemCol='movieId', ratingCol='rating', seed=ALS_SEED)#,coldStartStrategy="drop"
ALSModel = als.fit(training)
predictions = ALSModel.transform(test)

#Recommend 10 movies for all users
userRecs = ALSModel.recommendForAllUsers(10).toPandas()

#Get recommendations just for current user (id 99999999 set earlier)
recommendations = userRecs[userRecs['userId'] == 99999999]


In [20]:
#Set up columns
recommendedColumns = ["MovieID", 'Recommended Movie', 'Predicted Score', "IMDb Page"]

#The user's recommendation list; empty when the model produced nothing for this
#user, so an empty table is shown instead of an IndexError.
if len(recommendations) > 0:
    recs = recommendations['recommendations'].iloc[0]
else:
    print('No recommendations available for this user')
    recs = []

#Collect all rows first and build the table once, rather than growing the
#DataFrame row by row.
recommendedRows = []
for rec in recs:
    #get the movie from the dataframe (ids are compared as text)
    matches = movieRatingsDf[movieRatingsDf['id'] == str(rec['movieId'])]
    if matches.empty:
        #No metadata for this id
        recommendedRows.append([rec['movieId'], "Movie Not Found", rec['rating'], "NA"])
        continue
    firstMatch = matches.iloc[0]
    imdbId = firstMatch['imdb_id']
    #A missing IMDb id only blanks the link; the title is still shown.
    if isinstance(imdbId, str) and imdbId:
        imdbPage = "https://www.imdb.com/title/" + imdbId
    else:
        imdbPage = "NA"
    recommendedRows.append([rec['movieId'], firstMatch['title'], rec['rating'], imdbPage])

recommendedMoviesTable = pd.DataFrame(recommendedRows, columns=recommendedColumns)
#Show
recommendedMoviesTable

Unnamed: 0,MovieID,Recommended Movie,Predicted Score,IMDb Page
0,492,Being John Malkovich,4.965824,https://www.imdb.com/title/tt0120601
1,850,A Christmas Story,4.8854,https://www.imdb.com/title/tt0085334
2,46578,Cheerleaders' Wild Weekend,4.801053,https://www.imdb.com/title/tt0078960
3,2267,The Last Mimzy,4.800508,https://www.imdb.com/title/tt0768212
4,668,On Her Majesty's Secret Service,4.787314,https://www.imdb.com/title/tt0064757
5,1361,The Return of the King,4.761446,https://www.imdb.com/title/tt0079802
6,49957,Carts of Darkness,4.752576,https://www.imdb.com/title/tt1633175
7,4459,Night Without Sleep,4.693513,https://www.imdb.com/title/tt0044967
8,26974,Pulgasari,4.677653,https://www.imdb.com/title/tt0089851
9,54328,Ocean Heaven,4.650294,https://www.imdb.com/title/tt1498858


## 