In [247]:
import pandas as pd
import numpy as np
from math import sqrt

In [248]:
#download the class movie reviews from here:
#https://docs.google.com/spreadsheets/d/17rCJzmWxqvAu9rkpkgt4ToccIlY4A1Ffuu1W9X3B8Ag/
#then read that in as a pandas dataframe

In [249]:
M = pd.read_csv("MovieReviews.csv", index_col='Name')

In [250]:
# Removing nan indexes
M = M.ix[:-3]
index = M.index.values

In [251]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [252]:
def get_recs(movie_name, M, num):

    import numpy as np
    reviews = []
    for title in M.columns:
        if title == movie_name:
            continue
        cor = pearson(M[movie_name], M[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((title, cor))
    
    reviews.sort(key=lambda tup: tup[1], reverse=True)
    return reviews[:num]

    

###Question 1:  What movie is most Similar to 'The fault in our stars' (60 pts)


In [253]:
Q1 = get_recs('The Fault in Our Stars', M, 1)

In [254]:
# Malificent (which is actually spelled incorrectly...should be Maleficent) is most similar to "The Fault in Our Stars"
Q1

[('Malificent', 0.39873118100855826)]

###Question 2:  Which movie(s) would you most like to see, based on your classmates experience? (40 pts)

In [255]:
M.shape

(73, 16)

In [256]:
# Movies I would most like to see (based on classmates experience)

#the movies I've seen
my_ratings = M.ix['Heather Sanders'].dropna()
#the movie I've rated the highest
my_ratings.sort_values(ascending=False, inplace=True)
#the name of the movie I've rated the highest
fav_movie = my_ratings.index[0]
Q2 = get_recs(fav_movie, M, 16)
Q2

[('Malificent', 0.26226430346247115),
 ('Interstellar', 0.22177313117448672),
 ('Divergent', 0.17331725497101289),
 ('The Fault in Our Stars', 0.11298392969275164),
 ('Gone Girl', 0.070310485168203027),
 ('Transformers', 0.028543823686292089),
 ('Godzilla', 0.024981686808716735),
 ('Guardians of the Galaxy', -0.023637859538798978),
 ('300: Rise of an Empire', -0.038479217627049797),
 ('How to Train your Dragon 2', -0.059165175911904386),
 ('Big Hero 6', -0.0786067182032716),
 ('The Hunger Games: Mockingjay - Part 1', -0.11287306049434476),
 ('The Lego Movie', -0.13387160543910256),
 ('The Hobbit', -0.18935397430930867),
 ('American Sniper', -0.21821936037851608)]

In [257]:
# These recommendations are not bad.  I would agree with several of the recommendations.

###Question 3: Bonus Question...  For all the movies you haven't seen, can you predict how you'd rate them using your the class reviews? (10 pts)

In [259]:
# I grabbed the code from here: https://dataaspirant.com/?s=recommend
# And tweaked it quite a bit
def pearson_correlation(person1,person2):
    person1 = M.ix[person1]
    person2 = M.ix[person2]
    
    # To get both rated items
    p1_rated = {}
    p2_rated = {}
    for x in range(0, (len(person1)-1)):
        if ~(np.isnan(person1[x])):
            if ~(np.isnan(person2[x])):
                p1_rated[x] = person1[x]
                p2_rated[x] = person2[x]

    number_of_ratings = len(p1_rated) 

    # Checking for number of ratings in common
    if number_of_ratings == 0:
        return 0

    # Add up all the preferences of each user
    person1_preferences_sum = sum([p1_rated[item] for item in p1_rated])
    person2_preferences_sum = sum([p2_rated[item] for item in p2_rated])

    # Sum up the squares of preferences of each user
    person1_square_preferences_sum = sum([pow(p1_rated[item],2) for item in p1_rated])
    person2_square_preferences_sum = sum([pow(p2_rated[item],2) for item in p2_rated])


    # Sum up the product value of both preferences for each item
    product_sum_of_both_users = 0
    for x in p1_rated:
        product = (p1_rated[x] * p2_rated[x])
        product_sum_of_both_users += product

    # Calculate the pearson score
    numerator_value = product_sum_of_both_users - (person1_preferences_sum*person2_preferences_sum/number_of_ratings)
    denominator_value = float(sqrt((person1_square_preferences_sum - pow(person1_preferences_sum,2)/number_of_ratings) * (person2_square_preferences_sum -pow(person2_preferences_sum,2)/number_of_ratings)))
    if denominator_value == 0:
        return 0
    else:
        r = numerator_value/float(denominator_value)
        return r 

In [260]:
def most_similar_users(person,number_of_users):
    # returns the number_of_users (similar persons) for a given specific person.
    scores = [(pearson_correlation(person,other_person),other_person)for other_person in index if other_person != person ]
 
    # Sort the similar persons so that highest scores person will appear at the first
    scores.sort()
    scores.reverse()
    return scores[0:number_of_users]

In [261]:
print most_similar_users('Heather Sanders', 73)

[(1.0, 'ugesh reddy challa'), (1.0, 'sai bhargav musuluri'), (1.0, 'Aswini kumar'), (0.8703882797784892, 'Sunil Nandikanti'), (0.86602540378444015, 'Aarti Jaiswal'), (0.86602540378443849, 'Akhilesh'), (0.81649658092772615, 'Jared Knowles'), (0.77459666924148318, 'Shiva rama raju Chekuri'), (0.53452248382484924, 'Trudy Kline'), (0.53452248382484902, 'veerendra battula'), (0.53452248382484813, 'James Goodpasture'), (0.49999999999999933, 'Olumuyiwa Durojaiye'), (0.49143609346716077, 'Graham Bullard'), (0.42008402520840121, 'Anirudh'), (0.41522739926869928, 'Arun Vedere'), (0.40824829046386091, 'Syed Abbas Hussaini'), (0.37387825055298302, 'Rajesh Kinkirla'), (0.35355339059327595, 'Sai Sampath'), (0.35355339059327595, 'Aishwarya reddy'), (0.35355339059327451, 'M A Nissar'), (0.31622776601683855, 'surya chandra reddy kovvuri'), (0.25819888974716121, 'Aditya Dharmasagar'), (0.24999999999999944, 'Anirudh Thota'), (0.23830156190969951, 'HJ SONG'), (0.23809523809523808, 'solomon choppara'), (0.

In [262]:
def user_recommendations(person):
    p1 = M.ix[person]
    # Gets recommendations for a person by using a weighted average of every other user's rankings
    totals = {}
    simSums = {}
    rankings_list =[]
    for other in index:
        p2 = M.ix[other]

        # don't compare me to myself
        if other == person:
            continue
        sim = float(pearson_correlation(person,other))

        # ignore scores of zero or lower
        if sim <=0:
            continue
            
        for x in range(0, len(p2)-1):
            if np.isnan(p1[x]):
                if ~(np.isnan(p2[x])):
                    # Similarity * score
                    prod = p2[x] * sim
                    totals.update({p2.index[x]: prod})
                    # sum of similarities
                    simSums.update({p2.index[x]: sim})
                    
    # Create the normalized list

    rankings = [(total/simSums[item],item) for item,total in totals.items()]
    rankings.sort()
    rankings.reverse()
    # returns the recommended items
    
    recommendataions_list = [recommend_item for score,recommend_item in rankings]
    return recommendataions_list
    
print "Recommendations for Heather"
print user_recommendations('Heather Sanders')

Recommendations for Heather
['Interstellar', 'Godzilla', 'American Sniper', 'Guardians of the Galaxy', 'Gone Girl']
