In [1]:
import pandas as pd
import numpy as np

In [2]:
#download the class movie reviews from here:
#https://docs.google.com/spreadsheets/d/17rCJzmWxqvAu9rkpkgt4ToccIlY4A1Ffuu1W9X3B8Ag/
#then read that in as a pandas dataframe

In [3]:
M = pd.read_csv('Movie Reviews.csv', index_col='Name')

In [4]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [5]:
def get_recs(movie_name, M, num):

    import numpy as np
    reviews = []
    for title in M.columns:
        if title == movie_name:
            continue
        cor = pearson(M[movie_name], M[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((title, cor))
    
    reviews.sort(key=lambda tup: tup[1], reverse=True)
    return reviews[:num]

### Question 1:  What movie is most Similar to 'The fault in our stars' (60 pts)


In [6]:
# We can see that the movie most similar to 'The Fault in Our Stars' is '
# Malificent'. Is it weird that I woke up thinking about the movie 
# Malificent today?
recs = get_recs('The Fault in Our Stars', M, 5)
recs

[('Malificent', 0.19685382007557589),
 ('Interstellar', 0.19096909400891598),
 ('Divergent', 0.14138905806703553),
 ('How to Train your Dragon 2', 0.13844260886450729),
 ('Guardians of the Galaxy', 0.12932884801223818)]

### Question 2:  Which movie(s) would you most like to see, based on your classmates experience? (40 pts)

In [7]:
# The movies I've seen are those that I've rated: my_ratings
my_ratings = M.loc['Karla Zillner'].dropna()
my_ratings

The Hunger Games: Mockingjay - Part 1    5.0
Guardians of the Galaxy                  4.0
The Lego Movie                           2.0
The Hobbit                               3.0
Transformers                             2.0
Malificent                               3.0
Godzilla                                 5.0
Interstellar                             4.0
How to Train your Dragon 2               2.0
Divergent                                5.0
300: Rise of an Empire                   5.0
Name: Karla Zillner, dtype: float64

In [8]:
# The movies I've rated sorted according to rating. 
my_sorted_ratings=my_ratings.sort_values(ascending=False)
my_sorted_ratings

300: Rise of an Empire                   5.0
Divergent                                5.0
Godzilla                                 5.0
The Hunger Games: Mockingjay - Part 1    5.0
Interstellar                             4.0
Guardians of the Galaxy                  4.0
Malificent                               3.0
The Hobbit                               3.0
How to Train your Dragon 2               2.0
Transformers                             2.0
The Lego Movie                           2.0
Name: Karla Zillner, dtype: float64

In [9]:
# My favorite movie according to the sorted list of movies I've rated is:
my_sorted_ratings.index[0]

'300: Rise of an Empire'

In [10]:
# These are the correlations between my favorite movie and all other movies.
corrs = get_recs(my_sorted_ratings.index[0], M, 16)
corrs

[('Divergent', 0.31506733582377494),
 ('American Sniper', 0.141848971461246),
 ('Malificent', 0.12701522786962982),
 ('Transformers', 0.12264836559060582),
 ('Godzilla', 0.094653883983967874),
 ('The Hunger Games: Mockingjay - Part 1', 0.087371232556890013),
 ('Unbroken', 0.075354667615717727),
 ('Gone Girl', 0.070297916401539054),
 ('The Hobbit', 0.053325897788541222),
 ('How to Train your Dragon 2', 0.037146231979000785),
 ('Interstellar', -0.0057772421900600763),
 ('Big Hero 6', -0.011697108075607048),
 ('The Fault in Our Stars', -0.061294011326189106),
 ('The Lego Movie', -0.10850183261767435),
 ('Guardians of the Galaxy', -0.2886063706514535)]

In [11]:
# Recommendations for me based on my favorite movie. 
# Notice that I used brute force to weed out the movies I've watched.
# There is probably some easier set difference computation that I could use.

# I will definitely watch American Sniper.

my_rated_movies=my_ratings.index[0:]
for n in range(my_rated_movies.shape[0]):
    if corrs[n][0] in my_rated_movies:
        continue
    if (corrs[n][1] < 0):
        continue
    else:
        print(corrs[n])

('American Sniper', 0.141848971461246)
('Unbroken', 0.075354667615717727)
('Gone Girl', 0.070297916401539054)


### Question 3: Bonus Question...  For all the movies you haven't seen, can you predict how you'd rate them using your the class reviews? (10 pts)

In [12]:
# Here are the movies I have not rated and thus have not watched. 
# Notice that here the set difference worked out nicely. 
not_rated = set(M.columns) - set(my_rated_movies)
not_rated

{'American Sniper',
 'Big Hero 6',
 'Gone Girl',
 'The Fault in Our Stars',
 'Unbroken'}

In [13]:
# Let's see what happens when I compare myself to the other students. 
# Here is the list of students. 
M.index[0:]

Index(['Aarti Jaiswal', 'Aditya Dharmasagar', 'Adrian Cavallaris',
       'Ahmed Muheebuddin', 'Aishwarya reddy', 'Akhilesh', 'Andrew Webb',
       'Anirudh', 'Anirudh Thota', 'anurag',
       ...
       'Mark Dang', 'Jensen Hou', 'Moses Stanley', 'Vibhor Sharma',
       'Jeremy Embalabala', 'Jason Schenck', 'Jason Burrell', 'Karla Zillner',
       'Eric Maxwell', 'Bertina Nguyen'],
      dtype='object', name='Name', length=130)

In [14]:
# To make it easier, lets create a new matrix by transposing M. 
N = M.transpose()

In [15]:
# This is a modified version of Mike's get_recs(). 
# It finds all the correlations between myself and the other classmates. 
corrs = []
for person in N.columns:
    if person == 'Karla Zillner':
        continue
    corr = pearson(N['Karla Zillner'],N[person])
    if np.isnan(corr):
        continue    
    corrs.append((person,corr))
corrs.sort(key=lambda tup: tup[1], reverse=True)

  """


In [16]:
# It looks as though Veerendra Battula and I have similar taste in movies. 
corrs[:15]

[('veerendra battula', 0.50013049761701556),
 ('Jonathan Doll', 0.47014477211853867),
 ('Adrian Cavallaris', 0.46726642516653394),
 ('Susan Huang', 0.44474958999666075),
 ('Gaurav Khandave', 0.44158207820125872),
 ('anurag', 0.44052107185521777),
 ('ugesh reddy challa', 0.41420562258362703),
 ('Jared Knowles', 0.3998834328985888),
 ('Fred Young', 0.3704169839009317),
 ('Daniel Lee', 0.363480236213583),
 ('David Reyling', 0.35484609468860007),
 ('Gaurav Karale', 0.35323992888558414),
 ('Michael Cooper', 0.34767674768255769),
 ('Jason Burrell', 0.34767674768255769),
 ('Jeremy Embalabala', 0.34726602486028413)]

In [17]:
# Let's see how I might rate the movies I have not watched by using the 
# above information.

# Recall that the movies I have not rated are:
not_rated

{'American Sniper',
 'Big Hero 6',
 'Gone Girl',
 'The Fault in Our Stars',
 'Unbroken'}

In [18]:
# Let's see how people with my similar taste in movies would have rated 
# the movies I haven't watched. 
for movie in not_rated:
    sum_ratings = 0
    count = 0
    for person in corrs[:10]:
        if np.isnan(N[person[0]][movie]):
            continue
        else:
            sum_ratings = sum_ratings + N[person[0]][movie]
            count = count + 1
    if count > 0:
        print('Average Score for',movie, 'is', sum_ratings/count)

Average Score for The Fault in Our Stars is 3.0
Average Score for American Sniper is 3.5
Average Score for Gone Girl is 4.5
Average Score for Unbroken is 5.0
Average Score for Big Hero 6 is 4.5


In [19]:
# We can also see which of those movies I have not yet seen are most like 
# the ones I have seen to determine how I would rate those I haven't seen. 
# So based on my ratings and similarities between movies produced by 
# ge_recs():
for movie in not_rated:
    recs = get_recs(movie, M, 5)
    rec = recs[0][0]
    for r in range(4):
        if rec in not_rated:
            rec = recs[r][0]
    print (movie,'is most like',rec)
    print ('I might rate', movie, 'a', M[rec]['Karla Zillner'])
    print ('')

The Fault in Our Stars is most like Malificent
I might rate The Fault in Our Stars a 3.0

American Sniper is most like The Lego Movie
I might rate American Sniper a 2.0

Gone Girl is most like How to Train your Dragon 2
I might rate Gone Girl a 2.0

Unbroken is most like Divergent
I might rate Unbroken a 5.0

Big Hero 6 is most like How to Train your Dragon 2
I might rate Big Hero 6 a 2.0



In [20]:
# I believe the first solution might be more accurate. 
# I will have to test these predictions by having a movie binge.