In [27]:
import pandas as pd
from collections import defaultdict

In [28]:
pwd

'/Users/ioanwilliams/notebooks/Recommendation Systems/ml-latest-small'

In [29]:
movies = pd.read_csv(r"movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [30]:
all_ratings = pd.read_csv(r"all_ratings.csv")
all_ratings.tail()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating
100723,719,672,148626,4.0
100724,720,672,150548,3.0
100725,721,672,152091,1.0
100726,722,672,159093,3.0
100727,723,672,160438,3.0


In [31]:
def load_movie_lens_data(lens_movies, ratings):
    movies={}
    for _, row in lens_movies.iterrows():
        movies[row["movieId"]] = row["title"]
    
    prefs=defaultdict(dict)
    for _, row in ratings.iterrows():
        if row["movieId"] in movies:
            prefs[row["userId"]][movies[row["movieId"]]] = float(row["rating"])
    return prefs

In [32]:
prefs = load_movie_lens_data(movies, all_ratings)

In [33]:
from math import sqrt
def sim_pearson(prefs,o1,o2):
    # Get the list of mutually rated items
    si={}
    for item in prefs[o1]:
        if item in prefs[o2]: 
            si[item]=1
    
    # Find the number of elements
    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: 
        return 0
    
    # Add up all the preferences
    sum1=sum([prefs[o1][it] for it in si])
    sum2=sum([prefs[o2][it] for it in si])
    
    # Sum up the squares
    sum1Sq=sum([pow(prefs[o1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[o2][it],2) for it in si])
    
    # Sum up the products
    pSum=sum([prefs[o1][it]*prefs[o2][it] for it in si])
    # Calculate Pearson score
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: 
        return 0
    
    r=num/den
    return r

def sim_distance(prefs,o1,o2):
    # Get the list of shared_items
    si={}
    for item in prefs[o1]:
        if item in prefs[o2]:
            si[item]=1

    # if they have no ratings in common, return 0
    if len(si)==0: 
        return 0
    # Add up the squares of all the differences
    sum_of_squares=sum([pow(prefs[o1][item]-prefs[o2][item],2) for item in prefs[o1] if item in prefs[o2]])
    
    return 1/(1+sum_of_squares)

def sim_jaccard(prefs,o1,o2):

    count=0
    for item in prefs[o1]:
        if item in prefs[o2]:
            count+=1

    n1=len(prefs[o1])
    n2=len(prefs[o2])
    count=float(count)

    r=float(count/(n1+n2-count))

    return r

def sim_cosine(prefs,o1,o2):
    list={}
    num=0
    den=0
    for item in prefs[o1]:
        if item in prefs[o2]:
            list[item]=1
            num+=prefs[o1][item]*prefs[o2][item]

    if num==0:
        return 0

    den= pow( sum(pow(prefs[o1][item],2) for item in list)*sum(pow(prefs[o2][item],2) for item in list) , 0.5 )
    num=float(num)

    return num/den

In [34]:
def getRecommendations(prefs,person,similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: 
            continue
        sim=similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim<=0: 
            continue
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
    
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items( )]
    # Return the sorted list
    rankings.sort( )
    rankings.reverse( )
    return rankings

In [35]:
getRecommendations(prefs, 672)[0:10]

[(5.000000000000001, 'Wrong Cops (2013)'),
 (5.000000000000001, 'Wrong (2012)'),
 (5.000000000000001, 'Waiter (Ober) (2006)'),
 (5.000000000000001, 'Voyeur (Abel) (1986)'),
 (5.000000000000001,
  'Through the Olive Trees (Zire darakhatan zeyton) (1994)'),
 (5.000000000000001, 'The Last Days of Emma Blank (2009)'),
 (5.000000000000001, 'The Dress (1996)'),
 (5.000000000000001, 'Schneider vs. Bax (2015)'),
 (5.000000000000001, 'Reality (2014)'),
 (5.000000000000001, 'Patience Stone, The (2012)')]

In [36]:
len(prefs[672])

724

In [37]:
y = prefs[672]
for z in y.keys():
    print(z,":",y[z])

Toy Story (1995) : 3.0
Jumanji (1995) : 2.5
Heat (1995) : 4.0
GoldenEye (1995) : 4.0
American President, The (1995) : 2.0
Casino (1995) : 4.0
Sense and Sensibility (1995) : 2.5
Copycat (1995) : 2.5
Leaving Las Vegas (1995) : 3.0
Dangerous Minds (1995) : 3.0
Twelve Monkeys (a.k.a. 12 Monkeys) (1995) : 4.0
Clueless (1995) : 1.0
Seven (a.k.a. Se7en) (1995) : 5.0
Pocahontas (1995) : 0.5
Usual Suspects, The (1995) : 4.5
Mr. Holland's Opus (1995) : 3.0
Broken Arrow (1996) : 2.5
Happy Gilmore (1996) : 2.5
Braveheart (1995) : 3.5
Bad Boys (1995) : 3.0
Apollo 13 (1995) : 4.0
Canadian Bacon (1995) : 2.0
Crimson Tide (1995) : 4.5
Desperado (1995) : 2.0
Die Hard: With a Vengeance (1995) : 3.0
Net, The (1995) : 2.5
Species (1995) : 2.0
Strange Days (1995) : 2.5
Under Siege 2: Dark Territory (1995) : 2.5
Waterworld (1995) : 2.5
Clerks (1994) : 3.0
Dumb & Dumber (Dumb and Dumber) (1994) : 1.5
Interview with the Vampire: The Vampire Chronicles (1994) : 2.5
Junior (1994) : 1.0
Star Wars: Episode IV - A

In [38]:
getRecommendations(prefs, 672,similarity=sim_cosine)[0:10]

[(5.000000000000001, 'World of Tomorrow (2015)'),
 (5.000000000000001, 'Woman on Top (2000)'),
 (5.000000000000001, 'Victoria (2015)'),
 (5.000000000000001, 'Two Escobars, The (2010)'),
 (5.000000000000001, 'Thief of Bagdad, The (1924)'),
 (5.000000000000001, 'The Car (1977)'),
 (5.000000000000001, 'Survive and Advance (2013)'),
 (5.000000000000001, 'Six Shooter (2004)'),
 (5.000000000000001, 'Room, The (2003)'),
 (5.000000000000001, 'Robin Williams: Weapons of Self Destruction (2009)')]

In [39]:
def getRecommendedItems(prefs,itemMatch,user):
    userRatings=prefs[user]
    scores={}
    totalSim={}

    # Loop over items rated by this user
    for (item,rating) in userRatings.items():
        # Loop over items similar to this one
        for (similarity,item2) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings: continue
            # Weighted sum of rating times similarity
            scores.setdefault(item2,0)
            scores[item2]+=similarity*rating
            # Sum of all the similarities
            totalSim.setdefault(item2,0)
            totalSim[item2]+=similarity
    # Divide each total score by total weighting to get an average
    rankings=[(round(score/totalSim[item],1),item) for item,score in scores.items( ) if totalSim[item] !=0]
    # Return the rankings from highest to lowest
    rankings.sort( )
    rankings.reverse( )
    return rankings

def calculateSimilarItems(prefs,n=10):
    # Create a dictionary of items showing which other items they
    # are most similar to.
    result={}
    # Invert the preference matrix to be item-centric
    itemPrefs=transformPrefs(prefs)
    c=0
    for item in itemPrefs:
        # Status updates for large datasets
        c+=1
        if c%1000==0: print("%d / %d" % (c,len(itemPrefs)))
        # Find the most similar items to this one
        scores=topMatches(itemPrefs,item,n=n,similarity=sim_pearson)
        result[item]=scores
    return result

def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
            # Flip item and person
            result[item][person]=prefs[person][item]
    return result

# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    scores=[(similarity(prefs,person,other),other) for other in prefs if other!=person]
    # Sort the list so the highest scores appear at the top
    scores.sort( )
    scores.reverse( )
    return scores[0:n]

In [40]:
for user in prefs:
    if user == 15.0:
        print("user: ",user)
        ratings = prefs[user]
        for a, b in ratings.items():
            print("*",a,":",b)

user:  15.0
* Toy Story (1995) : 2.0
* Jumanji (1995) : 2.0
* Father of the Bride Part II (1995) : 4.5
* Heat (1995) : 4.0
* GoldenEye (1995) : 3.0
* American President, The (1995) : 2.5
* Nixon (1995) : 2.5
* Casino (1995) : 3.5
* Sense and Sensibility (1995) : 3.0
* Ace Ventura: When Nature Calls (1995) : 1.0
* Get Shorty (1995) : 4.5
* Copycat (1995) : 2.5
* Leaving Las Vegas (1995) : 3.0
* Twelve Monkeys (a.k.a. 12 Monkeys) (1995) : 4.0
* Babe (1995) : 3.0
* Dead Man Walking (1995) : 1.0
* Clueless (1995) : 2.5
* Mortal Kombat (1995) : 3.0
* Seven (a.k.a. Se7en) (1995) : 5.0
* Usual Suspects, The (1995) : 5.0
* Mighty Aphrodite (1995) : 2.5
* Mr. Holland's Opus (1995) : 2.0
* From Dusk Till Dawn (1996) : 0.5
* Antonia's Line (Antonia) (1995) : 5.0
* Beautiful Girls (1996) : 3.0
* Broken Arrow (1996) : 1.5
* Bottle Rocket (1996) : 4.0
* Happy Gilmore (1996) : 1.0
* Muppet Treasure Island (1996) : 2.0
* Braveheart (1995) : 3.0
* Taxi Driver (1976) : 5.0
* Rumble in the Bronx (Hont fa

In [41]:
for user in prefs:
    ratings = prefs[user]
    print("user: ",user,":","number: ", len(ratings))

user:  1.0 : number:  20
user:  2.0 : number:  76
user:  3.0 : number:  51
user:  4.0 : number:  204
user:  5.0 : number:  100
user:  6.0 : number:  44
user:  7.0 : number:  88
user:  8.0 : number:  116
user:  9.0 : number:  45
user:  10.0 : number:  46
user:  11.0 : number:  38
user:  12.0 : number:  61
user:  13.0 : number:  53
user:  14.0 : number:  20
user:  15.0 : number:  1700
user:  16.0 : number:  29
user:  17.0 : number:  363
user:  18.0 : number:  51
user:  19.0 : number:  423
user:  20.0 : number:  98
user:  21.0 : number:  162
user:  22.0 : number:  220
user:  23.0 : number:  726
user:  24.0 : number:  21
user:  25.0 : number:  26
user:  26.0 : number:  172
user:  27.0 : number:  23
user:  28.0 : number:  50
user:  29.0 : number:  22
user:  30.0 : number:  1011
user:  31.0 : number:  69
user:  32.0 : number:  48
user:  33.0 : number:  138
user:  34.0 : number:  187
user:  35.0 : number:  20
user:  36.0 : number:  104
user:  37.0 : number:  32
user:  38.0 : number:  111
user

In [42]:
itemsim=calculateSimilarItems(prefs,n=50)

1000 / 9064
2000 / 9064
3000 / 9064
4000 / 9064
5000 / 9064
6000 / 9064
7000 / 9064
8000 / 9064
9000 / 9064


In [43]:
itemPrefs=transformPrefs(x)
print(len(itemPrefs))
for item in itemPrefs:
    ratings = itemPrefs[item]
    print("Movie:",item,":","Ratings:", len(ratings))

NameError: name 'x' is not defined

In [None]:
item = "Spice World (1997)"
ratings = itemPrefs[item]
scores=[(sim_pearson(itemPrefs,item,other),other) for other in itemPrefs if other!=item]
# Sort the list so the highest scores appear at the top
scores.sort( )
scores.reverse( )
scores

In [None]:
item = "Spice World (1997)"
print(itemPrefs[item])
item = "Zack and Miri Make a Porno (2008)"
print(itemPrefs[item])

In [None]:
p1 = "Spice World (1997)"
p2 = "Zack and Miri Make a Porno (2008)"
si={}
for item in itemPrefs[p1]:
    if item in itemPrefs[p2]: 
        si[item]=1

x=[itemPrefs[p1][it] for it in si]
y=[itemPrefs[p2][it] for it in si]
print(x)
print(y)
plt.scatter(x, y)
plt.show()

In [None]:
%matplotlib inline
x, y = list(itemPrefs["Spice World (1997)"].keys()), list(itemPrefs["Spice World (1997)"].values())
import seaborn as sns;
sns.set()
import matplotlib as mpl
mpl.rcParams['font.family'] = 'serif'
import matplotlib.pyplot as plt
#fig, ax = plt.subplots(figsize=(16, 9))  # lets do wide screen ratio
#ax.plot(x,y);
plt.scatter(x, y)
plt.show()

In [None]:
%matplotlib inline
x, y = list(itemPrefs["Zack and Miri Make a Porno (2008)"].keys()), list(itemPrefs["Zack and Miri Make a Porno (2008)"].values())
import seaborn as sns;
sns.set()
import matplotlib as mpl
mpl.rcParams['font.family'] = 'serif'
import matplotlib.pyplot as plt
#fig, ax = plt.subplots(figsize=(16, 9))  # lets do wide screen ratio
#ax.plot(x,y);
plt.scatter(x, y)
plt.show()

In [44]:
getRecommendedItems(prefs,itemsim,672)[0:30]

[(5.0, 'Orphanage, The (Orfanato, El) (2007)'),
 (5.0, 'No Mercy (1986)'),
 (5.0, 'Kandahar (Safar e Ghandehar) (2001)'),
 (5.0, 'Front, The (1976)'),
 (5.0, 'Elf (2003)'),
 (5.0, 'Confessions of a Teenage Drama Queen (2004)'),
 (5.0, 'Cage aux Folles, La (1978)'),
 (5.0, 'Cabaret (1972)'),
 (5.0, 'Before and After (1996)'),
 (4.8, 'Weekend (2011)'),
 (4.8, 'Into the Arms of Strangers: Stories of the Kindertransport (2000)'),
 (4.5, 'Tenant, The (Locataire, Le) (1976)'),
 (4.5, 'Resident Evil: Extinction (2007)'),
 (4.5, 'Raging Bull (1980)'),
 (4.5, 'Pitch Perfect 2 (2015)'),
 (4.5, 'Phil Spector (2013)'),
 (4.5, 'Penny Serenade (1941)'),
 (4.5, 'Out of Sight (1998)'),
 (4.5, 'October Sky (1999)'),
 (4.5, "Mood Indigo (L'écume des jours) (2013)"),
 (4.5, 'Ip Man (2008)'),
 (4.5, 'Heidi (1937)'),
 (4.5, 'Foxcatcher (2014)'),
 (4.5, 'Fast Runner, The (Atanarjuat) (2001)'),
 (4.5, 'Fantastic Four (2005)'),
 (4.5, 'Faculty, The (1998)'),
 (4.5, 'City Hall (1996)'),
 (4.5, 'Catch-22 (1970)

In [None]:
userRatings=x[251276]
scores={}
totalSim={}

# Loop over items rated by this user
for (item,rating) in userRatings.items():
    #if item != 'The Blair Witch Project':
    #    continue
    print(item,":", rating)
    # Loop over items similar to this one
    for (similarity,item2) in itemsim[item]:
        # Ignore if this user has already rated this item
        if item2 in userRatings: 
            continue
        print("*",round(similarity,1),":", item2)
        # Weighted sum of rating times similarity
        scores.setdefault(item2,0)
        scores[item2]+=similarity*rating
        # Sum of all the similarities
        totalSim.setdefault(item2,0)
        totalSim[item2]+=similarity

#for item,score in scores.items():
#    print('$',item,":", score,":",totalSim[item])
#    print('$$',item,":", score/totalSim[item])
        
# Divide each total score by total weighting to get an average
rankings=[(round(score/totalSim[item],1),item) for item,score in scores.items() if totalSim[item] !=0]

rankings.sort( )
rankings.reverse( )