In [None]:
import pandas as pd
from collections import defaultdict

In [None]:
# Uncomment the lines below on first use to download and unzip the data, then comment them out again.
#!curl -O http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
#!unzip ml-latest-small.zip

#!curl -O http://files.grouplens.org/datasets/movielens/ml-latest.zip
#!unzip ml-latest.zip

In [None]:
cd ml-latest-small

In [None]:
# Personal ratings export: drop the descriptive columns, tag every row with
# user id 251276 and rename the movie key to match the MovieLens column name.
my_ratings = (
    pd.read_csv(r"../movielens-ratings.csv")
    .drop(columns=['title', 'average_rating', 'imdb_id', 'tmdb_id'])
    .assign(userId=251276)
    .rename(columns={'movie_id': 'movieId'})
)
my_ratings.head()

In [None]:
# MovieLens ratings, loaded without the timestamp column.
ratings = pd.read_csv(r"ratings.csv").drop(columns='timestamp')
print(ratings.dtypes)
ratings.head()

In [None]:
# Union my ratings with the official MovieLens ratings into a single frame.
# sort=True sorts the column axis when the two frames' columns are not already
# aligned. No ignore_index is passed, so each frame keeps its own row labels.
all_ratings = pd.concat([ratings,my_ratings], sort=True)
all_ratings.tail()

In [None]:
# Movie metadata; load_movie_lens_data below reads its "movieId" and "title" columns.
movies = pd.read_csv(r"movies.csv")
print(len(movies))
movies.head()

In [None]:
def load_movie_lens_data(lens_movies, ratings):
    """Build a nested ``{userId: {movie title: rating}}`` preference mapping.

    Args:
        lens_movies: DataFrame with "movieId" and "title" columns.
        ratings: DataFrame with "userId", "movieId" and "rating" columns.

    Returns:
        A ``defaultdict(dict)`` keyed by user id. Each value maps a movie title
        to that user's rating as a float. Ratings whose movieId does not appear
        in ``lens_movies`` are skipped.
    """
    # Iterate the columns directly instead of DataFrame.iterrows(): iterrows
    # builds a Series per row (slow on ~100k ratings) and upcasts a row that
    # mixes integer ids with a float rating to float, turning ids into floats.
    titles = dict(zip(lens_movies["movieId"], lens_movies["title"]))

    prefs = defaultdict(dict)
    for user_id, movie_id, rating in zip(
        ratings["userId"], ratings["movieId"], ratings["rating"]
    ):
        if movie_id in titles:
            prefs[user_id][titles[movie_id]] = float(rating)
    return prefs

In [None]:
# User-centric preferences: {userId: {movie title: rating}} for MovieLens plus my own ratings.
x = load_movie_lens_data(movies, all_ratings)

In [None]:
from math import sqrt
def sim_pearson(prefs,o1,o2):
    """Pearson correlation between the ratings two keys have in common.

    Args:
        prefs: mapping ``{key: {item: rating}}``.
        o1, o2: the two keys of ``prefs`` to compare.

    Returns:
        The correlation coefficient in [-1, 1], or 0 when the two have no
        rated items in common or when either side's shared ratings have no
        variance (the coefficient is undefined there).
    """
    # Items rated by both
    shared = [item for item in prefs[o1] if item in prefs[o2]]
    n = len(shared)
    if n == 0:
        return 0

    ratings1 = [prefs[o1][it] for it in shared]
    ratings2 = [prefs[o2][it] for it in shared]

    # Sums, sums of squares and sum of products
    sum1 = sum(ratings1)
    sum2 = sum(ratings2)
    sum1Sq = sum(pow(v, 2) for v in ratings1)
    sum2Sq = sum(pow(v, 2) for v in ratings2)
    pSum = sum(a * b for a, b in zip(ratings1, ratings2))

    num = pSum - (sum1 * sum2 / n)
    var1 = sum1Sq - pow(sum1, 2) / n
    var2 = sum2Sq - pow(sum2, 2) / n
    # Floating-point rounding can leave a (mathematically non-negative) variance
    # term slightly below zero; treat that as "no variance" instead of letting
    # sqrt() raise a domain error or produce a meaningless ratio.
    if var1 <= 0 or var2 <= 0:
        return 0
    den = sqrt(var1 * var2)
    if den == 0:
        return 0

    # Clamp away rounding overshoot such as 1.0000000000000002
    return max(-1.0, min(1.0, num / den))

def sim_distance(prefs,o1,o2):
    """Distance-based similarity over the items both keys have rated.

    Returns 0 when there are no items in common, otherwise
    ``1 / (1 + sum of squared rating differences)``, so identical ratings
    score 1 and larger disagreements approach 0.
    """
    # Items rated by both
    shared = [item for item in prefs[o1] if item in prefs[o2]]
    if not shared:
        return 0

    # Squared difference for each shared item
    squared_gaps = [pow(prefs[o1][item]-prefs[o2][item],2) for item in shared]
    return 1/(1+sum(squared_gaps))

def sim_jaccard(prefs,o1,o2):
    """Jaccard similarity of the two keys' rated-item sets.

    Only which items were rated matters; the rating values are ignored.

    Returns:
        ``|intersection| / |union|`` as a float, or 0 when neither key has
        rated anything (the ratio is undefined for an empty union).
    """
    first, second = prefs[o1], prefs[o2]
    shared = sum(1 for item in first if item in second)
    union = len(first) + len(second) - shared
    # Both sets empty: avoid dividing by zero
    if union == 0:
        return 0
    return shared / float(union)

def sim_cosine(prefs,o1,o2):
    """Cosine similarity computed over the items both keys have rated.

    Returns 0 when the dot product of the shared ratings is 0 (which includes
    the case of no shared items); otherwise the dot product divided by the
    product of the two rating-vector norms, both restricted to shared items.
    """
    shared = []
    dot = 0
    for item in prefs[o1]:
        if item in prefs[o2]:
            shared.append(item)
            dot += prefs[o1][item]*prefs[o2][item]

    if dot == 0:
        return 0

    # Squared norms of each side, restricted to the shared items
    norm1_sq = sum(pow(prefs[o1][item],2) for item in shared)
    norm2_sq = sum(pow(prefs[o2][item],2) for item in shared)
    return float(dot) / pow(norm1_sq*norm2_sq, 0.5)

In [None]:
def getRecommendations(prefs,person,similarity=sim_pearson):
    """User-based recommendations for ``person``.

    Every other key in ``prefs`` with a positive similarity to ``person``
    contributes its ratings, weighted by that similarity, for the items
    ``person`` has not rated (or has rated 0).

    Returns:
        A list of ``(predicted score, item)`` tuples, highest score first,
        where the score is the similarity-weighted average rating.
    """
    weighted_totals = {}
    similarity_sums = {}
    for other in prefs:
        # Skip comparing the person to themselves
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        # Only positively-similar users contribute
        if sim <= 0:
            continue
        for item, rating in prefs[other].items():
            already_rated = item in prefs[person] and prefs[person][item] != 0
            if already_rated:
                continue
            weighted_totals[item] = weighted_totals.get(item, 0) + rating*sim
            similarity_sums[item] = similarity_sums.get(item, 0) + sim

    # Normalise by the total similarity, then order best-first
    rankings = sorted(
        (total/similarity_sums[item], item) for item, total in weighted_totals.items()
    )
    rankings.reverse()
    return rankings

In [None]:
# Top 10 user-based recommendations for user 251276 with the default Pearson similarity.
getRecommendations(x, 251276)[0:10]

In [None]:
# Number of movies rated by user 251276. The id must be looked up as a number:
# x is a defaultdict, so the string key '251276' would silently insert a new
# empty user into x and report 0.
len(x[251276])

In [None]:
# Print every title rated by user 251276 together with its rating.
for title, rating in x[251276].items():
    print(title,":",rating)

In [None]:
# Same top-10 recommendation query, using cosine similarity instead of Pearson.
getRecommendations(x, 251276,similarity=sim_cosine)[0:10]

In [None]:
def getRecommendedItems(prefs,itemMatch,user):
    """Item-based recommendations for ``user``.

    Args:
        prefs: mapping ``{user: {item: rating}}``.
        itemMatch: mapping ``{item: [(similarity, other item), ...]}``.
        user: key of ``prefs`` to recommend for.

    Returns:
        A list of ``(score, item)`` tuples, highest first, for items the user
        has not rated. The score is the similarity-weighted average of the
        user's ratings, rounded to one decimal. Items whose similarities sum
        to 0 are left out.
    """
    rated = prefs[user]
    weighted = {}
    weight_sums = {}

    for item, rating in rated.items():
        for similarity, candidate in itemMatch[item]:
            # Only items the user has not rated yet are candidates
            if candidate not in rated:
                weighted[candidate] = weighted.get(candidate, 0) + similarity*rating
                weight_sums[candidate] = weight_sums.get(candidate, 0) + similarity

    # Weighted average per candidate, ordered best-first
    rankings = sorted(
        (round(score/weight_sums[candidate],1), candidate)
        for candidate, score in weighted.items()
        if weight_sums[candidate] != 0
    )
    rankings.reverse()
    return rankings

def calculateSimilarItems(prefs,n=10,similarity=None):
    """Map every item to the list of items most similar to it.

    Args:
        prefs: user-centric mapping ``{user: {item: rating}}``.
        n: how many similar items to keep per item.
        similarity: similarity function ``f(prefs, a, b)``; defaults to
            ``sim_pearson``. Exposed so callers can choose the metric, as
            ``topMatches`` and ``getRecommendations`` already allow.

    Returns:
        ``{item: [(similarity, other item), ...]}`` with up to ``n`` entries
        per item, best match first.
    """
    # Resolved at call time so the default does not need to exist at def time
    if similarity is None:
        similarity = sim_pearson

    # Invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    total = len(itemPrefs)

    result = {}
    for c, item in enumerate(itemPrefs, start=1):
        # Status updates for large datasets
        if c % 1000 == 0:
            print("%d / %d" % (c, total))
        result[item] = topMatches(itemPrefs, item, n=n, similarity=similarity)
    return result

def transformPrefs(prefs):
    """Invert ``{person: {item: rating}}`` into ``{item: {person: rating}}``."""
    by_item = {}
    for person, person_prefs in prefs.items():
        for item, rating in person_prefs.items():
            # Swap the roles of item and person
            by_item.setdefault(item, {})[person] = rating
    return by_item

def topMatches(prefs,person,n=5,similarity=sim_pearson):
    """Return the best matches for ``person`` from the ``prefs`` dictionary.

    The number of results ``n`` and the ``similarity`` function are optional.
    The result is a list of ``(similarity, other key)`` tuples, highest first.
    """
    scored = []
    for other in prefs:
        if other != person:
            scored.append((similarity(prefs, person, other), other))
    # Ascending sort, flipped so the highest scores come first
    return sorted(scored)[::-1][:n]

In [None]:
# Print the ratings of one user. A direct key lookup replaces the scan over
# every user, and the loop no longer rebinds `ratings`, which is the name of
# the MovieLens ratings DataFrame loaded earlier.
user = 15.0
if user in x:
    print("user: ",user)
    for title, rating in x[user].items():
        print("*",title,":",rating)

In [None]:
# Show how many movies each user has rated. A dedicated loop name is used so
# the `ratings` DataFrame loaded earlier is not overwritten.
for user, user_ratings in x.items():
    print("user: ",user,":","number: ", len(user_ratings))

In [None]:
# Precompute, for every movie, its 50 most similar movies.
# calculateSimilarItems prints a progress line every 1000 items.
itemsim=calculateSimilarItems(x,n=50)

In [None]:
# Item-centric view of the preferences: {movie title: {userId: rating}}.
itemPrefs=transformPrefs(x)
print(len(itemPrefs))
# Number of ratings per movie. A dedicated loop name is used so the `ratings`
# DataFrame loaded earlier is not overwritten.
for item, item_ratings in itemPrefs.items():
    print("Movie:",item,":","Ratings:", len(item_ratings))

In [None]:
# Rank every other movie by its Pearson similarity to the chosen movie,
# highest scores first. (The unused `ratings = itemPrefs[item]` assignment was
# removed: it overwrote the ratings DataFrame loaded earlier.)
item = "Spice World (1997)"
scores = sorted(
    ((sim_pearson(itemPrefs,item,other),other) for other in itemPrefs if other!=item),
    reverse=True,
)
scores

In [None]:
# Show the raw {userId: rating} mappings of the two movies being compared.
for item in ("Spice World (1997)", "Zack and Miri Make a Porno (2008)"):
    print(itemPrefs[item])

In [None]:
# Scatter the ratings given by the users who rated both movies.
# pyplot is imported here because the notebook's other pyplot import sits in a
# later cell, so on a fresh kernel `plt` would not exist yet.
import matplotlib.pyplot as plt

p1 = "Spice World (1997)"
p2 = "Zack and Miri Make a Porno (2008)"
# Users that rated both movies
shared_users = [user for user in itemPrefs[p1] if user in itemPrefs[p2]]

# Dedicated names are used on purpose: `x` holds the user preference dict that
# later cells still need, so it must not be overwritten here.
p1_scores = [itemPrefs[p1][user] for user in shared_users]
p2_scores = [itemPrefs[p2][user] for user in shared_users]
print(p1_scores)
print(p2_scores)
plt.scatter(p1_scores, p2_scores)
plt.show()

In [None]:
%matplotlib inline
x, y = list(itemPrefs["Spice World (1997)"].keys()), list(itemPrefs["Spice World (1997)"].values())
import seaborn as sns;
sns.set()
import matplotlib as mpl
mpl.rcParams['font.family'] = 'serif'
import matplotlib.pyplot as plt
#fig, ax = plt.subplots(figsize=(16, 9))  # lets do wide screen ratio
#ax.plot(x,y);
plt.scatter(x, y)
plt.show()

In [None]:
%matplotlib inline
x, y = list(itemPrefs["Zack and Miri Make a Porno (2008)"].keys()), list(itemPrefs["Zack and Miri Make a Porno (2008)"].values())
import seaborn as sns;
sns.set()
import matplotlib as mpl
mpl.rcParams['font.family'] = 'serif'
import matplotlib.pyplot as plt
#fig, ax = plt.subplots(figsize=(16, 9))  # lets do wide screen ratio
#ax.plot(x,y);
plt.scatter(x, y)
plt.show()

In [None]:
# Top 30 item-based recommendations for user 251276, using the precomputed item similarities.
# NOTE(review): requires `x` to still be the user preference dict at this point.
getRecommendedItems(x,itemsim,251276)[0:30]

In [None]:
# Step-by-step trace of the same computation getRecommendedItems performs for
# user 251276, printing each rated movie and the unseen similar movies that
# contribute to the scores. `rankings` is built at the end but not displayed.
userRatings=x[251276]
scores={}
totalSim={}

# Loop over items rated by this user
for (item,rating) in userRatings.items():
    #if item != 'The Blair Witch Project':
    #    continue
    print(item,":", rating)
    # Loop over items similar to this one
    for (similarity,item2) in itemsim[item]:
        # Ignore if this user has already rated this item
        if item2 in userRatings: 
            continue
        print("*",round(similarity,1),":", item2)
        # Weighted sum of rating times similarity
        scores.setdefault(item2,0)
        scores[item2]+=similarity*rating
        # Sum of all the similarities
        totalSim.setdefault(item2,0)
        totalSim[item2]+=similarity

#for item,score in scores.items():
#    print('$',item,":", score,":",totalSim[item])
#    print('$$',item,":", score/totalSim[item])
        
# Divide each total score by total weighting to get an average
# (items whose similarities sum to 0 are skipped to avoid dividing by zero)
rankings=[(round(score/totalSim[item],1),item) for item,score in scores.items() if totalSim[item] !=0]

# Order from highest to lowest score
rankings.sort( )
rankings.reverse( )

In [None]:
# Hold out 20% of all ratings for testing.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
# A fixed random_state makes the split reproducible across runs.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(all_ratings, test_size=0.2, random_state=42)
len(train_data), len(test_data)

In [None]:
# Create two user-item matrices, one for training and another for testing:
# rows are users, columns are movies, cells hold the rating (0 where the user
# has not rated the movie). Both use the same user/movie axes, taken from
# all_ratings, so they have identical shape.
# DataFrame.as_matrix() no longer exists in current pandas, and it only
# returned the raw (userId, movieId, rating) rows rather than a user-item
# matrix, which is what the similarity and prediction cells below operate on.
user_ids = sorted(all_ratings['userId'].unique())
movie_ids = sorted(all_ratings['movieId'].unique())

def to_user_item_matrix(frame):
    """Pivot (userId, movieId, rating) rows into a dense users x movies array."""
    return (
        frame.pivot_table(index='userId', columns='movieId', values='rating', aggfunc='mean')
        .reindex(index=user_ids, columns=movie_ids)
        .fillna(0)
        .values
    )

train_data_matrix = to_user_item_matrix(train_data)
test_data_matrix = to_user_item_matrix(test_data)
train_data_matrix.shape, test_data_matrix.shape

In [None]:
# Item Similarity Matrix
# Similarity = 1 - correlation distance; NaN entries are replaced by 0.
# NOTE(review): pairwise_distances compares the rows of its input, so this is
# a row-by-row similarity of train_data_matrix — confirm that rows are what
# "item" is meant to refer to here.
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
item_correlation = 1 - pairwise_distances(train_data_matrix, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
# Only the last expression of a cell is displayed; the two bare expressions
# below produce no output.
len(item_correlation)
item_correlation
type(train_data_matrix)

In [None]:
def predict(ratings, similarity):
    """Predict ratings as each row's mean plus a similarity-weighted deviation.

    Args:
        ratings: 2-D array; the mean is taken along axis 1 (per row).
        similarity: square 2-D array with one row/column per row of
            ``ratings``.

    Returns:
        Array shaped like ``ratings``. For row ``i`` the prediction is
        ``mean_i + sum_j(sim[i, j] * (ratings[j] - mean_j)) / sum_j(|sim[i, j]|)``.
        Rows whose similarities are all zero get their own mean as the
        prediction instead of NaN/inf from a division by zero.
    """
    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)

    # keepdims keeps these as column vectors so they broadcast against ratings
    mean_user_rating = ratings.mean(axis=1, keepdims=True)
    ratings_diff = ratings - mean_user_rating

    weighted = similarity.dot(ratings_diff)
    norm = np.abs(similarity).sum(axis=1, keepdims=True)
    # Divide only where the normaliser is non-zero; elsewhere the adjustment stays 0
    adjustment = np.divide(weighted, norm, out=np.zeros_like(weighted), where=norm != 0)
    return mean_user_rating + adjustment

In [None]:
# Predicted ratings from the training matrix and the similarity matrix computed above.
item_prediction = predict(train_data_matrix, item_correlation)