In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

#from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#from src.item_recommender import ItemRecommender

import re

In [3]:
# Read in the training file as RATINGS
ratings = pd.read_csv('data/training.csv')
# Convert timestamp to datetime
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

In [4]:
# Create a TRAINING and VALIDATION set. validation set will be the
# last 20% of the training set, ordered by date
ratings = ratings.sort_values(by='timestamp').reset_index(drop=True)
training = ratings.loc[:len(ratings)*.8-1, :].copy()
validation = ratings.loc[len(ratings)*.8:, :].copy()

In [5]:
# Load in USERS
users = pd.read_table('data/users.dat', delimiter='::', header=None, 
                      names=['user', 'sex', 'age', 'occupation', 'zip_code'], 
                      engine='python')

In [54]:
# Load in MOVIES
movies = pd.read_table('data/movies.dat', delimiter='::', header=None, 
                       names=['movie', 'movie_title', 'genre'], 
                       engine='python')
movies['genre'] = movies['genre'].apply(lambda x: x.split("|"))
# Convert genre to a bag of words
movies['genre'] = [','.join(map(str,l)) for l in movies['genre']]
movies.head(1)

Unnamed: 0,movie,movie_title,genre
0,1,Toy Story (1995),"Animation,Children's,Comedy"


## Count Vectorizer

In [61]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(movies['genre'])
indices = pd.Series(movies.movie_title)

In [62]:
class ItemRecommender():
    '''
    Content based item recommender
    '''
    def __init__(self, similarity_measure=None):
        self.similarity_matrix = None
        self.item_names = None

        if similarity_measure == None:
            self.similarity_measure = cosine_similarity
        else:
            self.similarity_measure = similarity_measure

    
    def fit(self, X, index=None):
        '''
        Takes a numpy array of the item attributes and creates the similarity matrix

        INPUT -
            X: NUMPY ARRAY - Rows are items, columns are feature values / or DF
            index: LIST - List of the item names/titles in order of the numpy arrray
        
        OUTPUT - None


        Notes:  You might want to keep titles and X as attributes to refer to them later

        Create the a similarity matrix of item to item similarity
        '''

        # While keeping this as a sparse matrix would be best the cosign sim
        # function returns a array so there is no reason.
        
        if isinstance(X, pd.DataFrame):
            self.item_counts = X
            self.item_names = X.index
            self.similarity_df = pd.DataFrame(self.similarity_measure(X.values, X.values),
                 index = self.item_names)
        else:
            self.item_counts = X
            self.similarity_df = pd.DataFrame(self.similarity_measure(X, X),
                 index = index)
            self.item_names = self.similarity_df.index

        
    def get_recommendations(self, item, n=5):
        '''
        Returns the top n items related to the item passed in
        INPUT:
            item    - STRING - Name of item in the original DataFrame 
            n       - INT    - Number of top related items to return 
        OUTPUT:
            items - List of the top n related item names

        For a given item find the n most similar items to it (this can be done using the similarity matrix created in the fit method)
        '''
        return self.item_names[self.similarity_df.loc[item].values.argsort()[-(n+1):-1]].values[::-1]


    def get_user_profile(self, items):
        '''
        Takes a list of items and returns a user profile. A vector representing the likes of the user.
        INPUT: 
            items  -   LIST - list of movie names user likes / has seen

        OUTPUT: 
            user_profile - NP ARRAY - array representing the likes of the user 
                    The columns of this will match the columns of the trained on matrix
    

        Using the list of items liked by the user create a profile which will be a 1 x number of features array.  This should be the addition of the values for all liked item features (you can choose how to normalize if you think it is needed)
        '''
        user_profile = np.zeros(self.item_counts.shape[1])
        for item in items:
            user_profile += self.item_counts.loc[item].values

        return user_profile


    def get_user_recommendation(self, items, n=5):
        '''
        Takes a list of movies user liked and returns the top n items for that user

        INPUT 
            items  -   LIST - list of movie names user likes / has seen
            n -  INT - number of items to return

        OUTPUT 
            items - LIST - n recommended items

    
        Make use of the get_user_profile method to create a user profile that will be used to get the similarity to all items and recommend the top n.
        '''
        num_items = len(items)
        user_profile = self.get_user_profile(items)

        user_sim =  self.similarity_measure(self.item_counts, user_profile.reshape(1,-1))

        return self.item_names[user_sim[:,0].argsort()[-(num_items+n):-num_items]].values[::-1]

In [63]:
rec = ItemRecommender()
count_df = pd.DataFrame(count_matrix.todense(), index=indices.values)

In [64]:
rec.fit(count_df)

In [67]:
print(rec.get_recommendations('Toy Story (1995)', n=10))

['Rugrats Movie, The (1998)' 'Chicken Run (2000)' 'Saludos Amigos (1943)'
 'American Tail, An (1986)' 'American Tail: Fievel Goes West, An (1991)'
 "Bug's Life, A (1998)" 'Adventures of Rocky and Bullwinkle, The (2000)'
 'Aladdin and the King of Thieves (1996)' 'Toy Story 2 (1999)'
 'Goofy Movie, A (1995)']


## TFIDF

In [84]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genre'])
indices = pd.Series(movies.movie)

In [85]:
rec_tfidf = ItemRecommender()
tfidf_df = pd.DataFrame(tfidf_matrix.todense(), index=indices.values)

In [86]:
rec_tfidf.fit(tfidf_df)

In [89]:
print(rec_tfidf.get_recommendations(148, n=10))

[ 822 3350 2588  375 3347 2590 2592  827  826  824]


### Load in the ALS model's yhat_validation set

In [82]:
yhat_validation_als = pd.read_csv('data/yhat_validation_df_ALS.csv')

In [83]:
yhat_validation_als.head()

Unnamed: 0,user,movie,rating,timestamp,prediction
0,673,148,5,2000-11-30 21:47:04,
1,1242,148,3,2000-11-22 16:19:36,
2,1069,148,2,2000-11-23 02:05:35,
3,1605,148,2,2000-11-22 21:57:01,2.220402
4,1150,148,2,2000-11-22 06:38:26,


In [93]:
# For the first NaN, get the similar movies as that movie

In [118]:
similar_movies = rec_tfidf.get_recommendations(148, n=5)
similar_movies

array([ 822, 3350, 2588,  375, 3347])

In [119]:
# Then go through the top 10 similar movies and average them to get average for that type of movie

In [120]:
total = 0
count = 0
for each_movie in similar_movies:
    if training[training['movie']==each_movie]['rating'].shape[0] != 0:
        total += training[training['movie']==each_movie]['rating'].sum()
        count += training[training['movie']==each_movie]['rating'].shape[0]
print(total/count)

3.934156378600823


Using this algorithm, we would have predicted a rating of 3.93, 1.05 off from the actual. Not great, not terrible.

In [123]:
# What about user recommendations?

In [None]:
# For the first NaN, get the user's movies that it has seen.

In [129]:
# What about getting a user profile?
#profile = rec_tfidf.get_user_profile(['The Godfather','The Godfather: Part II'])

rec_tfidf.get_user_recommendation(training[training['user']==1605]['movie'])

array([ 166, 1273, 1276, 2685, 2684])

In [132]:
training[training['movie']==1273]

Unnamed: 0,user,movie,rating,timestamp
1033,6036,1273,4,2000-04-26 01:59:15
6010,5990,1273,5,2000-04-27 20:50:13
8869,5966,1273,4,2000-04-29 02:28:05
12982,5937,1273,4,2000-05-01 20:53:39
24600,5848,1273,3,2000-05-08 10:37:36
...,...,...,...,...
627766,1699,1273,3,2000-11-20 09:12:22
628989,1671,1273,4,2000-11-20 09:24:38
630269,1680,1273,2,2000-11-20 09:48:27
632516,1658,1273,5,2000-11-20 10:37:12


In [135]:
users[users['user']==402]

Unnamed: 0,user,sex,age,occupation,zip_code
401,402,M,25,11,55427
