# Recommender Class Abstraction

In [608]:
import pandas as pd
import numpy as np
import pgeocode
from datetime import datetime

In [311]:
data = pd.read_csv('./Data/data_content_based_v1.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [609]:
ratings = pd.read_csv("./Data/ratings_v2.csv")
movies = pd.read_csv("./Data/movies_v2.csv")
users = pd.read_csv('./Data/u.user_v2.csv')

# Other user data we need
watched_movies = ratings.groupby('userId').movieId.agg([list, 'size'])
preferred_genres = pd.read_csv('./Data/prefered_genre.csv')
black_sheeps = pd.read_csv('./Data/black_sheeps.csv').drop('Unnamed: 0', axis=1)
black_sheeps.black_sheep = black_sheeps.black_sheep.astype(bool)

In [610]:
ratings['time'] = ratings.timestamp.apply(lambda x: datetime.fromtimestamp(x).strftime("%d-%m-%Y %I:%M:%S"))
ratings['time'] = pd.to_datetime(ratings.time, format = "%d-%m-%Y %I:%M:%S")
ratings = ratings.drop('timestamp', axis=1)

In [682]:
def popularity(n):
    pop_data = ratings.groupby('movieId').rating.agg(['sum', 'mean']).reset_index()
    top_n_movies = pop_data['sum'].nlargest(10).index
    top_n_movies = pop_data.loc[top_n_movies].movieId
    result = movies.loc[movies.movieId.isin(top_n_movies)].merge(pop_data, on='movieId', how='inner')[['movieId', 'mean']]
    return pd.Series(result['mean'], index=result['movieId'])

def filter_time(by='all_time', num=1):
    today = ratings.time.max()
    if by == 'day': filter_date = today - pd.DateOffset(days=num)
    elif by == 'month': filter_date = today - pd.DateOffset(months=num)
    elif by == 'year': filter_date = today - pd.DateOffset(years=num)
    else: filter_date = ratings.time.min()
    return ratings[ratings.time > filter_date]

def popularity_dates(n, by='all_time', num=1):
    ratings = filter_time(ratings, by, num)
    return popularity(n)

def popularity_genre(n, by='all_time', num=1, genre='Comedy'): 
    global ratings
    if genre: ratings = ratings[ratings.movieId.isin(movies[movies[genre] == 1].index)]
    ratings = filter_time(by, num)
    return popularity(n)

In [683]:
class User():
    
    def __init__(self, userId):
        self.id = userId
        self.is_new = self._is_new_user()
        self.is_black_sheep = self._is_black_sheep()
        self.watched_movies = self._watched_movies()
    
    
    def _is_new_user(self):
        if self.id not in ratings.userId.unique():
            self.add_new_user()
            return True
        if watched_movies.loc[self.id, 'size'] < 5: # when does it stop being new? --> after our good model beats the new users model. 
            return True
        return False
        
        
    def _watched_movies(self): 
        return [] if self.is_new else watched_movies.loc[self.id, 'list']
    
    
    def _is_black_sheep(self):
        return False if self.is_new else black_sheeps.loc[black_sheeps.userId == self.id, 'black_sheep'].values[0]
    
    
    def add_new_user(self):
        global preferred_genres
        global users
        # Ask for Data
        print("Welcome! We are happy to have you as a new customer (and that finally you left that Netflix crap!)")
        print("Tell us a bit more about yourself so that we can make better recommendations (you can skip this if you like!) ")
        user_data = {'userId': self.id, 
                     'age': input("Age: "), 
                     'gender': input("Gender: [M, F]: "), 
                     'occupation': input("Occupation (head's up! it can only be one of: ['technician', 'other', 'writer', 'executive', 'administrator','student', 'lawyer', 'educator', 'scientist', 'entertainment', 'programmer', 'librarian', 'homemaker', 'artist', 'engineer','marketing', 'healthcare', 'retired', 'salesman', 'doctor']): "),
                     'zip_code': input("Zip Code (psssst! In practice we'd actually steal this info from you!): ")}
        user_pf = {'userId': self.id, 
                   'preferred_genre': input('Any preferred genres? (You can skip this if you like!): ')}
        print("Thank you! Enjoy watching our movies")
        
        # Get state, place and coordinates from postal code: 
        nomi = pgeocode.Nominatim('us')
        geolocation = nomi.query_postal_code(user_data['zip_code'])[['place_name', 'state_name', 'latitude', 'longitude']]
        user_data.update(dict(geolocation))
        
        # Add to User's Database: 
        users = users.append(user_data, ignore_index=True)
        users = users.apply(lambda y: y.apply(lambda x: np.nan if x == '' else x))
        preferred_genres = preferred_genres.append(user_pf, ignore_index=True)

In [684]:
class RecommenderMovie(): 
    
    def __init__(self):
        self.old_user_CF = joblib.load('./Models/old_user_CF.pkl')
        self.new_old_user_CB = joblib.load('./Models/new_old_user_CB.pkl')
        self.hybrid_weight_CF = 0.65
        self.new_user_popularity = popularity_genre
        self.black_sheep_CF = None
    
    # Predictions: 
    
    def show_recommendations(self, user):
        preds = self._predict(user)
        recommend = preds.reset_index().merge(movies, how='left', on='movieId').sort_values('predicted_rating', ascending=False)
        recommend.reset_index().apply(lambda movie: self._format_movie(movie), axis=1)
        
        
    def _format_movie(self, movie):
        genres = movie[5:][movie[5:] == 1].index
        print(f"\n{movie['index'] + 1}: {movie['title']} ({int(movie['year'])})\n\t {str([i for i in genres])[1:-1]}")
    
    
    def _predict(self, user):
        if user.is_new: 
            preds = self._predict_new_user(user)
        elif user.is_black_sheep:
            preds = self._predict_black_sheep(user)
        else: 
            preds = self._predict_old_user(user)
        preds.index.name = 'movieId'
        preds.name = 'predicted_rating'
        return preds.sort_values(ascending=False)[:10]
    
    
    def _predict_new_user(self, user):
        movie_ids, data_CB = self._preprocess_data_user(user, include_data_CF=False)
        if any(movie_ids): # We have info on new user
            preds_CB = self.new_old_user_CB.predict(data_CB)
            return pd.Series(preds_CB, index=movie_ids)
        time = input('Find the most popular movies! Filter by time: ')
        genre = input('\nFilter by genre: ')
        return self.new_user_popularity(10, 'years', time, genre)
        
    
    
    def _predict_black_sheep(self, user):
        pass 

    
    def _predict_old_user(self, user):
        # Make individual predictions after making the data to predict all items
        movie_ids, data_CF, data_CB = self._preprocess_data_user(user, include_data_CF=True)
        preds_CF = [self.old_user_CF.predict(userId, movieId).est for userId, movieId in zip(*data_CF)]
        preds_CB = self.new_old_user_CB.predict(data_CB)
        
        # Make hybrid model
        preds = [pred_CF*self.hybrid_weight_CF + pred_CB*(1 - self.hybrid_weight_CF) for pred_CF, pred_CB in zip(preds_CF, preds_CB)]
        return pd.Series(preds, index=movie_ids)

    
    def _preprocess_data_user(self, user, include_data_CF):
        rate_movies = sorted([i for i in ratings.movieId.unique() if i not in user.watched_movies])
        
        # CB Data
        data_CF = ([user.id]*len(rate_movies), rate_movies)
        
        # CB Data
        movies_processed = movies.drop('title', axis=1)[movies.movieId.isin(rate_movies)].sort_values('movieId')
        users_info = users.drop(['zip_code', 'place_name', 'userId'], axis=1)[users.userId == user.id] 
        if users_info.isna().sum().sum() != 0: return ([], []) # User does not give personal data
        data_CB = movies_processed.reset_index(drop=True).drop('movieId', axis=1).join(users_info).ffill()
        
        return (rate_movies, data_CF, data_CB) if include_data_CF else (rate_movies, data_CB)

In [685]:
user = User(-10)

Welcome! We are happy to have you as a new customer (and that finally you left that Netflix crap!)
Tell us a bit more about yourself so that we can make better recommendations (you can skip this if you like!) 


Age:  
Gender: [M, F]:  
Occupation (head's up! it can only be one of: ['technician', 'other', 'writer', 'executive', 'administrator','student', 'lawyer', 'educator', 'scientist', 'entertainment', 'programmer', 'librarian', 'homemaker', 'artist', 'engineer','marketing', 'healthcare', 'retired', 'salesman', 'doctor']):  
Zip Code (psssst! In practice we'd actually steal this info from you!):  
Any preferred genres? (You can skip this if you like!):  


Thank you! Enjoy watching our movies


In [686]:
user.is_new

True

In [687]:
user.is_black_sheep

False

In [688]:
recommender = RecommenderMovie()

In [689]:
recommender.show_recommendations(user)

Find the most popular movies! Filter by time:  4
Filter by genre:  Romance


1: GoldenEye (1995)
	 'Action', 'Adventure', 'Thriller'

2: Apocalypse Now (1979)
	 'Action', 'Drama', 'War'

3: Ferris Bueller's Day Off (1986)
	 'Comedy'

4: X-Men (2000)
	 'Action', 'Adventure', 'Sci-Fi'

5: Donnie Darko (2001)
	 'Drama', 'Mystery', 'Sci-Fi', 'Thriller'

6: Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
	 'Adventure', 'Children', 'Fantasy'

7: Minority Report (2002)
	 'Action', 'Crime', 'Mystery', 'Sci-Fi', 'Thriller'

8: Catch Me If You Can (2002)
	 'Crime', 'Drama'

9: Lord of the Rings: The Return of the King, The (2003)
	 'Action', 'Adventure', 'Drama', 'Fantasy'

10: Kill Bill: Vol. 2 (2004)
	 'Action', 'Drama', 'Thriller'



Limitations: 
- Work with denormalized data to make it easy for us, get a centralized data source. This should not be done as it is harder to mantaing data integrity, and slower as we need to perform a lot of aggregations to make some computations. We did it to make it easier for us
- Run black sheeps alogirhtm again after we have a significantly large number of new users
- Black sheeps: include all the data. because we will run content based as well. 

Another thing we need to do is: 
- Evaluate black sheep and new user recommendations
- ... 