In [1]:
from abc import ABC, abstractmethod
from datetime import datetime
from datetime import timedelta
from collections import defaultdict
from timeit import default_timer
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import math
import random
from random import sample

In [2]:
class TimeConstraint:

    def __init__(self, end_dt, start_dt=None):
        """
        When end_dt is only given, system will have a max time constraint only.

        When end_dt and start_dt are given, system will have beginning end ending boundary.

        :param end_dt: The maximum limit of the time constraint.
        :param start_dt: The minimum limit of the time constraint.
            Always set start_dt to None if you change the object from time_bin to max_limit.
        """
        self.end_dt = end_dt
        self.start_dt = start_dt

    def is_valid_time_bin(self) -> bool:
        """
        Check whether this TimeConstraint object represents a valid time bin.
        """
        if self.is_time_bin() and (self._end_dt > self._start_dt):
            return True
        return False

    def is_valid_max_limit(self) -> bool:
        """
        Check whether this TimeConstraint represents a valid max time limit.
        """
        if (self._end_dt is not None) and (self._start_dt is None):
            return True

    def is_time_bin(self) -> bool:
        if (self._start_dt is not None) and (self._end_dt is not None):
            return True
        return False

    # Comparing TimeConstraints

    def __eq__(self, other):
        if other is None:
            return False
        return self._start_dt == other.start_dt and self._end_dt == other.end_dt

    def __ne__(self, other):
        if other is None:
            return False
        return self._start_dt != other.start_dt or self._end_dt != other.end_dt

    # Properties

    @property
    def end_dt(self):
        return self._end_dt

    @end_dt.setter
    def end_dt(self, value):
        self._end_dt = value

    @property
    def start_dt(self):
        return self._start_dt

    @start_dt.setter
    def start_dt(self, value):
        self._start_dt = value

    # Printing TimeConstraints

    def __repr__(self):
        return f"(start = {self._start_dt}, end= {self._end_dt})"

    def __str__(self):
        return f"(start = {self._start_dt}, end= {self._end_dt})"


In [3]:
class Cache:

    def __init__(self,
                 is_ratings_cached=False,
                 ratings=None,
                 is_movies_cached=False,
                 movies=None,
                 is_movie_ratings_cached=False,
                 movie_ratings=None,
                 is_user_movie_matrix_cached=False,
                 user_movie_matrix=None,
                 is_user_correlations_cached=False,
                 user_correlations=None,
                 min_common_elements=5,
                 use_avg_ratings_cache=True):
        """ Cached data is only valid when the boolean specifier is True """

        # 30% performance
        self.is_ratings_cached = is_ratings_cached
        self.ratings = ratings

        # 7 fold performance gain on 'movie' related queries
        self.is_movies_cached = is_movies_cached
        self.movies = movies

        self.is_movie_ratings_cached = is_movie_ratings_cached
        self.movie_ratings = movie_ratings

        self.is_user_movie_matrix_cached = is_user_movie_matrix_cached
        self.user_movie_matrix = user_movie_matrix

        self.is_user_correlations_cached = is_user_correlations_cached
        self.user_correlations = user_correlations

        self.min_common_elements = min_common_elements

        # if use avg ratings cache, on average 10 fold performance gain
        self.use_avg_ratings_cache = use_avg_ratings_cache
        if self.use_avg_ratings_cache:
            self.avg_user_ratings = self.create_user_avg_rating_cache()
        else:
            self.avg_user_ratings = None

    def create_user_avg_rating_cache(self):
        if self.is_ratings_cached:
            data = self.ratings
        else:
            data = self.movie_ratings
        return data.groupby('user_id')[['rating']].mean()

    def get_user_corrs(self, min_common_elements, time_constraint=None):
        """
        If cached returns the cache, else none
        :param min_common_elements: min common element in between users in order them to become neighbours
        :param time_constraint: used in temporal caches only, None in this context
        :return: user correlation matrix if cache found, else None
        """
        if self.is_user_correlations_cached:
            if self.min_common_elements == min_common_elements:
                return self.user_correlations
        return None

    # Properties
    @property
    def ratings(self):
        return self._ratings

    @ratings.setter
    def ratings(self, value):
        self._ratings = value

    @property
    def movies(self):
        return self._movies

    @movies.setter
    def movies(self, value):
        self._movies = value

    @property
    def movie_ratings(self):
        return self._movie_ratings

    @movie_ratings.setter
    def movie_ratings(self, value):
        self._movie_ratings = value

    @property
    def user_movie_matrix(self):
        return self._user_movie_matrix

    @user_movie_matrix.setter
    def user_movie_matrix(self, value):
        self._user_movie_matrix = value

    @property
    def user_correlations(self):
        return self._user_correlations

    @user_correlations.setter
    def user_correlations(self, value):
        self._user_correlations = value

    @property
    def min_common_elements(self):
        return self._min_common_elements

    @min_common_elements.setter
    def min_common_elements(self, value):
        self._min_common_elements = value


class TemporalCache(Cache):

    def __init__(self,
                 time_constraint: TimeConstraint,
                 is_ratings_cached=False,
                 ratings=None,
                 is_movies_cached=False,
                 movies=None,
                 is_movie_ratings_cached=False,
                 movie_ratings=None,
                 is_user_movie_matrix_cached=False,
                 user_movie_matrix=None,
                 is_user_correlations_cached=False,
                 user_correlations=None,
                 min_common_elements=5,
                 use_avg_ratings_cache=True,
                 use_bulk_corr_cache=True):

        super().__init__(is_ratings_cached=is_ratings_cached,
                         ratings=ratings,
                         is_movies_cached=is_movies_cached,
                         movies=movies,
                         is_movie_ratings_cached=is_movie_ratings_cached,
                         movie_ratings=movie_ratings,
                         is_user_movie_matrix_cached=is_user_movie_matrix_cached,
                         user_movie_matrix=user_movie_matrix,
                         is_user_correlations_cached=is_user_correlations_cached,
                         user_correlations=user_correlations,
                         min_common_elements=min_common_elements,
                         use_avg_ratings_cache=use_avg_ratings_cache)

        self.time_constraint = time_constraint
        self.use_bulk_corr_cache = use_bulk_corr_cache
        self.user_corrs_in_bulk = None

    def is_temporal_cache_valid(self):
        # No TimeConstraint, valid
        if self._time_constraint is None:
            return True
        # Bin TimeConstraint or Max Limit TimeConstraint, valid
        if self._time_constraint.is_valid_time_bin() or self._time_constraint.is_valid_max_limit():
            return True
        # Else, Not Valid
        return False

    def get_user_corrs_from_bulk(self, min_common_elements, time_constraint, bin_size):
        if ((self.user_corrs_in_bulk is None) or (self.user_corrs_in_bulk is None)
                or (time_constraint is None) or self.min_common_elements != min_common_elements):
            return None

        if time_constraint.is_valid_max_limit():
            return self.user_corrs_in_bulk.get(time_constraint.end_dt.year)

        if bin_size == -1:
            return None

        bins = self.user_corrs_in_bulk.get(bin_size)
        if bins is not None:
            return bins.get(time_constraint.start_dt.year)

    def get_user_corrs(self, min_common_elements, time_constraint=None):
        """
        If cached returns the cache, else none

        :param min_common_elements: min common element in between users in order them to become neighbours
        :param time_constraint: time constraint on user correlations
        :return: user correlation matrix if cache found, else None
        """
        if self.is_user_correlations_cached:
            if self.time_constraint == time_constraint and self.min_common_elements == min_common_elements:
                return self.user_correlations
        return None

    def set_user_corrs(self, user_corrs, min_common_elements, time_constraint):
        # Only set when caching is open for user_correlations
        if self.is_user_correlations_cached:
            self._time_constraint = time_constraint
            self.min_common_elements = min_common_elements
            self.user_correlations = user_corrs

    @property
    def time_constraint(self):
        return self._time_constraint

    @time_constraint.setter
    def time_constraint(self, value):
        self._time_constraint = value


In [4]:
class Accuracy:

    @staticmethod
    def rmse(predictions) -> float:
        if type(predictions) is pd.DataFrame:
            number_of_predictions = 0
            sum_of_square_differences = 0.0
            for row in predictions.itertuples(index=False):
                # row[1] : actual rating, row[0] : prediction
                prediction = row[0]
                if prediction != 0:
                    sum_of_square_differences += (row[1] - prediction) ** 2
                    number_of_predictions += 1
            return sum_of_square_differences / number_of_predictions if number_of_predictions != 0 else 0
        elif type(predictions) is list:
            number_of_predictions = 0
            sum_of_square_differences = 0.0
            for prediction, actual in predictions:
                if prediction != 0:
                    sum_of_square_differences += (actual - prediction) ** 2
                    number_of_predictions += 1
            return sum_of_square_differences / number_of_predictions if number_of_predictions != 0 else 0
        return 0

In [5]:
class Dataset(ABC):
    @staticmethod
    @abstractmethod
    def load():
        """ Every subclass must provide static load method"""
        pass


class MovieLensDataset(Dataset):
    def __init__(self,
                 ratings_col_names=('user_id', 'item_id', 'rating', 'timestamp'),
                 ratings_path=r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv',
                 movies_col_names=('item_id', 'title', 'genres'),
                 movies_path=r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv',
                 is_ratings_cached=True,
                 is_movies_cached=True):
        Dataset.__init__(self)
        self.is_ratings_cached = is_ratings_cached
        self.is_movies_cached = is_movies_cached
        self.ratings = MovieLensDataset.load_ratings(ratings_path,
                                                     ratings_col_names) if self.is_ratings_cached else None
        self.movies = MovieLensDataset.load_movies(movies_path,
                                                   movies_col_names) if self.is_movies_cached else None

    @staticmethod
    def load_movies(movies_path,
                    movies_col_names=('item_id', 'title', 'genres')):
        if not os.path.isfile(movies_path) or not movies_col_names:
            return None

        # read movies
        movies = pd.read_csv(movies_path, sep=',', header=1, names=movies_col_names)

        # Extract Movie Year
        movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
        movies.year = pd.to_datetime(movies.year, format='%Y')
        movies.year = movies.year.dt.year  # As there are some NaN years, resulting type will be float (decimals)

        # Remove year part from the title
        movies.title = movies.title.str[:-7]

        return movies

    @staticmethod
    def load_ratings(ratings_path,
                     ratings_col_names=('user_id', 'item_id', 'rating', 'timestamp')):
        if not os.path.isfile(ratings_path) or not ratings_col_names:
            return None

        # read ratings
        ratings = pd.read_csv(ratings_path, sep=',', header=1, names=ratings_col_names)

        # Convert timestamp into readable format
        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

        return ratings

    @staticmethod
    def create_movie_ratings(ratings, movies):
        return pd.merge(ratings, movies, on='item_id')

    @staticmethod
    def load(ratings_col_names=('user_id', 'item_id', 'rating', 'timestamp'),
             ratings_path=r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv',
             movies_col_names=('item_id', 'title', 'genres'),
             movies_path=r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv'
             ):
        # Load movies
        movies = MovieLensDataset.load_movies(movies_path=movies_path, movies_col_names=movies_col_names)
        # Load ratings
        ratings = MovieLensDataset.load_ratings(ratings_path=ratings_path, ratings_col_names=ratings_col_names)

        # Merge the ratings and movies
        movie_ratings = pd.merge(ratings, movies, on='item_id')

        return movie_ratings

In [6]:
class TemporalPearson:

    def __init__(self, cache: TemporalCache, time_constraint: TimeConstraint = None, min_common_elements: int = 5):
        self.time_constraint = time_constraint
        self.cache = cache
        self.min_common_elements = min_common_elements
        #from .trainset import TrainsetUser, TrainsetMovie
        self.trainset_user = TrainsetUser(cache=self.cache)
        self.trainset_movie = TrainsetMovie(cache=self.cache)

    def mean_centered_pearson(self, user_id, movie_id, k_neighbours: pd.DataFrame) -> float:
        """
        Calculate Mean Centered Prediction

        :param user_id: user of interest
        :param movie_id: the movie's rating is the one we we want to predict
        :param k_neighbours: k nearest neighbours in DataFrame where index user_id, column correlation in between.
        :return: Prediction rating
        """
        # If a movie with movie_id not exists, predict 0
        if self.trainset_movie.get_movie(movie_id=movie_id).empty:
            return 0

        if k_neighbours is None or k_neighbours.empty:
            return 0

        user_avg_rating = self.trainset_user.get_user_avg(user_id=user_id)

        weighted_sum = 0.0
        sum_of_weights = 0.0
        for neighbour_id, data in k_neighbours.iterrows():
            # Get each neighbour's correlation 'user_id' and her rating to 'movie_id'
            neighbour_corr = data['correlation']
            neighbour_rating = self.trainset_movie.get_movie_rating(movie_id=movie_id, user_id=neighbour_id)
            # If the neighbour doesnt give rating to the movie_id, pass this around of the loop
            if neighbour_rating == 0:
                continue
            neighbour_avg_rating = self.trainset_user.get_user_avg(user_id=neighbour_id)
            neighbour_mean_centered_rating = neighbour_rating - neighbour_avg_rating
            # Calculate Weighted sum and sum of weights
            weighted_sum += neighbour_mean_centered_rating * neighbour_corr
            sum_of_weights += neighbour_corr

        # Predict
        if sum_of_weights != 0:
            prediction_rating = user_avg_rating + (weighted_sum / sum_of_weights)
        else:
            prediction_rating = 0  # In this case, none of the neighbours have given rating to 'the movie'

        return prediction_rating

    def get_corr_matrix(self, bin_size=-1):
        user_corrs = None
        # if valid cache found, try to get user corrs from there
        if self.cache.is_temporal_cache_valid():
            # First check user-correlations
            user_corrs = self.cache.get_user_corrs(self.min_common_elements, self.time_constraint)
            if user_corrs is not None:
                return user_corrs
            # Then check bulk-user-correlations
            user_corrs = self.cache.get_user_corrs_from_bulk(time_constraint=self.time_constraint,
                                                             min_common_elements=self.min_common_elements,
                                                             bin_size=bin_size)
            if user_corrs is not None:
                return user_corrs

        # here, if cache not found or no cache match

        # Create user correlations
        user_corrs = TemporalPearson.create_user_corrs(movie_ratings=self.cache.movie_ratings,
                                                       time_constraint=self.time_constraint,
                                                       min_common_elements=self.min_common_elements)
        # Cache the user_corrs
        self.cache.set_user_corrs(user_corrs=user_corrs,
                                  min_common_elements=self.min_common_elements,
                                  time_constraint=self.time_constraint)

        return user_corrs

    @staticmethod
    def create_user_corrs(movie_ratings, time_constraint: TimeConstraint, min_common_elements):
        # by default movie_ratings is for no time constraint
        # with these controls change the time constraint of the movie_ratings
        if time_constraint is not None:
            if time_constraint.is_valid_max_limit():
                movie_ratings = movie_ratings[movie_ratings.timestamp < time_constraint.end_dt]
            elif time_constraint.is_valid_time_bin():
                movie_ratings = movie_ratings[(movie_ratings.timestamp >= time_constraint.start_dt)
                                              & (movie_ratings.timestamp < time_constraint.end_dt)]

        user_movie_matrix = movie_ratings.pivot_table(index='title', columns='user_id', values='rating')
        return user_movie_matrix.corr(method="pearson", min_periods=min_common_elements)

    def cache_user_corrs_in_bulk_for_max_limit(self, time_constraint: TimeConstraint, min_year, max_year):
        """
        Cache user correlations by changing year of the time_constraint
        for each year in between min_year and max_year(not included)

        :param time_constraint: time_constraint apply
        :param min_year: start of the range
        :param max_year: end of the range
        """

        if self.cache.use_bulk_corr_cache:
            if time_constraint is not None and time_constraint.is_valid_max_limit():
                self.cache.user_corrs_in_bulk = dict()
                for year in range(min_year, max_year):
                    time_constraint.end_dt = time_constraint.end_dt.replace(year=year)
                    corrs = TemporalPearson.create_user_corrs(self.cache.movie_ratings, time_constraint,
                                                              self.min_common_elements)
                    self.cache.user_corrs_in_bulk[year] = corrs
            else:
                raise Exception("Trying to cache user correlations in bulk for max_limit "
                                "but start time is not max_limit!")
        else:
            raise Exception("Trying to create bulk corr cache when use_bulk_corr_cache is False")

    def cache_user_corrs_in_bulk_for_time_bins(self, time_constraint: TimeConstraint, min_year, max_year,
                                               min_time_bin_size=2, max_time_bin_size=10):
        if self.cache.use_bulk_corr_cache:
            if time_constraint is not None and time_constraint.is_valid_time_bin():
                del self.cache.user_corrs_in_bulk    # invalidate old cache
                self.cache.user_corrs_in_bulk = dict()
                for time_bin_size in range(min_time_bin_size, max_time_bin_size):
                    self.cache.user_corrs_in_bulk[time_bin_size] = dict()
                    for shift in range(0, time_bin_size):
                        curr_year = min_year + shift
                        while (curr_year + time_bin_size) < max_year:
                            time_constraint = TimeConstraint(start_dt=datetime(curr_year, 1, 1),
                                                             end_dt=datetime(curr_year + time_bin_size, 1, 1))
                            corrs = TemporalPearson.create_user_corrs(self.cache.movie_ratings,
                                                                      time_constraint,
                                                                      self.min_common_elements)
                            self.cache.user_corrs_in_bulk[time_bin_size][curr_year] = corrs
                            curr_year += time_bin_size
        else:
            raise Exception("Trying to create bulk corr cache when use_bulk_corr_cache is False")

    @property
    def time_constraint(self):
        return self._time_constraint

    @time_constraint.setter
    def time_constraint(self, value):
        self._time_constraint = value


In [7]:
class TrainsetUser:

    def __init__(self, cache: Cache):
        """
        :param cache: Input cache must have movie_ratings not None !
        """
        self.cache = cache

        if not self.cache.is_movie_ratings_cached:
            raise Exception("'movie_ratings' has not been cached !")

    def get_users(self):
        """
        Get list of unique 'user_id's

        Since MovieLens Have 'user_id's from 0 to 610 without any missing user, for now sending that directly
        Uncomment the other lines later

        :return: the ids of the users found in movie_ratings
        """
        #
        # if self.cache.is_ratings_cached:
        #     data = self.cache.ratings
        # else:
        #     data = self.cache.movie_ratings
        #
        # return pd.unique(data['user_id'])
        return range(0, 611)

    def get_active_users(self, n=10) -> pd.DataFrame:
        """
        Get Users in sorted order where the first one is the one who has given most ratings.

        :param n: Number of users to retrieve.
        :return: user DataFrame with index of 'user_id' and columns of ['mean_rating', 'No_of_ratings'] .
        """

        if self.cache.is_ratings_cached:                         # 30% faster than other choice
            data = self.cache.ratings
        else:
            data = self.cache.movie_ratings

        active_users = pd.DataFrame(data.groupby('user_id')['rating'].mean())
        active_users['No_of_ratings'] = pd.DataFrame(data.groupby('user_id')['rating'].count())
        active_users.sort_values(by=['No_of_ratings'], ascending=False, inplace=True)
        active_users.columns = ['mean_rating', 'No_of_ratings']
        return active_users.head(n)

    def get_random_users(self, n=1):
        """
        Get list of random n number of 'user_id's

        :param n: Number of random users
        :return: List of random 'user_id's
        """

        return random.choices(population=self.get_users(), k=n)

    def get_user_ratings(self, user_id: int) -> pd.DataFrame:
        """
        Get all the ratings given by of the chosen users

        :param user_id: id of the chosen user
        :return: Ratings given by the 'user_id'
        """
        if self.cache.is_ratings_cached:                         # 2.2x faster than other choice
            data = self.cache.ratings
        else:
            data = self.cache.movie_ratings

        return data.loc[data['user_id'] == user_id]

    def get_user_avg(self, user_id: int):

        if self.cache.use_avg_ratings_cache:
            avg_user_rating = self.cache.avg_user_ratings.loc[user_id]
            return avg_user_rating[0] if not avg_user_rating.empty else 0

        user_ratings = self.get_user_ratings(user_id=user_id)
        return user_ratings.rating.mean() if not user_ratings.empty else 0

    def get_timestamp(self, user_id: int, movie_id: int):
        """
        Get the timestamp of the given rating

        :param user_id: the users whose rating timestamp we are searching
        :param movie_id: id of the movie that the user gave the rating
        :return: if found the datetime object otherwise None
        """

        if self.cache.is_ratings_cached:
            data = self.cache.ratings
        else:
            data = self.cache.movie_ratings

        timestamp = data.loc[(data['user_id'] == user_id) & (data['item_id'] == movie_id)]
        return timestamp.values[0, 3] if not timestamp.empty else None

    def get_first_timestamp(self):
        if self.cache.is_ratings_cached:
            data = self.cache.ratings
        else:
            data = self.cache.movie_ratings
        return data['timestamp'].min()

    def get_user_avg_timestamp(self, user_id: int):
        user_ratings = self.get_user_ratings(user_id=user_id)
        return user_ratings.timestamp.mean() if not user_ratings.empty else 0

    # TODO: Later, create TemporalDatasetUser, and put this method into that one
    def get_user_ratings_at(self, user_id: int, at: datetime) -> pd.DataFrame:
        """
        Get user ratings up until the given datetime
        :param user_id: id of the chosen user
        :param at: only those ratings that are before this date will be taken into account
        :return: Ratings given by the 'user_id' before given datetime
        """

        if self.cache.is_ratings_cached:
            data = self.cache.ratings
        else:
            data = self.cache.movie_ratings

        return data.loc[(data['user_id'] == user_id) & (data.timestamp < at)]

    # TODO: Later, create TemporalDatasetUser, and put this method into that one
    def get_user_avg_at(self, user_id: int, at: datetime):
        user_ratings = self.get_user_ratings_at(user_id, at)
        return user_ratings.rating.mean() if not user_ratings.empty else 0


class TrainsetMovie:

    def __init__(self, cache: Cache):
        """
        :param cache: Input cache must have movie_ratings not None !
        """
        self.cache = cache

        if not self.cache.is_movie_ratings_cached:
            raise Exception("'movie_ratings' has not been cached !")

    def get_movie(self, movie_id) -> pd.DataFrame:
        """
        Get Movie Record

        :return: DataFrame which contains the given 'movie_id's details. If not found empty DataFrame .
        """
        if self.cache.is_movies_cached:
            return self.cache.movies.loc[self.cache.movies['item_id'] == movie_id]
        return self.cache.movie_ratings.loc[self.cache.movie_ratings['item_id'] == movie_id]

    def get_movies(self):
        """
        Get list of unique 'item_id's or in other words the movies.

        :return: List of movie ids
        """

        if self.cache.is_movies_cached:
            return self.cache.movies['item_id'].values.tolist()

        return pd.unique(self.cache.movie_ratings['item_id'])

    def get_random_movies(self, n=10):
        """
        Get list of random n number of 'item_id's or in other words the movies

        :param n: Number of random movies
        :return: List of random 'movie_id's
        """
        return random.choices(population=self.get_movies(), k=n)

    def get_movies_watched(self, user_id: int, time_constraint: TimeConstraint = None) -> pd.DataFrame:
        """
        Get all the movies watched by the chosen user.

        :param user_id: the user that we want to get the movies he-she has watched.
        :param time_constraint: type of the time constraint.
        :return: DataFrame of all movies watched with 'item_id', 'rating' columns
        """

        movie_ratings = self.cache.movie_ratings

        if time_constraint is None:
            return movie_ratings.loc[(movie_ratings['user_id'] == user_id)][['item_id', 'rating']]

        if time_constraint.is_valid_max_limit():
            return movie_ratings.loc[(movie_ratings['user_id'] == user_id)
                                     & (movie_ratings.timestamp < time_constraint.end_dt)][['item_id', 'rating']]
        elif time_constraint.is_valid_time_bin():
            return movie_ratings.loc[(movie_ratings['user_id'] == user_id)
                                     & (movie_ratings.timestamp >= time_constraint.start_dt)
                                     & (movie_ratings.timestamp < time_constraint.end_dt)][['item_id', 'rating']]
        raise Exception("Undefined time_constraint is given!")

    def get_movie_rating(self, movie_id: int, user_id: int) -> int:
        """
        Get the movie rating taken by the chosen user

        :param movie_id: the movie chosen movie's id
        :param user_id: id of the chosen user
        :return: Rating given by user. If not found, returns 0
        """

        if self.cache.is_ratings_cached:
            data = self.cache.ratings
        else:
            data = self.cache.movie_ratings

        movie_rating = data.loc[(data['user_id'] == user_id) & (data['item_id'] == movie_id)]
        return movie_rating.values[0, 2] if not movie_rating.empty else 0

    def get_random_movie_watched(self, user_id: int) -> int:
        """
        Get random movie id watched.

        :param user_id: User of interest
        :return:  movie_id or item_id of the random movie watched by the user.
                  In case non-valid user_id supplied then returns 0
        """
        movies_watched = self.get_movies_watched(user_id=user_id)
        return random.choice(movies_watched['item_id'].values.tolist()) if not movies_watched.empty else 0

    def get_random_movies_watched(self, user_id: int, n=2) -> pd.DataFrame:
        """
        Get random n movies watched by the user. Only use when n > 2

        Use get_random_movie_watched if n=1 since that one 2 fold faster.

        :param user_id: the user of interest
        :param n: number of random movies to get
        :return: DataFrame of movies, if none found then empty DataFrame
        """
        movies_watched = self.get_movies_watched(user_id=user_id)
        return random.choices(population=movies_watched['item_id'].values.tolist(),
                              k=n) if not movies_watched.empty else movies_watched

    def get_random_movie_per_user(self, user_id_list):
        """
        Get random movie for each user given in the 'user_id_list'

        :param user_id_list: List of valid user_ids
        :return: List of (user_id, movie_id) tuples
                where each movie_id is randomly chosen from watched movies of the user_id .
                In case any one of the user_id's supplies invalid, then the movie_id will be 0 for that user.
        """
        user_movie_list = list()
        for user_id in user_id_list:
            user_movie_list.append((user_id, self.get_random_movie_watched(user_id=user_id)))
        return user_movie_list


class Trainset:
    def __init__(self, cache: TemporalCache, min_common_elements: int = 5):
        self.cache = cache
        self.min_common_elements = min_common_elements
        self.similarity = TemporalPearson(time_constraint=None, cache=self.cache)

        if not self.cache.is_movie_ratings_cached:
            raise Exception("'movie_ratings' has not been cached !")

        self.trainset_movie = TrainsetMovie(cache=cache)
        self.trainset_user = TrainsetUser(cache=cache)

        # if caching is allowed, create user correlations cache
        self.similarity.get_corr_matrix()

    def predict_movies_watched(self, user_id, n=10, k=10, time_constraint=None) -> pd.DataFrame:
        """

        :param user_id: user of interest
        :param n: Number of movies to predict
        :param k: k neighbours to take into account
        :param time_constraint: When calculating k neighbours,
                                only those that comply to time_constraints will be taken into account.
        :return: DataFrame of Predictions where columns = ['prediction', 'rating'] index = 'movie_id'
        """
        # Get all movies watched by a user
        movies_watched = self.trainset_movie.get_movies_watched(user_id=user_id)

        if movies_watched.empty:
            return None

        predictions = list()
        number_of_predictions = 0
        for row in movies_watched.itertuples(index=False):
            prediction = self.predict_movie(user_id=user_id, movie_id=row[0],
                                            time_constraint=time_constraint, k=k)
            if number_of_predictions == n:
                break
            predictions.append([prediction, row[1], row[0]])
            number_of_predictions += 1

        predictions_df = pd.DataFrame(predictions, columns=['prediction', 'rating', 'movie_id'])
        predictions_df.movie_id = predictions_df.movie_id.astype(int)
        return predictions_df.set_index('movie_id')

    def predict_movie(self, user_id, movie_id, k=10, time_constraint=None, bin_size=-1):
        prediction = self.similarity.mean_centered_pearson(user_id=user_id,
                                                           movie_id=movie_id,
                                                           k_neighbours=
                                                           self.get_k_neighbours(user_id, k=k,
                                                                                 time_constraint=time_constraint,
                                                                                 bin_size=bin_size)
                                                           )        
        return prediction if prediction <= 5 else 5

    def get_k_neighbours(self, user_id, k=20, time_constraint: TimeConstraint = None, bin_size=-1):
        """
        :param user_id: the user of interest
        :param k: number of neighbours to retrieve
        :param time_constraint: time constraint when choosing neighbours
        :param bin_size: Used when using time_bins, in order to select bin from cache
        :return: Returns the k neighbours and correlations in between them. If no neighbours found, returns None
                 DataFrame which has 'Correlation' column and 'user_id' index.
        """
        self.similarity.time_constraint = time_constraint
        user_corr_matrix = self.similarity.get_corr_matrix(bin_size=bin_size)

        # Exit if matrix is None, no user found in self.cache.movie_ratings, something is wrong
        if user_corr_matrix is None:
            return None

        # Get the chosen 'user_id's correlations
        user_correlations = user_corr_matrix.get(user_id)
        if user_correlations is None:
            return None

        # Drop any null, if found
        user_correlations.dropna(inplace=True)
        # Create A DataFrame from not-null correlations of the 'user_id'
        users_alike = pd.DataFrame(user_correlations)
        # Rename the only column to 'correlation'
        users_alike.columns = ['correlation']

        # Sort the user correlations in descending order
        #     so that first one is the most similar, last one least similar
        users_alike.sort_values(by='correlation', ascending=False, inplace=True)

        # Eliminate Correlation to itself by deleting first row,
        #     since biggest corr is with itself it is in first row
        return users_alike.iloc[1:k+1]


In [8]:

class Evaluator:

    def __init__(self, trainset: Trainset):
        self.trainset = trainset

    def evaluate_best_max_year_in_bulk(self, n,
                                       n_users, n_movies, k=10,
                                       min_year=-1,
                                       max_year=-1) -> dict:
        """
        Evaluate and collect data about best max year constraint which can be put instead of no constraint.

        This method calls 'evaluate_best_max_year_constraint' method 'n' times.
        Takes required precautions for bulk calling.

        :param n: Number of runs that we run the evaluate_best_max_year_constraint() method
        :param n_users: Number of users to check
        :param n_movies: Number of movies per user to check
        :param k: Number of neighbours of each user to take into account when making prediction
        :param min_year: First year to evaluate
        :param max_year: Last year to evaluate
        :return: (no_constrain_rmse_data, best_year_constraint_results)
        """
        if min_year == -1:
            min_year = self.trainset.trainset_user.get_first_timestamp().year

        if max_year == -1:
            max_year = datetime.now().year

        time_constraint = TimeConstraint(end_dt=datetime(year=min_year, month=1, day=1))
        # Create cache if bulk_corr_cache is allowed
        self.trainset.similarity.cache_user_corrs_in_bulk_for_max_limit(time_constraint,
                                                                        min_year=min_year,
                                                                        max_year=max_year)
        
        run_results = dict()
        for i in range(n):
            run_results[i] = self.evaluate_best_max_year_constraint(n_users=n_users, n_movies=n_movies, k=k,
                                                                    min_year=min_year, max_year=max_year,
                                                                    create_cache=False,)

        return run_results

    def evaluate_best_max_year_constraint(self, n_users, n_movies, k,
                                          max_diff=0.1,
                                          min_year=-1, max_year=-1,
                                          create_cache=True) -> defaultdict:
        """
        Evaluate the max_year constraint for evaluate_max_year_constraint method.

        :param max_diff: maximum difference between rmse when no constraint and with given year constraint.
        :param n_users: Number of users to evaluate
        :param n_movies: Number of movies per user to evaluate
        :param k: Number of neighbours of each user to take into account when making prediction
        :param min_year: First year to evaluate
        :param max_year: Last year to evaluate
        :param create_cache: create cache before running. For bulk callers.
        :return: Votes for years where each year got its vote
                 when rmse is less than 'max_diff' in between no constraint and year constraint
        """

        if min_year == -1:
            min_year = self.trainset.trainset_user.get_first_timestamp().year

        if max_year == -1:
            max_year = datetime.now().year

        if n_users > 600:
            user_list = self.trainset.trainset_user.get_users()  # No need to random selection, get all users
        else:
            user_list = self.trainset.trainset_user.get_random_users(n=n_users)  # Select random n users

        # Calculate RMSE With No Constraint
        no_constraint_data = dict()
        for user_id in user_list:
            rmse = Accuracy.rmse(self.trainset.predict_movies_watched(user_id, n_movies, k))
            no_constraint_data[user_id] = rmse

        # # Calculate RMSE With Time Constraint

        # Cache all years before processing
        time_constraint = TimeConstraint(end_dt=datetime(year=min_year, month=1, day=1))
        # Create cache if bulk_corr_cache is allowed
        if create_cache:
            self.trainset.similarity.cache_user_corrs_in_bulk_for_max_limit(time_constraint,
                                                                            min_year=min_year,
                                                                            max_year=max_year)
        # Votes to years is stored inside time_constraint_data
        time_constraint_data = defaultdict(int)
        for year in range(min_year, max_year):
            time_constraint.end_dt = time_constraint.end_dt.replace(year=year)

            for user_id in user_list:
                rmse = Accuracy.rmse(self.trainset.predict_movies_watched(user_id=user_id, n=n_movies, k=k,
                                                                          time_constraint=time_constraint))
                if abs(rmse - no_constraint_data[user_id]) < max_diff:
                    time_constraint_data[year] += 1

        return time_constraint_data

    def evaluate_max_year_constraint(self, n_users, n_movies, k, time_constraint):
        """
        Compare given time_constraint with normal where no constraint exists.

        Time constraint is of type max_year which means the system will be set to a certain year.

        :param n_users: Number of users to evaluate
        :param n_movies: Number of movies per user to evaluate
        :param k: Number of neighbours to take into account when making movie prediction
        :param time_constraint: Time constraint which will be applied.
        :return: DataFrame of results which contains rmse with constraint and no constraint, as well as runtime.
        """
        trainset = self.trainset
        data = list()

        for i in range(n_users):
            # Get Random User
            user_id = random.randint(1, 610)
            # Predict movies for user and record runtime
            st = default_timer()
            rmse = Accuracy.rmse(
                trainset.predict_movies_watched(user_id=user_id, n=n_movies, k=k, time_constraint=None))
            r1 = default_timer() - st
            # Predict movies with time_constraint for user and record runtime
            st = default_timer()
            time_constrained_rmse = Accuracy.rmse(
                trainset.predict_movies_watched(user_id=user_id, n=n_movies, k=k, time_constraint=time_constraint))
            r2 = default_timer() - st
            # Save iteration data
            data.append([user_id, rmse, r1, time_constrained_rmse, r2])

        data = pd.DataFrame(data)
        data.columns = ['user_id', 'rmse', 'runtime1', 'temporal_rmse', 'runtime2']
        data.set_index('user_id', inplace=True)
        return data

    def evaluate_time_bins_in_bulk(self, n, n_users, k=10,
                                   min_year=-1,
                                   max_year=-1,
                                   min_time_bin_size=2, max_time_bin_size=10):
        """
        Evaluate time bins and return the results.

        This method calls 'evaluate_time_bins' method 'n' times. Takes required precautions for bulk calling.

        :param n: Number of runs
        :param n_users: Number of users
        :param k: Number of neighbours will be used when making prediction
        :param min_year: First year to start when taking time bins
        :param max_year: When to stop when taking time bins, last is not included.
        :param min_time_bin_size: Minimum bin size in years
        :param max_time_bin_size: Maximum bin size in years
        :return: Evaluation results
        """
        if min_year == -1:
            min_year = self.trainset.trainset_user.get_first_timestamp().year

        if max_year == -1:
            max_year = datetime.now().year

        # Cache all years before processing
        time_constraint = TimeConstraint(start_dt=datetime(year=min_year, month=1, day=1),
                                         end_dt=datetime(year=max_year, month=1, day=1))
        self.trainset.similarity.cache_user_corrs_in_bulk_for_time_bins(time_constraint,
                                                                        min_year=min_year,
                                                                        max_year=max_year,
                                                                        min_time_bin_size=min_time_bin_size,
                                                                        max_time_bin_size=max_time_bin_size)

        run_results = dict()
        for i in range(n):
            run_results[i] = self.evaluate_time_bins(n_users=n_users, k=k, min_year=min_year, max_year=max_year,
                                                     min_time_bin_size=min_time_bin_size,
                                                     max_time_bin_size=max_time_bin_size,
                                                     create_cache=False)

        return run_results

    def evaluate_time_bins(self, n_users, k, min_year=-1, max_year=-1,
                           min_time_bin_size=2, max_time_bin_size=10,
                           create_cache=True) -> dict:
        """

        :param n_users: Number of users
        :param k: Number of neighbours will be used when making prediction
        :param min_year: First year to start when taking time bins
        :param max_year: When to stop when taking time bins, last is not included.
        :param min_time_bin_size: Minimum bin size in years
        :param max_time_bin_size: Maximum bin size in years
        :param create_cache: Create cache before calling time bins. For bulk callers.
        :return:
        """
        trainset = self.trainset

        if min_year == -1:
            min_year = self.trainset.trainset_user.get_first_timestamp().year

        if max_year == -1:
            max_year = datetime.now().year

        if n_users > 600:
            user_list = trainset.trainset_user.get_users()
        else:
            user_list = trainset.trainset_user.get_random_users(n=n_users)
        user_movie_list = trainset.trainset_movie.get_random_movie_per_user(user_list)
        data = dict()

        result = list()

        if create_cache:
            # Cache all years before processing
            time_constraint = TimeConstraint(start_dt=datetime(year=min_year, month=1, day=1),
                                             end_dt=datetime(year=max_year, month=1, day=1))
            self.trainset.similarity.cache_user_corrs_in_bulk_for_time_bins(time_constraint,
                                                                            min_year=min_year,
                                                                            max_year=max_year,
                                                                            min_time_bin_size=min_time_bin_size,
                                                                            max_time_bin_size=max_time_bin_size)

        # Take each bins where first bin 'min_time_bin_size' years, last one 'max_time_bin_size - 1' years
        for time_bin_size in range(min_time_bin_size, max_time_bin_size):
            # Shift each time_bin starting with 0 years up until (time_bin-1) years
            for shift in range(0, time_bin_size):
                curr_year = min_year + shift
                predictions = list()
                start_time = default_timer()
                # Scan and make predictions for all the time_bins
                while (curr_year + time_bin_size) < max_year:
                    for user_id, movie_id in user_movie_list:
                        p = trainset.predict_movie(user_id=user_id, movie_id=movie_id, k=k,
                                                   time_constraint=TimeConstraint(start_dt=datetime(curr_year, 1, 1),
                                                                                  end_dt=datetime(curr_year+time_bin_size, 1, 1)),
                                                   bin_size=time_bin_size)
                        # if prediction has been done successfully
                        if p != 0:
                            r = trainset.trainset_movie.get_movie_rating(movie_id=movie_id, user_id=user_id)
                            # Append (prediction, actual_rating)
                            predictions.append((p, r))
                    curr_year += time_bin_size
                runtime = default_timer() - start_time
                bin_rmse = Accuracy.rmse(predictions=predictions)
                iteration_results = {"bin_size": time_bin_size,
                                     "start_year": min_year + shift,
                                     "predictions": predictions,
                                     "rmse": bin_rmse,
                                     "runtime": runtime
                                     }
                result.append(iteration_results)

        data['result'] = result
        return data

In [9]:
c = TemporalCache(time_constraint=None, 
                  is_ratings_cached=True,
                  is_movies_cached=True,
                  is_movie_ratings_cached=True,
                  ratings=MovieLensDataset.load_ratings(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv'),
                  movies=MovieLensDataset.load_movies(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv'),
                  movie_ratings=MovieLensDataset.load(),
                  is_user_correlations_cached=True,
                  use_bulk_corr_cache=True)

In [10]:
t = Trainset(cache=c, min_common_elements=5)
e = Evaluator(trainset=t)

## Timebin Based Predictions

In [11]:
dataset = MovieLensDataset(is_movies_cached=True, is_ratings_cached=True)

In [12]:
def find_timebin_corr(ratings, timebin, avg_rating, user_id, timebin_time_constraint):
    curr_bin = t.trainset_movie.get_movies_watched(user_id, timebin_time_constraint)
    merged = curr_bin.merge(timebin, on='item_id')
    common_elements = len(merged)
    
    t.trainset_user.get_user_avg_at(user_id,)
    user_avg_rating = t.trainset_user.get_user_avg(user_id)
    numenator = ((merged['rating_x'] - avg_rating) * (merged['rating_y'] - user_avg_rating)).sum()
    denominator = math.sqrt(((merged['rating_x'] - avg_rating) ** 2).sum())
    denominator *= math.sqrt(((merged['rating_y'] - user_avg_rating) ** 2).sum())
    pearson = numenator / denominator
    
    return pearson, common_elements

In [13]:
def get_random_movies_watched_from_timebin(timebin, user_id: int, n=2) -> pd.DataFrame:
    """
    Get random n movies watched by the user. 
    """
    return sample(timebin.index.to_list(),n)

In [14]:
def get_timebin_size(tc: TimeConstraint):
    return abs((tc.start_dt - tc.end_dt).days)

In [15]:
def get_movies_watched(ratings, user_id: int, time_constraint: TimeConstraint = None) -> pd.DataFrame:
    """
    Get all the movies watched by the chosen user.

    :param user_id: the user that we want to get the movies he-she has watched.
    :param time_constraint: type of the time constraint.
    :return: DataFrame of all movies watched with 'item_id', 'rating' columns
    """

    movie_ratings = ratings
    
    if time_constraint is None:
        return movie_ratings.loc[(movie_ratings['user_id'] == user_id)][['item_id', 'rating', 'timestamp']].set_index('item_id')

    if time_constraint.is_valid_max_limit():
        return movie_ratings.loc[(movie_ratings['user_id'] == user_id)
                                 & (movie_ratings.timestamp < time_constraint.end_dt)][['item_id', 'rating', 'timestamp']].set_index('item_id')
    elif time_constraint.is_valid_time_bin():
        return movie_ratings.loc[(movie_ratings['user_id'] == user_id)
                                 & (movie_ratings.timestamp >= time_constraint.start_dt)
                                 & (movie_ratings.timestamp < time_constraint.end_dt)][['item_id', 'rating', 'timestamp']].set_index('item_id')
    raise Exception("Undefined time_constraint is given!")

In [16]:
def get_timebin(ratings, user_id: int, time_constraint) -> pd.DataFrame:
    return get_movies_watched(ratings, user_id=user_id, time_constraint=time_constraint)

In [17]:
def get_timebin_neighbours(ratings, user_id, time_constraint, k:int):
    # Get the user timebin
    timebin = get_timebin(ratings, user_id, time_constraint)
    
    # Count number of common ratings with other users
    userlist = [0 for i in range(611)]
    for movie_id in timebin.index.values.tolist():
        users_who_watched = ratings.loc[(ratings['item_id'] == 3) & (ratings['timestamp'] < time_constraint.end_dt)][['user_id']].values.tolist()
        for user_who_watched in users_who_watched:
            userlist[user_who_watched[0]] += 1
    
    # Filter k of them
    neighbour_id_list = []
    for i in range(0, 611):
        if userlist[i] > k:
            neighbour_id_list.append(i)
    return neighbour_id_list

In [18]:
get_timebin_neighbours(dataset.ratings, 443, TimeConstraint(start_dt=datetime(year=2017, month=1, day=1), end_dt=datetime.now()), k = 5)

[1,
 6,
 19,
 32,
 42,
 43,
 44,
 51,
 58,
 64,
 68,
 91,
 100,
 102,
 116,
 117,
 150,
 151,
 169,
 179,
 217,
 226,
 240,
 269,
 270,
 288,
 289,
 294,
 302,
 307,
 308,
 321,
 330,
 337,
 368,
 410,
 414,
 448,
 456,
 470,
 477,
 480,
 492,
 501,
 544,
 552,
 555,
 588,
 590,
 594,
 599,
 608]

In [39]:
def get_prevalent_genre(timebin):
    genre_voting = defaultdict(int)
    for row in timebin.itertuples(index=False):
        genres = row[4]
        for genre in genres.split("|"): 
            genre_voting[genre] += 1
    return max(genre_voting.items(), key=lambda a: a[1])

In [48]:
def get_timebin_length(movies, timebin) -> timedelta:
    timebin = pd.merge(timebin, movies, left_index=True, right_on='item_id')
    genre = get_prevalent_genre(timebin)   # like ('action', 10)
    first_item_ts = timebin.iloc[-1][1]
    last_item_ts = None
    for i in range(len(timebin)):
        if 'Action' in timebin.iloc[-i][4].split("|"):
            last_item_ts = timebin.iloc[-i][1]
    return first_item_ts - last_item_ts

In [83]:
def get_most_similar_timebins(ratings, movies, user_id, time_constraint, k, n, corr_treshold):
    """
    k: Komşunun genel ortak film sayısı
    n: timebin içindeki ortak film sayısı
    """
    # Get the user timebin
    timebin = get_timebin(ratings, user_id, time_constraint)
    
    # Get timebin length using most prevalent genre
    timebin_length = int(get_timebin_length(movies, timebin).total_seconds())
    
    if timebin_length < 86_400_000:
        timebin_length *= 10_000
    
    # Define limits
    first_timestamp = ratings['timestamp'].min()
    max_timebin_length = int(abs(first_timestamp - time_constraint.end_dt).total_seconds())
    
    # Neighbours contains users who has watched the movies in common
    neighbours = get_timebin_neighbours(ratings, user_id, time_constraint, k)
    
    # Avg rating of the user
    avg_rating = t.trainset_user.get_user_avg(user_id)
    
    data = list()
    start_time = default_timer()
    for timebin_size in range(timebin_length, max_timebin_length, timebin_length):
        for shift in range(0, timebin_size, timebin_size//10):    # make 10 start time shift
            start_dt = first_timestamp + timedelta(seconds=shift)    # assign start time 
            curr_dt = start_dt
            while (curr_dt + timedelta(seconds=timebin_length)) < time_constraint.end_dt:
                for user_id in neighbours:   
                    end_dt = curr_dt + timedelta(days=timebin_length)
                    # find curr_timebin
                    timebin_time_constraint = TimeConstraint(start_dt=curr_dt, end_dt=end_dt)
                    corr, common_elements = find_timebin_corr(ratings, timebin=timebin, avg_rating=avg_rating, 
                                             user_id=user_id, timebin_time_constraint=timebin_time_constraint)
                    # if and only if more than n movies in common rated in the temporal time bin, get the correlations
                    if not math.isnan(corr) and common_elements > n and corr > corr_treshold:
                        data.append( (user_id, curr_dt, timebin_size, corr) )
                        print( (user_id, curr_dt, timebin_size, corr) )
                        
                curr_dt = end_dt                        #curr_dt + timedelta(days=timebin_size)
                #print(f"start={start_dt} -  curr={curr_dt}, shift={shift}, timebin_size ={timebin_size}")
    runtime = default_timer() - start_time
    print(f"runtime={runtime}")    
    return pd.DataFrame(data, columns=['user_id', 'start_dt', 'bin_size_in_days', 'pearson_corr'])

In [84]:
df = get_most_similar_timebins(dataset.ratings,dataset.movies, 443,
                               TimeConstraint(start_dt=datetime(year=2017, month=1, day=1), end_dt=datetime.now()), 5, 3, 0.5)

OverflowError: int too big to convert

In [22]:
df.sort_values(['bin_size_in_days', 'pearson_corr'], ascending=[True, False], inplace=True)

In [23]:
df

Unnamed: 0,user_id,start_dt,bin_size_in_days,pearson_corr
16,414,1999-08-25 18:36:55,1244,0.943269
48,414,1999-12-27 18:36:55,1244,0.943269
71,414,2000-04-29 18:36:55,1244,0.943269
242,414,1999-08-21 18:36:55,1244,0.943269
88,414,1997-04-05 18:36:55,1244,0.935812
...,...,...,...,...
946,217,1996-03-29 18:36:55,8708,-0.514511
953,308,1996-03-29 18:36:55,8708,-0.532620
942,91,1996-03-29 18:36:55,8708,-0.552446
941,68,1996-03-29 18:36:55,8708,-0.597608


In [24]:
df.head(25)

Unnamed: 0,user_id,start_dt,bin_size_in_days,pearson_corr
16,414,1999-08-25 18:36:55,1244,0.943269
48,414,1999-12-27 18:36:55,1244,0.943269
71,414,2000-04-29 18:36:55,1244,0.943269
242,414,1999-08-21 18:36:55,1244,0.943269
88,414,1997-04-05 18:36:55,1244,0.935812
111,414,1997-08-07 18:36:55,1244,0.935812
133,414,1997-12-09 18:36:55,1244,0.935812
156,414,1998-04-12 18:36:55,1244,0.935812
178,414,1998-08-14 18:36:55,1244,0.935812
200,414,1998-12-16 18:36:55,1244,0.935812


In [25]:
df.sort_values(by='pearson_corr', ascending=False).drop_duplicates('user_id').set_index('user_id')

Unnamed: 0_level_0,start_dt,bin_size_in_days,pearson_corr
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
414,1999-08-25 18:36:55,1244,0.943269
169,1997-08-08 18:36:55,4976,0.929164
555,1999-08-21 18:36:55,2488,0.922836
32,1996-03-29 18:36:55,2488,0.830366
288,1998-12-16 18:36:55,1244,0.788232
179,1996-03-29 18:36:55,6220,0.757158
590,2009-11-08 18:36:55,1244,0.69853
480,2006-06-13 18:36:55,2488,0.666486
43,1996-03-29 18:36:55,3732,0.66517
477,2000-04-29 18:36:55,3732,0.642239


In [26]:
tc = TimeConstraint(start_dt=datetime(year=2017, month=1, day=1), end_dt=datetime.now())
tb = get_timebin(dataset.ratings, user_id=443, time_constraint=tc)
tb

Unnamed: 0_level_0,rating,timestamp
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.0,2017-08-03 01:08:02
110,4.5,2017-08-03 01:07:57
260,4.0,2017-08-03 01:07:45
296,4.0,2017-08-03 01:07:47
318,5.0,2017-08-03 01:07:38
356,5.0,2017-08-03 01:07:41
608,5.0,2017-08-03 01:08:15
5952,5.0,2017-08-03 01:08:17
7153,5.0,2017-08-03 01:08:18
79132,1.0,2017-08-03 01:11:35


In [27]:
def predict_movies_watched(timebin, user_id, timeconstraint):
    movies_watched = timebin.index.to_list()
    predictions = list()
    for movie in movies_watched:
        prediction = t.predict_movie(user_id=user_id, movie_id=movie, k=10)
        if prediction != 0:
            actual = t.trainset_movie.get_movie_rating(movie_id=movie, user_id=user_id)
            predictions.append((prediction, actual))
    return predictions

In [28]:
predictions = predict_movies_watched(timebin=tb, user_id=443, timeconstraint=tc)
predictions

[(4.121621621621622, 4.5),
 (3.5412895265381175, 4.0),
 (3.9815086592962965, 4.0),
 (4.726583392125758, 5.0),
 (4.511206121155887, 5.0),
 (5, 5.0),
 (4.900062273990611, 5.0),
 (4.900062273990611, 5.0),
 (3.939462223865436, 1.0),
 (4.042003787226717, 5.0)]

In [29]:
Accuracy.rmse(predictions)

1.024577353020557

In [30]:
def get_timebin_neighbours_data(ratings, timebin, similar_timebins):
    data = defaultdict(list)
    for row in similar_timebins.itertuples(index=False):
        neighbour_id = row[0]
        start_dt = row[1]
        timebin_size = row[2]
        corr = row[3]
        end_dt = start_dt + timedelta(seconds=timebin_size)
        timebin_tc = TimeConstraint(start_dt=start_dt, end_dt=end_dt)
        neighbour_bin = get_timebin(ratings, user_id=neighbour_id, time_constraint=timebin_tc)
        merged_bin = pd.merge(timebin, neighbour_bin, left_index=True, right_index=True)
        for bin_row in merged_bin.itertuples(index=True):
            curr_movie = bin_row[0]
            neighbour_rating = bin_row[3]
            #print(bin_row[0], bin_row[1], bin_row[2], bin_row[3], bin_row[4])
            data[curr_movie].append( (neighbour_rating, corr) )
    return data

In [31]:
data = get_timebin_neighbours_data(dataset.ratings, timebin=tb, similar_timebins=df)

In [32]:
def predict_movies_watched_using_timebin_neighbours(data, user_id, min_neighbour_count=5):
    predictions = list()
    for movie_id, rating_corr_list in data.items():
        weighted_sum = 0
        weight_sum = 0
        count = 0
        for rating_corr in rating_corr_list:
            count += 1
            rating = rating_corr[0]
            corr = rating_corr[1]
            weighted_sum += rating * corr
            weight_sum += corr
        if count < min_neighbour_count:         # if less than min_neighbour_count neighbour found, pass
            continue
        prediction = weighted_sum / weight_sum
        actual = t.trainset_movie.get_movie_rating(movie_id=movie_id, user_id=user_id)
        predictions.append( (prediction, actual) )
    return predictions

In [33]:
predictions_using_timebins = predict_movies_watched_using_timebin_neighbours(data, 443)
predictions_using_timebins

[(3.9154652826623155, 4.0),
 (4.695140644695054, 4.5),
 (4.869577181383489, 4.0),
 (4.33078315429006, 4.0),
 (4.890852323386359, 5.0),
 (4.889431782559287, 5.0),
 (4.384031184453677, 5.0),
 (4.6369288051115065, 5.0),
 (4.436814972732457, 5.0)]

In [34]:
Accuracy.rmse(predictions_using_timebins)

0.19592912827023481

In [35]:
print(f"Normal: {Accuracy.rmse(predictions)} \t Timebin:{Accuracy.rmse(predictions_using_timebins)}")

Normal: 1.024577353020557 	 Timebin:0.19592912827023481


In [36]:
def compare_normal_and_timebin_predictions(ratings, user_id, time_constraint):
    tc = time_constraint
    timebin = get_timebin(dataset.ratings, user_id=user_id, time_constraint=tc)
    normal_rmse = Accuracy.rmse(predict_movies_watched(timebin=timebin, user_id=user_id, timeconstraint=tc))
    
    # k -> komşunun en az ortak film sayısı, n -> benzer timebin içindeki en az ortak film sayısı
    similar_timebins = get_most_similar_timebins(ratings, user_id,tc, k=5, n=3, corr_threshold=0.5)
    data = get_timebin_neighbours_data(ratings, timebin, similar_timebins)
    
    # En az 'min_neighbour_count' sayıda farklı timebin'den veri gelmediyse o filme tahmin yapma.
    predictions = predict_movies_watched_using_timebin_neighbours(data, user_id, min_neighbour_count=5)
    
    predictions_using_timebins = Accuracy.rmse(predictions)
    
    print(f"Normal RMSE: {normal_rmse} \t Timebin RMSE:{predictions_using_timebins}")

In [37]:
compare_normal_and_timebin_predictions(dataset.ratings, 443, TimeConstraint(start_dt=datetime(year=2017, month=1, day=1), end_dt=datetime.now()))

  # Remove the CWD from sys.path while we load stuff.


runtime=38.806789300000005
Normal RMSE: 1.024577353020557 	 Timebin RMSE:0.19592912827023407


In [38]:
compare_normal_and_timebin_predictions(dataset.ratings, 610, TimeConstraint(start_dt=datetime(year=2017, month=5, day=3), end_dt=datetime.now()))

  # Remove the CWD from sys.path while we load stuff.


runtime=38.046881299999995
Normal RMSE: 0.6604464424540253 	 Timebin RMSE:0.5938418606262376


In [44]:
def compare_normal_and_timebin_predictions_(ratings, n, s):
    """
    Compare n number of users
    """
    count = 0
    while count < n:
        # Rastgele kullanıcı seç
        user_id = random.randint(0,610)
        # Rastgele Filmini seç
        movie_id = t.trainset_movie.get_random_movie_watched(user_id)
        movie_ts = t.trainset_user.get_timestamp(user_id, movie_id)
        # Filmi izlemeden önceki s adet filmden olustan timebini al
        movies_watched_until_the_movie_ts = get_movies_watched(ratings, user_id, TimeConstraint(end_dt=movie_ts))
        if len(movies_watched_until_the_movie_ts) < s:
            continue
        else:
            count += 1
        # s inci filmin timestamp i ile baslayip, aradığımız movie_ts ile sonlanan veriyi timebin olarak al
        tc = TimeConstraint(start_dt=movies_watched_until_the_movie_ts.iloc[-s]['timestamp'], end_dt=movie_ts)
        timebin = get_timebin(dataset.ratings, user_id=user_id, time_constraint=tc)
        
        # Normal olarak, timebin içinde bulunan tüm filmlere, kullanıcının vereceği puanları tahmin et 
        normal_rmse = Accuracy.rmse(predict_movies_watched(timebin=timebin, user_id=user_id, timeconstraint=tc))

        # k -> komşunun en az ortak film sayısı, n -> benzer timebin içindeki en az ortak film sayısı
        similar_timebins = get_most_similar_timebins(ratings, user_id, tc, k=5, n=3)

        # Komşuların verilerini topla, bunu yaparken 0.5den aşşağı benzerliği olanları alma.
        data = get_timebin_neighbours_data(ratings, timebin, similar_timebins, corr_threshold=0.5)

        # En az 'min_neighbour_count' sayıda farklı timebin'den veri gelmediyse o filme tahmin yapma.
        predictions = predict_movies_watched_using_timebin_neighbours(data, user_id, min_neighbour_count=5)

        predictions_using_timebins = Accuracy.rmse(predictions)

        print(f"Normal RMSE: {normal_rmse} \t Timebin RMSE:{predictions_using_timebins}")

In [46]:
compare_normal_and_timebin_predictions_(dataset.ratings, n=100, s=10)

  # Remove the CWD from sys.path while we load stuff.


runtime=45.722531000000004
Normal RMSE: 0.1503932887861746 	 Timebin RMSE:0.40544615081910407
runtime=45.55319700000007
Normal RMSE: 0.09246261120125261 	 Timebin RMSE:0.449128347285121
runtime=71.1980082
Normal RMSE: 0.7753194471566357 	 Timebin RMSE:0.7137459934989867
runtime=44.4741798
Normal RMSE: 0.49461239586001154 	 Timebin RMSE:0.5545947973703461
runtime=44.71568979999995
Normal RMSE: 0.10601297823384664 	 Timebin RMSE:0.39214941667097863
runtime=56.30889410000009
Normal RMSE: 0.02934837375886094 	 Timebin RMSE:0.30137117670610913
runtime=90.01689240000007
Normal RMSE: 0.5093925950832184 	 Timebin RMSE:0.265049588203881


Exception: Undefined time_constraint is given!

In [49]:
mov = get_movies_watched(dataset.ratings, 583)

In [59]:
t.trainset_movie.get_random_movie_watched(610)

6764

In [60]:
t.trainset_movie.get_movie_rating(6764, 610)

3.0

In [61]:
t.trainset_user.get_timestamp(610, 6764)

Timestamp('2017-05-03 21:28:06')

In [54]:
random.randint(0,610)

548

In [62]:
ff = get_movies_watched(dataset.ratings, 610, TimeConstraint(end_dt=t.trainset_user.get_timestamp(610, 6764)))
ff

Unnamed: 0_level_0,rating,timestamp
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.0,2016-11-19 08:08:20
16,4.5,2016-11-19 07:56:11
32,4.5,2016-11-19 08:15:31
47,5.0,2016-11-19 08:57:33
50,4.0,2017-05-03 20:52:37
...,...,...
160527,4.5,2016-11-19 08:43:18
160836,3.0,2017-05-03 20:53:14
164179,5.0,2017-05-03 21:07:11
168252,5.0,2017-05-03 21:19:12


In [56]:
len(ff)

2

In [57]:
ff.iloc[-2]['timestamp']

Timestamp('2016-12-11 16:35:36')

In [84]:
fd = pd.merge(ff, dataset.movies, left_index=True, right_on='item_id').iloc[-50:]
fd

Unnamed: 0,rating,timestamp,item_id,title,genres,year
8571,4.0,2017-05-03 21:06:10,116823,The Hunger Games: Mockingjay - Part 1,Adventure|Sci-Fi|Thriller,2014.0
8577,3.0,2016-11-19 08:31:18,116977,Dumb and Dumber To,Comedy,2014.0
8589,3.5,2017-05-03 21:15:19,117529,Jurassic World,Action|Adventure|Drama|Sci-Fi|Thriller,2015.0
8635,4.5,2017-05-03 21:14:04,119145,Kingsman: The Secret Service,Action|Adventure|Comedy|Crime,2015.0
8646,3.5,2017-05-03 21:08:43,120466,Chappie,Action|Thriller,2015.0
8653,2.5,2017-05-03 21:10:42,120799,Terminator Genisys,Action|Adventure|Sci-Fi|Thriller,2015.0
8680,5.0,2017-05-03 21:04:04,122882,Mad Max: Fury Road,Action|Adventure|Sci-Fi|Thriller,2015.0
8682,4.5,2016-11-19 08:36:29,122886,Star Wars: Episode VII - The Force Awakens,Action|Adventure|Fantasy|Sci-Fi|IMAX,2015.0
8685,4.0,2016-11-19 08:03:26,122892,Avengers: Age of Ultron,Action|Adventure|Sci-Fi,2015.0
8688,3.5,2017-05-03 21:15:26,122900,Ant-Man,Action|Adventure|Sci-Fi,2015.0


In [85]:
fd

Unnamed: 0,rating,timestamp,item_id,title,genres,year
8571,4.0,2017-05-03 21:06:10,116823,The Hunger Games: Mockingjay - Part 1,Adventure|Sci-Fi|Thriller,2014.0
8577,3.0,2016-11-19 08:31:18,116977,Dumb and Dumber To,Comedy,2014.0
8589,3.5,2017-05-03 21:15:19,117529,Jurassic World,Action|Adventure|Drama|Sci-Fi|Thriller,2015.0
8635,4.5,2017-05-03 21:14:04,119145,Kingsman: The Secret Service,Action|Adventure|Comedy|Crime,2015.0
8646,3.5,2017-05-03 21:08:43,120466,Chappie,Action|Thriller,2015.0
8653,2.5,2017-05-03 21:10:42,120799,Terminator Genisys,Action|Adventure|Sci-Fi|Thriller,2015.0
8680,5.0,2017-05-03 21:04:04,122882,Mad Max: Fury Road,Action|Adventure|Sci-Fi|Thriller,2015.0
8682,4.5,2016-11-19 08:36:29,122886,Star Wars: Episode VII - The Force Awakens,Action|Adventure|Fantasy|Sci-Fi|IMAX,2015.0
8685,4.0,2016-11-19 08:03:26,122892,Avengers: Age of Ultron,Action|Adventure|Sci-Fi,2015.0
8688,3.5,2017-05-03 21:15:26,122900,Ant-Man,Action|Adventure|Sci-Fi,2015.0


In [25]:
sss = abs(datetime(2015,5,5) - datetime(2020,5,5))

In [33]:
sss.total_seconds()

157852800.0

In [35]:
p = sss + timedelta(seconds=10)

In [36]:
p.total_seconds()

157852810.0