In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import *
from statistics import mean
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from pandas.core.common import SettingWithCopyWarning
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import pickle
import sys
from sys import exc_info

In [None]:
# Read only the three columns used below, with compact dtypes.
user_rating_data_df = pd.read_csv(
    './datasets/user_score_data.csv',
    usecols=['user_id', 'mal_id', 'rating'],
    dtype={'user_id': 'int32', 'mal_id': 'int32', 'rating': 'float32'},
)

## Filtering the dataset to keep only ratings of 4 or higher

In [None]:
# Keep the rows whose rating is at least 4.0.
ratings = user_rating_data_df.loc[user_rating_data_df['rating'].ge(4.0)]

In [None]:
# Project the filtered ratings onto the (user, title) pair.
fav_movies = ratings[['user_id', 'mal_id']]

In [None]:
# Renumber the rows of the two-column selection made in the previous cell.
# The original rebuilt `fav_movies` from `ratings`, which silently discarded
# that selection and carried the unused 'rating' column through the notebook.
fav_movies = fav_movies.reset_index(drop = True)

In [None]:
# Transposed view of the favourites frame (one column per row of data).
fav_movies.transpose()

In [None]:
# Persist the favourites frame; the row index is written along with the data.
fav_movies.to_csv(path_or_buf='./datasets/filtered_ratings.csv', index=True)

In [None]:
def moviesList(users, users_data):
    """Build one comma-separated string of liked title IDs per user.

    Parameters
    ----------
    users : iterable
        User IDs to look up; the output follows this order.
    users_data : pandas.DataFrame
        Frame with at least the columns 'user_id' and 'mal_id'.

    Returns
    -------
    list of str
        For each entry of `users`, that user's 'mal_id' values (in row order)
        joined with ", "; an empty string when the user has no rows.
    """
    # Group once instead of re-scanning the whole frame for every user, and
    # join the IDs directly rather than slicing the repr of a Python list.
    ids_by_user = {
        user_id: ', '.join(map(str, mal_ids))
        for user_id, mal_ids in users_data.groupby('user_id', sort=False)['mal_id']
    }
    return [ids_by_user.get(user, '') for user in users]

In [None]:
# Sorted array of the distinct user IDs that have at least one favourite.
users = np.unique(fav_movies['user_id'].to_numpy())
print(users.shape)

In [None]:
# One "id, id, ..." string per user, in the same order as `users`.
users_movies_list = moviesList(users=users, users_data=fav_movies)

In [None]:
def SparseMatrix(list_of_str):
    """Turn per-user ID strings into a dense user x title count matrix.

    Parameters
    ----------
    list_of_str : list of str
        One string per user holding that user's title IDs separated by
        commas and/or spaces.

    Returns
    -------
    tuple
        ``(matrix, feature_names)`` where `matrix` is the dense array produced
        by ``CountVectorizer.fit_transform(...).toarray()`` and `feature_names`
        is a list of the tokens (title IDs as strings) in column order.
    """
    # Every run of characters that is neither a comma nor a space is one token.
    cv = CountVectorizer(token_pattern = r'[^\,\ ]+', lowercase = False)
    counts = cv.fit_transform(list_of_str)
    # `get_feature_names` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()
    return counts.toarray(), feature_names

In [None]:
# Dense count matrix plus the title-ID label of each column.
sparseMatrix, feature_names = SparseMatrix(list_of_str=users_movies_list)

In [None]:
# Label the count matrix: rows are user IDs, columns are title IDs (as strings).
df_sparse = pd.DataFrame(data=sparseMatrix, index=users, columns=feature_names)
df_sparse

In [None]:
# Favourites of the six lowest user IDs, ordered by user.
first_6_users_SM = (
    fav_movies
    .loc[fav_movies['user_id'].isin(users[:6])]
    .sort_values(by='user_id')
)
first_6_users_SM.transpose()

In [None]:
# Slice of the count matrix for those users and the titles they liked;
# column labels are strings, so the numeric IDs are converted first.
df_sparse.loc[
    np.unique(first_6_users_SM['user_id']),
    [str(mal_id) for mal_id in np.unique(first_6_users_SM['mal_id'])],
]

## K-Means Clustering

In [None]:
# Cluster users by their favourites; the fixed seed makes the labels repeatable.
kmeans = KMeans(
    n_clusters=15,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
clusters = kmeans.fit_predict(sparseMatrix)

In [None]:
# Pair each user ID with the cluster label it was assigned.
users_cluster = pd.DataFrame(
    np.column_stack((users, clusters)),
    columns=['user_id', 'Cluster'],
)
users_cluster.transpose()

In [None]:
def clustersMovies(users_cluster, users_data):
    """Count, per cluster, how many member users liked each title.

    Parameters
    ----------
    users_cluster : pandas.DataFrame
        Frame with columns 'user_id' and 'Cluster' (integer labels from 0).
    users_data : pandas.DataFrame
        Frame with columns 'user_id' and 'mal_id'.

    Returns
    -------
    list of pandas.DataFrame
        One frame per cluster label, at list position == label, with columns
        'mal_id' and 'Count', sorted by 'Count' in descending order. A label
        with no users yields an empty frame.
    """
    if len(users_cluster) == 0:
        return []
    # Index each user's titles once instead of filtering the frame per user.
    movies_by_user = {
        user_id: mal_ids.tolist()
        for user_id, mal_ids in users_data.groupby('user_id', sort=False)['mal_id']
    }
    # Callers index the result by cluster label, so cover every label up to
    # the maximum; counting distinct labels would drop the highest cluster
    # whenever a lower label happens to be unused.
    n_clusters = int(users_cluster['Cluster'].max()) + 1
    each_cluster_movies = []
    for label in range(n_clusters):
        members = users_cluster.loc[users_cluster['Cluster'] == label, 'user_id']
        liked = [movie for user in members for movie in movies_by_user.get(user, [])]
        # One pass gives the sorted distinct titles with their frequencies,
        # replacing a `list.count` call per distinct title.
        movie_ids, counts = np.unique(liked, return_counts=True)
        frame = pd.DataFrame(
            [[movie, count] for movie, count in zip(movie_ids, counts)],
            columns=['mal_id', 'Count'],
        )
        each_cluster_movies.append(
            frame.sort_values(by = ['Count'], ascending = False).reset_index(drop=True)
        )
    return each_cluster_movies
# Per-cluster title popularity tables, indexed by cluster label.
cluster_movies = clustersMovies(users_cluster=users_cluster, users_data=fav_movies)

In [None]:
# Transposed popularity table of cluster 1.
cluster_movies[1].transpose()

In [None]:
# Report how many users landed in each cluster. The cluster count is read from
# the fitted model so it cannot drift from the `n_clusters` setting.
for i in range(kmeans.n_clusters):
    len_users = users_cluster[users_cluster['Cluster'] == i].shape[0]
    print('Users in Cluster ' + str(i) + ' -> ', len_users)

## Get user favourite movie list

In [None]:
def userFav(user_id, users_data):
    """Return the 'mal_id' values of every row in `users_data` for `user_id`.

    The result is a plain list in row order; it is empty when the user has no
    rows.
    """
    belongs_to_user = users_data['user_id'] == user_id
    return users_data.loc[belongs_to_user, 'mal_id'].tolist()

In [None]:
def fixClusters(clusters_movies_dataframes, users_cluster_dataframe, users_data, smallest_cluster_size = 11):
    """Dissolve undersized clusters and re-home their users.

    Clusters with fewer than `smallest_cluster_size` users are removed and the
    remaining clusters are renumbered so labels stay contiguous. Each user of
    a removed cluster is then assigned to the surviving cluster that contains
    the largest share of that user's titles, and the titles the cluster did
    not yet list are added to it with a count of 1.

    Parameters
    ----------
    clusters_movies_dataframes : list of pandas.DataFrame
        One frame per cluster (list position == cluster label) with columns
        'mal_id' and 'Count'.
    users_cluster_dataframe : pandas.DataFrame
        Frame with columns 'user_id' and 'Cluster'.
    users_data : pandas.DataFrame
        Frame with columns 'user_id' and 'mal_id'.
    smallest_cluster_size : int, default 11
        Minimum number of users a cluster needs in order to survive.

    Returns
    -------
    tuple
        ``(each_cluster_movies, users_cluster)``: the surviving cluster frames
        in their new label order, and a copy of the assignments with updated
        labels. The inputs are not modified.

    Raises
    ------
    ValueError
        If users need a new cluster but no cluster survived.
    """
    users_cluster = users_cluster_dataframe.copy()

    each_cluster_movies = []
    uncategorizedUsers = []
    for label, frame in enumerate(clusters_movies_dataframes):
        # Membership is read from the untouched input, i.e. the original labels.
        members = list(
            users_cluster_dataframe.loc[users_cluster_dataframe['Cluster'] == label, 'user_id']
        )
        if len(members) < smallest_cluster_size:
            uncategorizedUsers.extend(members)
            # The dissolved cluster would have taken the next free position;
            # shift every later label down by one to close the gap. Its own
            # users keep a stale label until they are reassigned below.
            users_cluster.loc[users_cluster['Cluster'] > len(each_cluster_movies), 'Cluster'] -= 1
        else:
            each_cluster_movies.append(frame)

    if uncategorizedUsers and not each_cluster_movies:
        raise ValueError('no cluster has at least {0} users; nothing to reassign users to'
                         .format(smallest_cluster_size))

    # Scoring uses the clusters' titles as they were before any reassignment;
    # sets make each membership test constant-time.
    cluster_movie_sets = [set(frame['mal_id']) for frame in each_cluster_movies]

    for user in uncategorizedUsers:
        user_movies = userFav(user, users_data)
        if len(user_movies) == 0:
            print(user)
        elemProbability = []
        user_missed_movies = []
        for known_movies in cluster_movie_sets:
            missed_movies = [movie for movie in user_movies if movie not in known_movies]
            matched = len(user_movies) - len(missed_movies)
            # A user without any history matches every cluster equally (0.0)
            # instead of triggering a division by zero.
            elemProbability.append(matched / len(user_movies) if user_movies else 0.0)
            user_missed_movies.append(missed_movies)
        user_new_cluster = int(np.argmax(elemProbability))
        users_cluster.loc[users_cluster['user_id'] == user, 'Cluster'] = user_new_cluster
        new_movies = user_missed_movies[user_new_cluster]
        if len(new_movies) > 0:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported equivalent.
            additions = pd.DataFrame({'mal_id': new_movies, 'Count': 1})
            each_cluster_movies[user_new_cluster] = pd.concat(
                [each_cluster_movies[user_new_cluster], additions], ignore_index = True
            )
    return each_cluster_movies, users_cluster

In [None]:
# Dissolve clusters with fewer than six users and re-home their members.
movies_df_fixed, clusters_fixed = fixClusters(
    clusters_movies_dataframes=cluster_movies,
    users_cluster_dataframe=users_cluster,
    users_data=fav_movies,
    smallest_cluster_size=6,
)

In [None]:
class saveLoadFiles:
    """Persist and restore notebook artefacts as pickle files under 'datasets/'."""

    def save(self, filename, data):
        """Pickle `data` to 'datasets/<filename>.pkl'.

        Returns ``[True]`` on success, or ``[False, message]`` (after printing
        the message) when the file cannot be opened or the data cannot be
        pickled.
        """
        try:
            # `with` closes the handle on every path; the original called
            # `file.close()` in the handler even when `open` itself had failed,
            # which raised on the unbound name and hid the real error.
            with open('datasets/' + filename + '.pkl', 'wb') as file:
                pickle.dump(data, file)
        except Exception as exc:
            err = 'Error: {0}, {1}'.format(type(exc), exc)
            print(err)
            return [False, err]
        return [True]

    def load(self, filename):
        """Unpickle and return the object stored in 'datasets/<filename>.pkl'.

        Returns ``[False, message]`` (after printing the message) when the file
        cannot be opened. Errors raised while unpickling propagate to the
        caller.
        """
        try:
            file = open('datasets/' + filename + '.pkl', 'rb')
        except Exception as exc:
            err = 'Error: {0}, {1}'.format(type(exc), exc)
            print(err)
            return [False, err]
        # SECURITY: unpickling can execute arbitrary code; only load files that
        # this notebook wrote itself.
        with file:
            return pickle.load(file)

    def load_Dataset(self):
        """Load the per-cluster title frames."""
        return self.load('clusters_movies_dataset')

    def save_Dataset(self, data):
        """Save the per-cluster title frames."""
        return self.save('clusters_movies_dataset', data)

    def load_Clusters(self):
        """Load the user-to-cluster assignments."""
        return self.load('users_clusters')

    def save_Clusters(self, data):
        """Save the user-to-cluster assignments."""
        return self.save('users_clusters', data)

In [None]:
saveLoadFile = saveLoadFiles()
# Write this run's results before reading them back. Without these two calls
# the pickles are never produced by the notebook, so a fresh "Run All" would
# either fail here or pick up files left behind by an earlier session.
saveLoadFile.save_Dataset(movies_df_fixed)
saveLoadFile.save_Clusters(clusters_fixed)
load_movies_list = saveLoadFile.load_Dataset()
load_users_clusters = saveLoadFile.load_Clusters()

## Creating the class for the recommendation system

In [None]:
class userRequestedFor:
    """Recommendation helper bound to one user and that user's cluster.

    On construction the saved cluster assignments and per-cluster title frames
    are loaded through `saveLoadFiles`, and the frame of the user's cluster is
    kept for lookups and updates.
    """

    def __init__(self, user_id, users_data):
        """Load the user's cluster and that cluster's title frame.

        Parameters
        ----------
        user_id
            ID of the user to serve.
        users_data : pandas.DataFrame
            Frame with columns 'user_id' and 'mal_id'; a copy is stored.

        Raises
        ------
        KeyError
            If `user_id` has no row in the saved cluster assignments.
        """
        self.users_data = users_data.copy()
        self.user_id = user_id
        # Find User Cluster
        users_cluster = saveLoadFiles().load_Clusters()
        matches = users_cluster.loc[users_cluster['user_id'] == self.user_id, 'Cluster']
        if matches.empty:
            raise KeyError('user_id {0!r} has no cluster assignment'.format(user_id))
        # Read the scalar explicitly; calling int() on a Series is deprecated.
        self.user_cluster = int(matches.iloc[0])
        # Load User Cluster Movies Dataframe
        self.movies_list = saveLoadFiles().load_Dataset()
        self.cluster_movies = self.movies_list[self.user_cluster] # dataframe
        self.cluster_movies_list = list(self.cluster_movies['mal_id']) # list

    def updatedFavouriteMoviesList(self, new_movie_Id):
        """Record that the user liked `new_movie_Id` and persist the result.

        Increments the title's 'Count' in the user's cluster frame, or adds
        the title with a count of 1 when the cluster does not list it yet. The
        frame is then re-sorted by 'Count' (descending), the cached ID list is
        refreshed, and the full dataset is saved through `saveLoadFiles`.
        """
        is_known = self.cluster_movies['mal_id'] == new_movie_Id
        if is_known.any():
            self.cluster_movies.loc[is_known, 'Count'] += 1
        else:
            # The new row must be keyed by 'mal_id'; the original wrote the ID
            # under 'user_id', creating a stray column and a row without a
            # title ID. DataFrame.append is also gone as of pandas 2.0.
            new_row = pd.DataFrame([{'mal_id': new_movie_Id, 'Count': 1}])
            self.cluster_movies = pd.concat([self.cluster_movies, new_row], ignore_index=True)
        self.cluster_movies = self.cluster_movies.sort_values(by = ['Count'], ascending = False)
        # Keep the cached list in step with the frame so later calls see the
        # new title and the new ordering.
        self.cluster_movies_list = list(self.cluster_movies['mal_id'])
        self.movies_list[self.user_cluster] = self.cluster_movies
        saveLoadFiles().save_Dataset(self.movies_list)

    def recommendMostGenres(self):
        """Return the cluster's titles that the user has not already liked.

        Returns
        -------
        list
            ``[True, title_ids]`` with the cluster's titles in their stored
            order, minus the user's own; or ``[False, message]`` (after
            printing the message) on failure.
        """
        try:
            already_liked = set(userFav(self.user_id, self.users_data))
            # Exclude what the user already has; the original appended those
            # titles a second time instead of leaving them out.
            recommendations = [
                movie for movie in self.cluster_movies_list if movie not in already_liked
            ]
            return [True, recommendations]
        except KeyError:
            err = "User history does not exist"
            print(err)
            return [False, err]
        except Exception as exc:
            err = 'Error: {0}, {1}'.format(type(exc), exc)
            print(err)
            return [False, err]

## Merging both dataframes together

In [None]:
# Title metadata: only the ID, title and genres columns are read.
animes_df = pd.read_csv(
    './datasets/anime_data.csv',
    usecols=['mal_id', 'title', 'genres'],
)
animes_df.head(n=3)

In [None]:
# Attach title metadata to each favourite, keeping one row per (user, title).
df = (
    fav_movies
    .merge(animes_df, on='mal_id')
    .drop_duplicates(subset=['user_id', 'title'])
)

In [None]:
# Display the merged favourites/metadata frame.
df

## Movie histories based on user_ID

In [None]:
# Print every title user 55 liked, followed by the genres of the first one.
title = list(df.loc[df['user_id'] == 55]['title'])
if title != []:
    print('Movie title: ', title, ', Genres: ', end = '')
    # Parse the whole "[...]" literal. Stripping the brackets first turned a
    # one-genre list into a bare string (then iterated character by character)
    # and failed on an empty list.
    # NOTE(review): assumes each 'genres' cell is a Python list literal of
    # strings - confirm against anime_data.csv.
    genres = ast.literal_eval(df.loc[df['user_id'] == 55]['genres'].values[0])
    for genre in genres:
        print(genre[:7], ' ', end = '')
    print('')

## Implementing user recommendations based on their genres and anime history

In [None]:
rec = userRequestedFor(2, fav_movies).recommendMostGenres()[1]
# NOTE(review): the loop variable `movie` is never used below - the genres
# printed are those of the first row of user 2's own history, not of the
# recommended title. Confirm the intended lookup.
for movie in rec[:1]:
    title = list(df.loc[df['user_id'] == 2]['title'])
    if title != []:
        # Parse the whole "[...]" literal; stripping the brackets first turned
        # a one-genre list into a bare string and failed on an empty list.
        genres = ast.literal_eval(df.loc[df['user_id'] == 2]['genres'].values[0])
        for genre in genres:
            print(genre[:20], ' ', end = '')
        print()

## The top anime that the user is most likely to watch can be obtained

In [None]:
anime_rec = df['title'].values

# Distinct titles in order of first appearance. dict.fromkeys de-duplicates in
# one pass, replacing a list-membership scan per title.
anime_rec_list = list(dict.fromkeys(anime_rec))

In [None]:
# Show the first entry of the de-duplicated title list.
# NOTE(review): this is the first title in `df` row order for all users; it is
# not ranked for any particular user - confirm this matches the heading above.
anime_rec_list[:1]