In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import *
from statistics import mean
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from pandas.core.common import SettingWithCopyWarning
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import pickle
import sys
from sys import exc_info

## Loading the data

Source and Tutorial: https://asdkazmi.medium.com/ai-movies-recommendation-system-with-clustering-based-k-means-algorithm-f04467e02fcd

In [37]:
user_rating = pd.read_csv('./datasets/user_score_data.csv', usecols=['user_id', 'mal_id', 'rating'], 
                                  dtype={'user_id':'int32', 'mal_id':'int32', 'rating':'float32'})

In [38]:
print('Shape of ratings dataset is: ', user_rating.shape, '\n')
print('Max values in dataset are \n', user_rating.max(), '\n')
print('Min values in dataset are \n', user_rating.min(), '\n') 

Shape of ratings dataset is:  (948898, 3) 

Max values in dataset are 
 user_id     2195.0
mal_id     42913.0
rating        10.0
dtype: float64 

Min values in dataset are 
 user_id    1.0
mal_id     1.0
rating     0.0
dtype: float64 



## Pick only data user that have 4+ rating

In [39]:
user_rating = user_rating[user_rating['rating'] >= 4.0]

In [40]:
users_list = np.unique(user_rating['user_id'])[:100]
ratings = user_rating.loc[user_rating['user_id'].isin(users_list)]

In [41]:
fav_movies = ratings.loc[:, ['user_id', 'mal_id']]

In [42]:
fav_movies = ratings.reset_index(drop = True)

In [43]:
fav_movies.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30359,30360,30361,30362,30363,30364,30365,30366,30367,30368
user_id,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mal_id,29978.0,2467.0,28789.0,34881.0,101.0,713.0,36032.0,656.0,1485.0,17901.0,...,4224.0,33352.0,10015.0,15489.0,21595.0,16576.0,1195.0,11319.0,1840.0,3712.0
rating,6.0,10.0,6.0,6.0,10.0,8.0,8.0,5.0,10.0,6.0,...,10.0,10.0,9.0,7.0,7.0,8.0,10.0,9.0,10.0,9.0


In [44]:
fav_movies.to_csv('./datasets/filtered_ratings.csv')

In [45]:
def userMovieList(users, users_data):
    # users = a list of users IDs
    # users_data = a dataframe of users favourite movies or users watched movies
    users_movies_list = []
    for user in users:
        users_movies_list.append(str(list(users_data[users_data['user_id'] == user]['mal_id'])).split('[')[1].split(']')[0])
    return users_movies_list

In [46]:
users = np.unique(fav_movies['user_id'])
print(users.shape) 

(100,)


In [47]:
users_movies_list = userMovieList(users, fav_movies)

## Sparse Matrix on the dataset

In [48]:
def prepSparseMatrix(list_of_str):
    # list_of_str = A list, which contain strings of users favourite movies separate by comma ",".
    cv = CountVectorizer(token_pattern = r'[^\,\ ]+', lowercase = False)
    sparseMatrix = cv.fit_transform(list_of_str)
    return sparseMatrix.toarray(), cv.get_feature_names()

In [49]:
sparseMatrix, feature_names = prepSparseMatrix(users_movies_list)



In [50]:
df_sparseMatrix = pd.DataFrame(sparseMatrix, index = users, columns = feature_names)
df_sparseMatrix

Unnamed: 0,1,100,1000,10012,10015,10017,1002,10020,10029,1003,...,996,9963,9969,997,9981,9982,9988,9989,9996,9999
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,1,0,0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
first_6_users_SM = fav_movies[fav_movies['user_id'].isin(users[:6])].sort_values('user_id')
first_6_users_SM.T

Unnamed: 0,0,299,298,297,296,295,294,293,292,291,...,2744,2743,2742,2741,2740,2739,2738,2737,2747,2812
user_id,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mal_id,29978.0,10620.0,3550.0,5962.0,333.0,518.0,10519.0,12101.0,25517.0,8310.0,...,32086.0,36308.0,12115.0,12113.0,10218.0,34055.0,32379.0,9513.0,8939.0,37521.0
rating,6.0,10.0,10.0,10.0,10.0,10.0,5.0,7.0,10.0,10.0,...,5.0,10.0,5.0,5.0,5.0,4.0,4.0,9.0,6.0,7.0


In [52]:
df_sparseMatrix.loc[np.unique(first_6_users_SM['user_id']), list(map(str, np.unique(first_6_users_SM['mal_id'])))]

Unnamed: 0,1,5,6,16,19,20,24,27,30,32,...,40936,41053,41168,41226,41570,41590,41611,42091,42519,42603
1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
2,1,1,1,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,1,1,1,0,1,1,...,1,1,1,0,0,0,0,0,1,1
4,0,0,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## K-Mean CLustering

In [53]:
kmeans = KMeans(n_clusters=15, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
clusters = kmeans.fit_predict(sparseMatrix)

In [54]:
users_cluster = pd.DataFrame(np.concatenate((users.reshape(-1,1), clusters.reshape(-1,1)), axis = 1), columns = ['user_id', 'Cluster'])
users_cluster.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
Cluster,9,13,2,5,9,5,5,9,9,1,...,1,9,9,9,5,0,9,9,5,9


## Creating movies of clusters

In [55]:
def clustersMovies(users_cluster, users_data):
    clusters = list(users_cluster['Cluster'])
    each_cluster_movies = list()
    for i in range(len(np.unique(clusters))):
        users_list = list(users_cluster[users_cluster['Cluster'] == i]['user_id'])
        users_movies_list = list()
        for user in users_list:    
            users_movies_list.extend(list(users_data[users_data['user_id'] == user]['mal_id']))
        users_movies_counts = list()
        users_movies_counts.extend([[movie, users_movies_list.count(movie)] for movie in np.unique(users_movies_list)])
        each_cluster_movies.append(pd.DataFrame(users_movies_counts, columns=['mal_id', 'Count']).sort_values(by = ['Count'], ascending = False).reset_index(drop=True))
    return each_cluster_movies
cluster_movies = clustersMovies(users_cluster, fav_movies)

## Get user favourite movie list + Fixing any small cluster within the dataset

In [56]:
def getMovies(user_id, users_data):
    return list(users_data[users_data['user_id'] == user_id]['mal_id'])

In [57]:
def fixClusters(clusters_movies_dataframes, users_cluster_dataframe, users_data, smallest_cluster_size = 11):
    # clusters_movies_dataframes: will be a list which will contain each dataframes of each cluster movies
    # users_cluster_dataframe: will be a dataframe which contain users IDs and their cluster no.
    # smallest_cluster_size: is a smallest cluster size which we want for a cluster to not remove
    each_cluster_movies = clusters_movies_dataframes.copy()
    users_cluster = users_cluster_dataframe.copy()
    # Let convert dataframe in each_cluster_movies to list with containing only movies IDs
    each_cluster_movies_list = [list(df['mal_id']) for df in each_cluster_movies]
    # First we will prepair a list which containt lists of users in each cluster -> [[Cluster 0 Users], [Cluster 1 Users], ... ,[Cluster N Users]] 
    usersInClusters = list()
    total_clusters = len(each_cluster_movies)
    for i in range(total_clusters):
        usersInClusters.append(list(users_cluster[users_cluster['Cluster'] == i]['user_id']))
    uncategorizedUsers = list()
    i = 0
    # Now we will remove small clusters and put their users into another list named "uncategorizedUsers"
    # Also when we will remove a cluster, then we have also bring back cluster numbers of users which comes after deleting cluster
    # E.g. if we have deleted cluster 4 then their will be users whose clusters will be 5,6,7,..,N. So, we'll bring back those users cluster number to 4,5,6,...,N-1.
    for j in range(total_clusters):
        if len(usersInClusters[i]) < smallest_cluster_size:
            uncategorizedUsers.extend(usersInClusters[i])
            usersInClusters.pop(i)
            each_cluster_movies.pop(i)
            each_cluster_movies_list.pop(i)
            users_cluster.loc[users_cluster['Cluster'] > i, 'Cluster'] -= 1
            i -= 1
        i += 1
    for user in uncategorizedUsers:
        elemProbability = list()
        user_movies = getMovies(user, users_data)
        if len(user_movies) == 0:
            print(user)
        user_missed_movies = list()
        for movies_list in each_cluster_movies_list:
            count = 0
            missed_movies = list()
            for movie in user_movies:
                if movie in movies_list:
                    count += 1
                else:
                    missed_movies.append(movie)
            elemProbability.append(count / len(user_movies))
            user_missed_movies.append(missed_movies)
        user_new_cluster = np.array(elemProbability).argmax()
        users_cluster.loc[users_cluster['user_id'] == user, 'Cluster'] = user_new_cluster
        if len(user_missed_movies[user_new_cluster]) > 0:
            each_cluster_movies[user_new_cluster] = each_cluster_movies[user_new_cluster].append([{'mal_id': new_movie, 'Count': 1} for new_movie in user_missed_movies[user_new_cluster]], ignore_index = True)
    return each_cluster_movies, users_cluster

In [58]:
movies_df_fixed, clusters_fixed = fixClusters(cluster_movies, users_cluster, fav_movies, smallest_cluster_size = 6)

## Save and load training data 

In [59]:
class saveLoadFiles:
    def save(self, filename, data):
        try:
            file = open('datasets/' + filename + '.pkl', 'wb')
            pickle.dump(data, file)
        except:
            err = 'Error: {0}, {1}'.format(exc_info()[0], exc_info()[1])
            print(err)
            file.close()
            return [False, err]
        else:
            file.close()
            return [True]
    def load(self, filename):
        try:
            file = open('datasets/' + filename + '.pkl', 'rb')
        except:
            err = 'Error: {0}, {1}'.format(exc_info()[0], exc_info()[1])
            print(err)
            file.close()
            return [False, err]
        else:
            data = pickle.load(file)
            file.close()
            return data
    def loadClusterMoviesDataset(self):
        return self.load('clusters_movies_dataset')
    def saveClusterMoviesDataset(self, data):
        return self.save('clusters_movies_dataset', data)
    def loadUsersClusters(self):
        return self.load('users_clusters')
    def saveUsersClusters(self, data):
        return self.save('users_clusters', data)

## Creating a class function for genre recommendatio based on on user history 

In [60]:
class userRequestedFor:
    def __init__(self, user_id, users_data):
        self.users_data = users_data.copy()
        self.user_id = user_id
        # Find User Cluster
        users_cluster = saveLoadFiles().loadUsersClusters()
        self.user_cluster = int(users_cluster[users_cluster['user_id'] == self.user_id]['Cluster'])
        # Load User Cluster Movies Dataframe
        self.movies_list = saveLoadFiles().loadClusterMoviesDataset()
        self.cluster_movies = self.movies_list[self.user_cluster] # dataframe
        self.cluster_movies_list = list(self.cluster_movies['mal_id']) # list
#     def updatedFavouriteMoviesList(self, new_movie_Id):
#         if new_movie_Id in self.cluster_movies_list:
#             self.cluster_movies.loc[self.cluster_movies['mal_id'] == new_movie_Id, 'Count'] += 1
#         else:
#             self.cluster_movies = self.cluster_movies.append([{'mal_id':new_movie_Id, 'Count': 1}], ignore_index=True)
#         self.cluster_movies.sort_values(by = ['Count'], ascending = False, inplace= True)
#         self.movies_list[self.user_cluster] = self.cluster_movies
#         saveLoadFiles().saveClusterMoviesDataset(self.movies_list)

    def recommendGenre(self):
        try:
            user_movies = getMovies(self.user_id, self.users_data)
            cluster_movies_list = self.cluster_movies_list.copy()
            for user_movie in user_movies:
                if user_movie in cluster_movies_list:
                    cluster_movies_list.remove(user_movie)
            return [True, cluster_movies_list]
        except KeyError:
            err = "User history does not exist"
            print(err)
            return [False, err]
        except:
            err = 'Error: {0}, {1}'.format(exc_info()[0], exc_info()[1])
            print(err)
            return [False, err]

## Merging two datasets based on 'mal_id'

In [61]:
animes_df = pd.read_csv('./datasets/anime_data.csv', usecols=['mal_id', 'title', 'genres'])
animes_df.head(3)

Unnamed: 0,mal_id,genres,title
0,1,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",Cowboy Bebop
1,100,"['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...",Shin Shirayuki-hime Densetsu Prétear
2,1000,"['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...",Uchuu Kaizoku Captain Herlock


In [62]:
df = fav_movies.merge(animes_df, on = 'mal_id')
df

Unnamed: 0,user_id,mal_id,rating,genres,title
0,1,29978,6.0,['Comedy'],001
1,36,29978,5.0,['Comedy'],001
2,70,29978,5.0,['Comedy'],001
3,1,2467,10.0,['Adventure'],3 Choume no Tama: Uchi no Tama Shirimasen ka?
4,1,28789,6.0,"['Music', 'Drama', 'Seinen']",3-gatsu no Lion meets Bump of Chicken
...,...,...,...,...,...
30311,96,32868,4.0,['Music'],Spring Stranger
30312,96,19945,5.0,['Comedy'],Zoku Natsume Yuujinchou: 3D Nyanko-sensei Gekijou
30313,98,22835,7.0,"['Slice of Life', 'Comedy', 'Ecchi', 'School']",Himegoto
30314,100,21595,7.0,"['Action', 'Fantasy', 'Game', 'Shounen']",Yu☆Gi☆Oh! Zexal Second: Iza! Saishuu Kessen e!...


## Get a list movies based on user_id and genre

In [63]:
title = list(df.loc[df['user_id'] == 55]['title'])
if title != []:
    print('Movie title: ', title, ', Genres: [ ', end = '')
    genres = ast.literal_eval(df.loc[df['user_id'] == 55]['genres'].values[0].split('[')[1].split(']')[0])
    for genre in genres:
        print(genre[:20], ', ', end = '')
    print(']', end = '')
    print('')

Movie title:  ['Boku dake ga Inai Machi', 'Boku no Hero Academia', 'Bungou Stray Dogs', 'Bungou Stray Dogs 2nd Season', 'Bungou Stray Dogs 3rd Season', 'Bungou Stray Dogs: Dead Apple', 'Bungou Stray Dogs: Hitori Ayumu', 'Byousoku 5 Centimeter', 'Code Geass: Hangyaku no Lelouch', 'Cross Road', 'Death Note', 'Death Parade', 'Howl no Ugoku Shiro', 'Kaguya-sama wa Kokurasetai: Tensai-tachi no Renai Zunousen', 'Kimi no Na wa.', 'Koe no Katachi', 'Kotonoha no Niwa', 'Kyoukai no Kanata', "Kyoukai no Kanata Movie 2: I'll Be Here - Mirai-hen", 'Naruto', 'Naruto: Shippuuden', 'One Punch Man', 'Re:Zero kara Hajimeru Isekai Seikatsu', 'Sen to Chihiro no Kamikakushi', 'Shelter', 'Shigatsu wa Kimi no Uso', 'Shingeki no Kyojin', 'Shingeki no Kyojin OVA', 'Shingeki no Kyojin Season 2', 'Shingeki no Kyojin Season 3', 'Shingeki no Kyojin Season 3 Part 2', 'Shingeki no Kyojin: Kuinaki Sentaku', 'Tokyo Ghoul', 'Violet Evergarden', 'Violet Evergarden: Kitto "Ai" wo Shiru Hi ga Kuru no Darou', 'Yakusoku no 

## Genre Recommendation system based on user_id

In [68]:
user12Recommendations = userRequestedFor(20, fav_movies).recommendGenre()
for movie in user12Recommendations:
    title = list(df.loc[df['user_id'] == 20]['title'])
    if title != []:
        genres = ast.literal_eval(df.loc[df['user_id'] == 20]['genres'].values[0].split('[')[1].split(']')[0])
        for genre in genres:
            print(genre[:20],  ', ', end = '')
        print(']', end = '')
        print() 

Drama , Romance , Slice of Life , ]
Drama , Romance , Slice of Life , ]


## Top movies

In [65]:
anime_rec = df['title'].values

anime_rec_list = []
for rec in anime_rec:
    if rec not in anime_rec_list:
        anime_rec_list.append(rec)

In [66]:
anime_rec_list[:10]

['001',
 '3 Choume no Tama: Uchi no Tama Shirimasen ka?',
 '3-gatsu no Lion meets Bump of Chicken',
 'Aho Girl',
 'Air',
 'Air Movie',
 'Air Recap',
 'Air in Summer',
 'Akachan to Boku',
 'All Alone With You']