In [1]:
#All the libraries files required for the code
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
import random

In [2]:
#Importing both the file using pandas 
books = pd.read_csv('books.csv')
rating = pd.read_csv('ratings.csv')

In [3]:
books.head(2)

Unnamed: 0,booksId,title,genres
0,1,Toy Story (1995),Adventure|Cartoon|Children|Fictional|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
rating.head(2)

Unnamed: 0,userId,booksId,rating,ISBN
0,83438,497,5.0,1081992433
1,83438,500,3.5,1081992600


In [5]:
#Deleting unnecessary columns

rating = rating.drop(['ISBN'],axis = 1)

In [6]:
rating.head()

Unnamed: 0,userId,booksId,rating
0,83438,497,5.0
1,83438,500,3.5
2,83438,534,4.5
3,83438,543,5.0
4,83438,587,3.5


In [7]:
#Merging both the dataframes
data = pd.merge(rating , books , how='outer', on='booksId')

In [8]:
data.head()

Unnamed: 0,userId,booksId,rating,title,genres
0,83438.0,497,5.0,Much Ado About Nothing (1993),Fictional|Romance
1,83446.0,497,5.0,Much Ado About Nothing (1993),Fictional|Romance
2,83457.0,497,4.0,Much Ado About Nothing (1993),Fictional|Romance
3,83466.0,497,3.0,Much Ado About Nothing (1993),Fictional|Romance
4,83479.0,497,5.0,Much Ado About Nothing (1993),Fictional|Romance


In [9]:
# Data Processing
# Converting Genres into different columns 
# Here we just create columns and put there initial value as 0
x = data.genres
a = list()
for i in x:
    abc = i
    a.append(abc.split('|'))
a = pd.DataFrame(a)   
b = a[0].unique()
for i in b:
    data[i] = 0
data.head(2)

Unnamed: 0,userId,booksId,rating,title,genres,Fictional,Drama,Adventure,Crime,Action,...,Horror,Children,Western,Documentary,Sci-Fi,Romance,Book-Noir,War,(no genres listed),IMAX
0,83438.0,497,5.0,Much Ado About Nothing (1993),Fictional|Romance,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,83446.0,497,5.0,Much Ado About Nothing (1993),Fictional|Romance,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# we assign 1 to all the columns which are present in the Genres
for i in b:
    data.loc[data['genres'].str.contains(i), i] = 1

  data.loc[data['genres'].str.contains(i), i] = 1


In [11]:
data.head(2)

Unnamed: 0,userId,booksId,rating,title,genres,Fictional,Drama,Adventure,Crime,Action,...,Horror,Children,Western,Documentary,Sci-Fi,Romance,Book-Noir,War,(no genres listed),IMAX
0,83438.0,497,5.0,Much Ado About Nothing (1993),Fictional|Romance,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,83446.0,497,5.0,Much Ado About Nothing (1993),Fictional|Romance,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [12]:
# Now there is no use of genre 
# Since we have movie id so there is no need for movie names as well
data = data.drop(['genres','title'],axis =1)
data.head()

Unnamed: 0,userId,booksId,rating,Fictional,Drama,Adventure,Crime,Action,Cartoon,Fantasy,...,Horror,Children,Western,Documentary,Sci-Fi,Romance,Book-Noir,War,(no genres listed),IMAX
0,83438.0,497,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,83446.0,497,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,83457.0,497,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,83466.0,497,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,83479.0,497,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [13]:
data.columns

Index(['userId', 'booksId', 'rating', 'Fictional', 'Drama', 'Adventure',
       'Crime', 'Action', 'Cartoon', 'Fantasy', 'Thriller', 'Mystery',
       'Musical', 'Horror', 'Children', 'Western', 'Documentary', 'Sci-Fi',
       'Romance', 'Book-Noir', 'War', '(no genres listed)', 'IMAX'],
      dtype='object')

In [14]:
# Because of merging some null values are created
data.isnull().sum()

userId                37377
booksId                   0
rating                37377
Fictional                 0
Drama                     0
Adventure                 0
Crime                     0
Action                    0
Cartoon                   0
Fantasy                   0
Thriller                  0
Mystery                   0
Musical                   0
Horror                    0
Children                  0
Western                   0
Documentary               0
Sci-Fi                    0
Romance                   0
Book-Noir                 0
War                       0
(no genres listed)        0
IMAX                      0
dtype: int64

In [15]:
#We simply drop the null values as they are not treatable
data.dropna(inplace= True )

In [16]:
data.isnull().sum()

userId                0
booksId               0
rating                0
Fictional             0
Drama                 0
Adventure             0
Crime                 0
Action                0
Cartoon               0
Fantasy               0
Thriller              0
Mystery               0
Musical               0
Horror                0
Children              0
Western               0
Documentary           0
Sci-Fi                0
Romance               0
Book-Noir             0
War                   0
(no genres listed)    0
IMAX                  0
dtype: int64

In [17]:
#assumed that the value of the cluster as 8 for better results.
kmeanModel = KMeans(n_clusters=8)
kmeanModel.fit(data)

KMeans()

In [18]:
# Creating an extra column in data for storing the cluster values
data['Cluster'] = kmeanModel.labels_
data['Cluster'].sample(n=10)

814725     6
407025     6
90040      0
544242     5
182147     6
449441     6
1006563    6
124897     6
669619     0
433055     6
Name: Cluster, dtype: int32

In [19]:
data['Cluster'].value_counts()

6    416903
0    416324
5     60234
1     42358
7     40745
3     34643
4     24653
2     12715
Name: Cluster, dtype: int64

In [20]:
data.head()

Unnamed: 0,userId,booksId,rating,Fictional,Drama,Adventure,Crime,Action,Cartoon,Fantasy,...,Children,Western,Documentary,Sci-Fi,Romance,Book-Noir,War,(no genres listed),IMAX,Cluster
0,83438.0,497,5.0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,83446.0,497,5.0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,83457.0,497,4.0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,83466.0,497,3.0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,83479.0,497,5.0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [21]:
# When we merge the dataframe for a single book multiple rows were created so a single book is allotted
# too many clusters so here we allot a single cluster to a book
# the Cluster which occurs maximum number of times is alloted to the book 
e = []
def fi(group):
    a = pd.DataFrame(group)
    b = pd.DataFrame(a['Cluster'].value_counts())
    d = a.index 
    c = [a['booksId'][d[0]],int(b.idxmax())]
    e.append(c)

In [22]:
data.groupby("booksId").apply(lambda x: fi(x))

In [23]:
e = pd.DataFrame(e)

In [24]:
e.head()

Unnamed: 0,0,1
0,1,6
1,2,6
2,3,0
3,4,6
4,5,0


In [25]:
# Here just the column names are swapped
e.rename(columns = {0:'booksId',1:'Cluster'},inplace=True)
e.drop_duplicates(inplace=True)

In [26]:
e.head(10)

Unnamed: 0,booksId,Cluster
0,1,6
1,2,6
2,3,0
3,4,6
4,5,0
5,6,0
6,7,0
7,8,6
8,9,6
9,10,0


In [27]:
books = pd.read_csv('books.csv')
new_data = pd.merge(e , books , how='outer', on='booksId')

In [28]:
# restoring the books that were deleted while merging the file  
new_data.isnull().sum()

booksId        0
Cluster    37377
title          0
genres         0
dtype: int64

In [29]:
# labelling the books randomly
new_data.fillna(random.randint(0,8),inplace=True)

In [30]:
new_data.isnull().sum()

booksId    0
Cluster    0
title      0
genres     0
dtype: int64

In [31]:
#This function select the cluster for a user according the the user choice
def select_c():
    global l
    print('The recommended books are as:')
    l=[]
    for i in range(15):
        l.append(random.randint(0,3883))
    for i in l:
        print(new_data['booksId'][i] , new_data['title'][i],sep='--->')
    print('--------------------------------------------------------------------')
    l = int(input())
    l = new_data['Cluster'][new_data.booksId == l]

In [32]:
# This is the main function which recommend you books.
def recommend_books():
    ans = False
    while not ans:
        select_c()
        print(new_data['title'][new_data.Cluster == int(l)].sample(n=10))
        print('--------------------------------------------------------------------')
        print('Do you like these books(y/n)')
        abc = input()
        while ((abc =='y') or (abc == 'Y')):          
            print(new_data['title'][new_data.Cluster == int(l)].sample(n=10))
            print('--------------------------------------------------------------------')
            print('Do you want to select more boooks? (y/n)')
            abc = input()
            if ((abc =='N') or (abc == 'n')):
                ans =True

In [None]:
recommend_books()

The recommended books are as:
501--->Naked (1993)
415--->Another Stakeout (1993)
230--->Dolores Claiborne (1995)
2863--->Hard Day's Night, A (1964)
3003--->Train of Life (Train de vie) (1998)
3847--->Ilsa, She Wolf of the SS (1974)
200--->Tie That Binds, The (1995)
949--->East of Eden (1955)
3072--->Moonstruck (1987)
3232--->Seven Chances (1925)
2227--->Lodger: A Story of the London Fog, The (1927)
806--->American Buffalo (1996)
2935--->Lady Eve, The (1941)
2878--->Hell Night (1981)
3076--->Irma la Douce (1963)
--------------------------------------------------------------------
3003
1193                   Diva (1981)
3393               Dinosaur (2000)
5276    Ernest Goes to Jail (1990)
5163              Zebrahead (1992)
7782    Alexander the Great (1956)
5399            Carbon Copy (1981)
257               Enfer, L' (1994)
876            Citizen Kane (1941)
7437                 Riders (2002)
767           She's the One (1996)
Name: title, dtype: object
--------------------------------