In [59]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
from sklearn.model_selection import train_test_split
%matplotlib inline
import math

In [60]:
print("Start reading data frm csv")
anime = pd.read_csv('../input/anime.csv')
rating = pd.read_csv('../input/rating.csv')
print('anime.csv shape', anime.shape)
print('rating.csv shape', rating.shape)
print(anime[:1])
print(rating[:1])

Start reading data frm csv
anime.csv shape (12294, 7)
rating.csv shape (7813737, 3)
   anime_id            name   ...   rating members
0     32281  Kimi no Na wa.   ...     9.37  200630

[1 rows x 7 columns]
   user_id  anime_id  rating
0        1        20      -1


In [61]:
rating = rating[rating.rating != -1]
print('After removing -1 entries')
print('rating.shape',rating.shape)
print(rating.head())

After removing -1 entries
rating.shape (6337241, 3)
     user_id  anime_id  rating
47         1      8074      10
81         1     11617      10
83         1     11757      10
101        1     15451      10
153        2     11771      10


In [62]:
limit_user_id = 1000
limit_anime_id = int(limit_user_id*2)
rating = rating[rating.user_id<=limit_user_id ]
rating = rating[rating.anime_id<=limit_anime_id]

print('after limiting users to id<=', limit_user_id, 'animes to id<=', limit_anime_id)
print(rating.shape)
print(rating.head())

train = rating.sample(frac=0.75,random_state=5)
test = rating.drop(train.index)

print('train.shape',train.shape)
print(train.head())

print('test.shape',test.shape)
print(test.head())

after limiting users to id<= 1000 animes to id<= 2000
(17984, 3)
     user_id  anime_id  rating
156        3        20       8
157        3       154       6
158        3       170       9
159        3       199      10
160        3       225       9
train.shape (13488, 3)
       user_id  anime_id  rating
37766      417       527      10
20116      248      1559       9
71149      744       877       7
18740      234       813       7
94224      968      1575       8
test.shape (4496, 3)
     user_id  anime_id  rating
159        3       199      10
161        3       341       6
166        3      1119       7
168        3      1122       7
172        3      1526       7


In [63]:
u_users = rating.groupby(['user_id'], as_index=False).count()
u_users = len(u_users)
unique_users = limit_user_id

u_animes = rating.groupby(['anime_id'], as_index=False).count()
u_animes = len(u_animes)
unique_animes = limit_anime_id#len(unique_animes)

print('The chunked dataset\'s unique_users',u_users, 'unique_animes',u_animes)
print('When limiting users to id<=', limit_user_id, 'animes to id<=', limit_anime_id)
unique_users += 1
unique_animes += 1


The chunked dataset's unique_users 851 unique_animes 1232
When limiting users to id<= 1000 animes to id<= 2000


In [64]:
from sklearn.metrics.pairwise import cosine_similarity
X_input_to_cosine_sim = np.zeros((unique_users,unique_animes))
for idx,row in train.iterrows():
    u1 = row['user_id']
    i1 = row['anime_id']
    r = row['rating']
    X_input_to_cosine_sim[u1][i1] = r
mat_T = X_input_to_cosine_sim.T
user_similarity = cosine_similarity(mat_T)
#item_similarity = cosine_similarity(X_input_to_cosine_sim.T)

In [65]:
def GetData(X,Y):
  users_data = {}
  movies_data = {}
  first = 0
  for i in range(len(X)):
    #print(row)
    first += 1
    if(first>1):
        userId = X[i][0]
        movieId = X[i][1]
        rating = Y[i]
        if userId not in users_data:
            users_data[userId] = {}
            users_data[userId][movieId] = rating
        else:
            users_data[userId][movieId] = rating
        if movieId not in movies_data:
            movies_data[movieId]= {}
            movies_data[movieId][userId] = rating
        else:
            movies_data[movieId][userId] = rating
  return users_data,movies_data

In [66]:
y = np.array(train)[:,-1]
X = np.array(train)[:,:-1]
y = list(y)
users_train_data,movies_train_data = GetData(X,y)

In [67]:
unique_movies = unique_animes
def GetCosineSimilarity(u ,v):
    if u not in users_train_data or v not in users_train_data:
        return 0
    mod_u = 0
    mod_v = 0
    pdt = 0
    for i in range(0, unique_movies):
        if i not in users_train_data[u]:
            r_ui = 0
        else:
            r_ui = users_train_data[u][i]
        if i not in users_train_data[v]:
            r_vi = 0
        else:
            r_vi = users_train_data[v][i]
        mod_u += r_ui*r_ui
        mod_v += r_vi*r_vi
        pdt += r_ui*r_vi
    
    if(pdt == 0):
        return pdt
    return pdt/(math.sqrt(mod_u)*math.sqrt(mod_v))


In [68]:
user_avg_rating = np.zeros((unique_users,1))
for i in range(unique_users):
    if i not in users_train_data:
        user_avg_rating[i]=0
        continue
    user_avg_rating[i] = np.mean(list(users_train_data[i].values()))
print('user_avg_rating.shape',user_avg_rating.shape)    
#print(len(cosine_similarity_arr))

user_avg_rating.shape (1001, 1)


In [69]:
def GetPredictions(similarity_array):
    predictions = np.zeros((unique_users, unique_animes))

    k = 0.05
    for c in range(unique_users):
        if c not in users_train_data:
            continue
            
        for s in range(unique_animes):
            if s not in movies_train_data:
                continue
                
            r_cs = 0
            r_c_bar = user_avg_rating[c][0]
            r_cs += r_c_bar
            pdt = 0
            
            for c_dash in movies_train_data[s]:
                if c_dash not in users_train_data:
                    continue
                   
                if c_dash>=unique_users:
                    continue
                    
                #print(c,c_dash)
                
                r_c_dash_bar = user_avg_rating[c_dash][0]
                if(c_dash != c):
                    r_c_dash_s = movies_train_data[s][c_dash]
                    sim_coeff = similarity_array[c][c_dash]
                    pdt += sim_coeff * (r_c_dash_s - r_c_dash_bar)
            pdt = pdt * k 
            r_cs += pdt
            predictions[c][s] = r_cs
    return predictions

In [70]:
cosine_ratings_predictions = GetPredictions(user_similarity)
print('Ratings Predictions:')
print('shape:',cosine_ratings_predictions.shape)
#print(cosine_ratings_predictions)

Ratings Predictions:
shape: (1001, 2001)


In [71]:
y_test_data = np.array(test)[:,-1]
X_test_data = np.array(test)[:,:-1]
y_test_data = list(y_test_data)

users_test_data, movies_test_data = GetData(X_test_data, y_test_data)


In [72]:
def GetErrors(predicted_data):
    MAE = 0
    count = 0
    for u in range(unique_users):
        for i in range(unique_movies):
            if u not in users_test_data:
                continue
            #print(u,i)
            if i in users_test_data[u]:
                actual_y = users_test_data[u][i]
                predicted_y = predicted_data[u][i]
                MAE += np.abs(predicted_y - actual_y)
                count += 1
    MAE = MAE/(count)

    RMSE = 0
    count = 0
    for u in range(unique_users):
        for i in range(unique_movies):
            #test_predict = 0
            if u not in users_test_data:
                continue
            #print(u,i)
            if i in users_test_data[u]:
                actual_y = users_test_data[u][i]
                predicted_y = predicted_data[u][i]
                sqr_err = 0
                sqr_err = predicted_y - actual_y
                sqr_err = sqr_err*sqr_err
                RMSE += sqr_err
                count += 1
    RMSE = RMSE/(count)
    RMSE = np.sqrt(RMSE)
    return MAE,RMSE


In [73]:
print('For Cosine Similarity')
print('For user_ids<',unique_users,'and anime_ids<', unique_animes)
r,m = GetErrors(cosine_ratings_predictions)
print('With train data of shape:',train.shape)
print('and test data of shape:',test.shape)
print('MAE:', m)
print('RMSE:', r)

For Cosine Similarity
For user_ids< 1001 and anime_ids< 2001
With train data of shape: (13488, 3)
and test data of shape: (4496, 3)
MAE: 1.8749642901484516
RMSE: 1.2439472321432081


In [74]:
print(cosine_ratings_predictions)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         8.09706826 0.         ... 0.         0.         8.        ]
 [0.         7.         0.         ... 0.         0.         7.        ]
 [0.         8.63157895 0.         ... 0.         0.         8.63157895]]


In [77]:

cosine_similarity_arr = [ 0 for x in range(unique_users) ]
c = [cosine_similarity_arr for x in range(unique_users)]
cosine_similarity_arr = c

print('cos sim started')

import time
start = time.time()
for u in range(0, unique_users):
    print(u)
    for v in range(u+1, unique_users):
        cosine_similarity_arr[u][v] = GetCosineSimilarity(u,v)        
        cosine_similarity_arr[v][u] = cosine_similarity_arr[u][v]
end = time.time()
print('done in time:', end-start)

my_cosine_ratings_predictions = GetPredictions(cosine_similarity_arr)
print('My Cosine Similarity')
print('For user_ids<',unique_users,'and anime_ids<', unique_animes)
r,m = GetErrors(my_cosine_ratings_predictions)
print('With train data of shape:',train.shape)
print('and test data of shape:',test.shape)
print('MAE:', m)
print('RMSE:', r)
''''''

cos sim started
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
27

''

In [78]:
print(my_cosine_ratings_predictions)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         8.85032217 0.         ... 0.         0.         7.99519086]
 [0.         7.85032217 0.         ... 0.         0.         6.99519086]
 [0.         9.48190112 0.         ... 0.         0.         8.6267698 ]]
