-
Notifications
You must be signed in to change notification settings - Fork 0
/
PseudoCode.txt
450 lines (364 loc) · 12.2 KB
/
PseudoCode.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
*************************************************************************
GlobalAverage
--------------
input : ratings files
Mapper
--------
let map(k,v) =
foreach line in stdin:
data = split the line on "::" to get(userid, movieid, rating)
emit("GAvg: " + rating)
Output:"GAvg: " + rating
Reducer
---------
current_key = null
count = 0
sum = 0
let reduce(k,vals) =
foreach line in stdin:
Split line into a list to get(key and val)
if current_key != null and current_key != key:
avg = sum/count
emit(current_key + " " + avg)
sum = 0
count = 0
// always accumulate the current record (including the first record of a new key)
count = count+1
sum = sum + val
current_key = key
global_avg = sum/count
emit("GAvg: " + global_avg)
Output:"GAvg: " + global_avg
*************************************************************************
*************************************************************************
UserAverage
--------------
input : ratings files
Mapper
--------
dictionary userRatings = new dictionary()
let map(k,v) =
foreach line in stdin:
data = split the line on "::" to get(userid, movieid, rating)
addToDictionary(userid, rating)
emitUserRatings()
let addToDictionary(userid, rating):
if !(userid in dictionary userRatings):
create a list and add rating to it.
add this list to dictionary userRatings for that userid
else:
append the rating for the userid to the existing list value.
let emitUserRatings():
foreach userid in dictionary userRatings:
ratingStr = create a space-separated string of ratings for that userid
emit("URat " + userid + " " + ratingStr)
Output:"URat " + userid + " " + ratingStr
Reducer
---------
current_key = null
count = 0
sum = 0
highestUser = 0
let reduce(k,vals) =
foreach line in stdin:
data = split line into a list on the first two spaces only // (tag, userid, ratings)
userid = data[1]
ratings = data[2] // this is a space-separated string of ratings for that user
key = userid
if current_key != null and current_key != key:
avg = sum/count
emit(current_key + " " + avg)
sum = 0
count = 0
if userid > highestUser:
highestUser = userid
ratingsLst = split ratings into a list.
foreach rating in ratingsLst:
count = count + 1
sum = sum + rating
current_key = key
user_avg = sum/count
emit(current_key + " " + user_avg)
emit("HighestUser " + highestUser)
Output: userid1 userid1_avg
userid2 userid2_avg
...
HighestUser highestUser
*************************************************************************
*************************************************************************
MovieAverage
--------------
input : ratings files
Mapper
--------
dictionary movieRatings = new dictionary()
let map(k,v) =
foreach line in stdin:
data = split the line on "::" to get(userid, movieid, rating)
addToDictionary(movieid, rating)
emitMovieRatings()
let addToDictionary(movieid, rating):
if !(movieid in dictionary movieRatings):
create a list and add rating to it.
add this list to dictionary movieRatings for that movieid
else:
append the rating for the movieid to the existing list value.
let emitMovieRatings():
foreach movieid in dictionary movieRatings:
ratingStr = create a space-separated string of ratings for that movieid
emit("MRat " + movieid + " " + ratingStr)
Output:"MRat " + movieid + " " + ratingStr
Reducer
---------
current_key = null
count = 0
sum = 0
highestMovie = 0
let reduce(k,vals) =
foreach line in stdin:
data = split line into a list on the first two spaces only // (tag, movieid, ratings)
movieid = data[1]
ratings = data[2] // this is a space-separated string of ratings for that movie
key = movieid
if current_key != null and current_key != key:
avg = sum/count
emit(current_key + " " + avg)
sum = 0
count = 0
if movieid > highestMovie:
highestMovie = movieid
ratingsLst = split ratings into a list.
foreach rating in ratingsLst:
count = count + 1
sum = sum + rating
current_key = key
movie_avg = sum/count
emit(current_key + " " + movie_avg)
emit("HighestMovie " + highestMovie)
Output: movieid1 movieid1_avg
movieid2 movieid2_avg
...
HighestMovie highestMovie
*************************************************************************
*************************************************************************
Initial UV Matrix
------------------
input: a) ratings file
b) globalAverage file
c) userAverage file
d) movieAverage file
Mapper
-----------
dictionary userAvgRatings = new dictionary()
dictionary movieAvgRatings = new dictionary()
let initializeUserAvgRatings() =
open(userRatings_Avg.txt) file
foreach line in file:
Split line into data (userid, and userAvgRating)
if userid != "HighestUser":
userAvgRatings[userid] = userAvgRating
let initializeMovieAvgRatings() =
open(movieRatings_Avg.txt) file
foreach line in file:
Split line into data (movieid, and movieAvgRating)
if movieid != "HighestMovie":
movieAvgRatings[movieid] = movieAvgRating
let map(k,v) =
initializeUserAvgRatings()
initializeMovieAvgRatings()
foreach line in stdin:
data = split the line on "::" to get(userid, movieid, rating)
normRating = rating - (0.5*userAvgRatings[userid]) - (0.5*movieAvgRatings[movieid])
emit("NV "+ userid + " " + movieid + " " + normRating)
Output: "NV "+ userid + " " + movieid + " " + normRating
Reducer
-----------
latentFactors = 5 # latent factors
alpha = 0.040 #learning rate
lambda_val = 0.3 # regularization factor
users = 0
movies = 0
let getHighestUserCount():
open(userRatings_Avg.txt) file
for line in userAvgFile:
read the HighestUser value and store it in users
let getHighestMovieCount():
open(movieRatings_Avg.txt) file
for line in movieAvgFile:
read the HighestMovie value and store it in movies
let emitUMatrix():
count = 0
foreach uMatRow in UMat:
urowVals = create a space-separated string of the latent-factor values in uMatRow
emit("U " + count + " " + latentFactors + " " + urowVals)
count += 1
let emitVMatrix():
count = 0
foreach vMatRow in VMat:
vrowVals = create a space-separated string of the latent-factor values in vMatRow
emit("V " + count + " " + latentFactors + " " + vrowVals)
count += 1
let reduce(k,vals)=
getHighestUserCount()
getHighestMovieCount()
UMat = initialize with random values
VMat = initialize with random values
foreach line in stdin:
Split the line into (userid, movieid, normRating)
error = normRating - (dot product of UMat[userid] and VMat[movieid])
foreach latentFactor:
Update UMat using UMat_current_Val, alpha, error, lambda and VMat
Update VMat using VMat_current_Val, alpha, error, lambda and UMat
emitUMatrix()
emitVMatrix()
Output: "U " + rowNum + " " + TotalCols + " " + urowVals
...
...
"V " + rowNum + " " + TotalCols + " " + vrowVals
...
...
*************************************************************************
*************************************************************************
Updating U and V Matrices
----------------------------
input: a) ratings file
b) globalAverage file
c) userAverage file
d) movieAverage file
e) previous UVMat file
Mapper
--------
dictionary userAvgRatings = new dictionary()
dictionary movieAvgRatings = new dictionary()
let initializeUserAvgRatings() =
open(userRatings_Avg.txt) file
foreach line in file:
Split line into data (userid, and userAvgRating)
if userid != "HighestUser":
userAvgRatings[userid] = userAvgRating
let initializeMovieAvgRatings() =
open(movieRatings_Avg.txt) file
foreach line in file:
Split line into data (movieid, and movieAvgRating)
if movieid != "HighestMovie":
movieAvgRatings[movieid] = movieAvgRating
let map(k,v) =
initializeUserAvgRatings()
initializeMovieAvgRatings()
foreach line in stdin:
data = split the line on "::" to get(userid, movieid, rating)
normRating = rating - (0.5*userAvgRatings[userid]) - (0.5*movieAvgRatings[movieid])
emit("NV "+ userid + " " + movieid + " " + normRating)
Output: "NV "+ userid + " " + movieid + " " + normRating
Reducer
--------
latentFactors = 5 # latent factors
alpha = 0.040 #learning rate
lambda_val = 0.3 # regularization factor
users = 0
movies = 0
let emitUMatrix():
count = 0
foreach uMatRow in UMat:
urowVals = create a space-separated string of the latent-factor values in uMatRow
emit("U " + count + " " + latentFactors + " " + urowVals)
count += 1
let emitVMatrix():
count = 0
foreach vMatRow in VMat:
vrowVals = create a space-separated string of the latent-factor values in vMatRow
emit("V " + count + " " + latentFactors + " " + vrowVals)
count += 1
let getHighestUserCount():
open(userRatings_Avg.txt) file
for line in userAvgFile:
read the HighestUser value and store it in users
let getHighestMovieCount():
open(movieRatings_Avg.txt) file
for line in movieAvgFile:
read the HighestMovie value and store it in movies
let reduce(k,vals)=
users = getHighestUserCount()
movies = getHighestMovieCount()
Read the UVMat.txt file and initialize the U and V matrices with values from the file.
foreach line in stdin:
Split the line into (userid, movieid, normRating)
error = normRating - (dot product of UMat[userid] and VMat[movieid])
// Gradient Descent formula to update U and V Matrices
foreach latentFactor:
Update UMat using UMat_current_Val, alpha, error, lambda and VMat
Update VMat using VMat_current_Val, alpha, error, lambda and UMat
emitUMatrix()
emitVMatrix()
Output: "U " + rowNum + " " + TotalCols + " " + urowVals
...
...
"V " + rowNum + " " + TotalCols + " " + vrowVals
...
...
*************************************************************************
*************************************************************************
Calculate RMSE
----------------
input: a) ratings file
b) globalAverage file
c) userAverage file
d) movieAverage file
e) final UVMat file
Mapper
-------
dictionary userAvgRatings = new dictionary()
dictionary movieAvgRatings = new dictionary()
let initializeGlobalAvgRatings() =
open(globalAvg.txt) file
foreach line in file:
Split line into data (tag, and globalAvgRating)
set the globalAvgRating
let initializeUserAvgRatings() =
open(userRatings_Avg.txt) file
foreach line in file:
Split line into data (userid, and userAvgRating)
if userid != "HighestUser":
userAvgRatings[userid] = userAvgRating
if userid == "HighestUser":
users = userAvgRating
let initializeMovieAvgRatings() =
open(movieRatings_Avg.txt) file
foreach line in file:
Split line into data (movieid, and movieAvgRating)
if movieid != "HighestMovie":
movieAvgRatings[movieid] = movieAvgRating
if movieid == "HighestMovie":
movies = movieAvgRating
let map(k,v) =
globalAvgRating = initializeGlobalAvgRatings()
users = initializeUserAvgRatings()
movies = initializeMovieAvgRatings()
Read the UVMat.txt file and initialize the U and V matrices with values from the file.
foreach line in stdin:
data = split the line on "::" to get(userid, movieid, rating)
if userid not in the userAvgRatings
userAvgRatings[userid] = globalAvgRating
if movieid not in movieAvgRatings
movieAvgRatings[movieid] = globalAvgRating
estimated_norm_rating = 0
UMatRowCnt = get row count for U matrix
VMatRowCnt = get row count for V matrix
if userid < UMatRowCnt and movieid < VMatRowCnt:
estimated_norm_rating = dot product of UMat[userid] and VMat[movieid]
unnormalizedRating = estimated_norm_rating + (0.5*userAvgRatings[userid]) + (0.5*movieAvgRatings[movieid])
emit("RMSE " + userid + " " + movieid + " " + rating + " " + unnormalizedRating)
Output: "RMSE " + userid + " " + movieid + " " + rating + " " + unnormalizedRating
Reducer
--------
let reduce(k,vals)=
unnormalizedSSE = 0
nCount = 0
foreach line in stdin:
Split the line and get (tag, userid, movieid, rating and unnormalizedRating)
Compute the error_unnormalized = rating - unnormalizedRating
//Compute SSE
unnormalizedSSE = unnormalizedSSE + error_unnormalized * error_unnormalized
nCount++
//Compute RMSE
unnormalizedRMSE = sqrt(unnormalizedSSE/nCount)
emit("unnormalizedRMSE " + unnormalizedRMSE)
Output: "unnormalizedRMSE " + unnormalizedRMSE
*************************************************************************