Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

adding pagerank implementation with convergence criteria

  • Loading branch information...
commit 54c77c1d42e33913bcada22a6561fb36e05df3ee 1 parent 9ea5e42
@julienledem authored
Showing with 58 additions and 0 deletions.
  1. +58 −0 Page Rank/pagerank2.py
View
58 Page Rank/pagerank2.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python
+from org.apache.pig.scripting import *
+
+P = Pig.compile("""
+previous_pagerank =
+ LOAD '$docs_in'
+ AS ( url: chararray, pagerank: float, links:{ link: ( url: chararray ) } );
+
+outbound_pagerank =
+ FOREACH previous_pagerank
+ GENERATE
+ pagerank / COUNT ( links ) AS pagerank,
+ FLATTEN ( links ) AS to_url;
+
+new_pagerank =
+ FOREACH
+ ( COGROUP outbound_pagerank BY to_url, previous_pagerank BY url INNER )
+ GENERATE
+ group AS url,
+ ( 1 - $d ) + $d * SUM ( outbound_pagerank.pagerank ) AS pagerank,
+ FLATTEN ( previous_pagerank.links ) AS links,
+ FLATTEN ( previous_pagerank.pagerank ) AS previous_pagerank;
+
+pagerank_diff = FOREACH new_pagerank GENERATE ABS ( previous_pagerank - pagerank );
+
+max_diff =
+ FOREACH
+ ( GROUP pagerank_diff ALL )
+ GENERATE
+ MAX ( pagerank_diff );
+
+STORE new_pagerank
+ INTO '$docs_out';
+
+STORE max_diff
+ INTO '$max_diff';
+
+""")
+
+d = 0.5
+docs_in= "data/pagerank_data_simple"
+
+for i in range(10):
+ docs_out = "out/pagerank_data_" + str(i + 1)
+ max_diff = "out/max_diff_" + str(i + 1)
+ Pig.fs("rmr " + docs_out)
+ Pig.fs("rmr " + max_diff)
+ stats = P.bind().runSingle()
+ if not stats.isSuccessful():
+ raise 'failed'
+ max_diff_value = float(str(stats.result("max_diff").iterator().next().get(0)))
+ print " max_diff_value = " + str(max_diff_value)
+ if max_diff_value < 0.01:
+ print "done at iteration " + str(i)
+ break
+ docs_in = docs_out
+
+
Please sign in to comment.
Something went wrong with that request. Please try again.