
initial commit

0 parents commit b0369b9d493ba1edd144557d8b3d02422c8d44b8 @pforemski pforemski committed Nov 26, 2012
Showing with 257 additions and 0 deletions.
  1. +8 −0 .gitignore
  2. +55 −0 README.rst
  3. +11 −0 setup.py
  4. +75 −0 the making of/README.rst
  5. +31 −0 the making of/countwords.py
  6. +40 −0 the making of/mergedb.py
  7. +10 −0 the making of/parsewiki.py
  8. +25 −0 wikiwords/__init__.py
  9. +1 −0 wikiwords/freqdb.py
  10. +1 −0 wikiwords/occdb.py
@@ -0,0 +1,8 @@
+# Compiled python modules.
+*.pyc
+
+# Setuptools distribution folder.
+/dist/
+
+# Python egg metadata, regenerated from source files by setuptools.
+/*.egg-info
@@ -0,0 +1,55 @@
+Frequency of words in the English 2012 Wikipedia
+================================================
+
+This Python module can be used to quickly retrieve absolute word frequencies
+for the English language, as used in Wikipedia articles as of 2012.
+
+Author: Paweł Foremski <pjf@iitis.pl>, IITiS PAN www.iitis.pl
+
+Usage
+-----
+ >>> import wikiwords
+ >>> wikiwords.freq("monty")
+ 6.348454761413523e-06
+ >>> wikiwords.occ("python")
+ 18972
+ >>> wikiwords.freq("no such word", lambda x: 1./len(x))
+ 0.08333333333333333
+
+Installation
+------------
+ $ sudo pip install wikiwords
+
+Details
+-------
+
+The Wikipedia files were downloaded on 23.11.2012.
+
+The corpus was filtered:
+
+1. all words containing characters outside the Latin alphabet (A-Z) were removed
+2. in single files (see below), words with frequency < 5e-7 were removed
+3. in the final file (27 files merged), words with frequency < 5e-8 were removed
+4. all words shorter than 2 characters were removed (see below)
+
+The final corpus contains over 100,000 words with over 2 billion occurrences.
+
+For copyright information on data, see
+ http://en.wikipedia.org/wiki/Wikipedia:Copyrights
+
+The Python source code is licensed to you under the GNU GPL v3.
+
+See "the making of/" subdirectory for details on how the data was created.
+
+The single letter words
+-----------------------
+Single-letter words were removed by the gensim parser. To address this, the
+relevant frequencies and numbers of occurrences were artificially injected
+from the Google Web Trillion Word Corpus, described by Thorsten Brants and
+Alex Franz in 2006 [1]. For more information -- and for an example of a
+possible application of wikiwords -- see Peter Norvig's ngrams site at [2].
+
+1. http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html
+2. http://norvig.com/ngrams/
+
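The third Usage call above shows the notfound callback for out-of-vocabulary
words. Below is a minimal sketch of a smoothing callback in the spirit of the
Norvig ngrams examples cited in the README; the exact formula is an
illustrative assumption, not part of the package:

    import wikiwords

    # assumed smoothing: penalize unseen words exponentially in their length
    def unseen(word):
        return 10. / (wikiwords.N * 10 ** len(word))

    print wikiwords.freq("monty")               # found in the database
    print wikiwords.freq("xyzzyplugh", unseen)  # falls back to the callback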
@@ -0,0 +1,11 @@
+from setuptools import setup
+
+setup(name='wikiwords',
+ version='0.5',
+ description='Word frequencies in 2012 English Wikipedia',
+ url='http://github.com/iitis/wikiwords',
+ author='Pawel Foremski',
+ author_email='pjf@iitis.pl',
+ license='the Wikipedia license + GNU GPL',
+ packages=['wikiwords'],
+ zip_safe=False)
@@ -0,0 +1,75 @@
+Data source
+-----------
+
+http://dumps.wikimedia.org/enwiki/latest/
+
+Included files
+--------------
+
+ enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2
+ 2012-Nov-02 06:14:33 40.4M application/x-bzip
+ enwiki-latest-pages-articles2.xml-p000010002p000024999.bz2
+ 2012-Nov-02 06:15:27 66.0M application/x-bzip
+ enwiki-latest-pages-articles3.xml-p000025001p000055000.bz2
+ 2012-Nov-02 06:16:58 100.4M application/x-bzip
+ enwiki-latest-pages-articles4.xml-p000055002p000104998.bz2
+ 2012-Nov-02 06:17:37 101.7M application/x-bzip
+ enwiki-latest-pages-articles5.xml-p000105002p000184999.bz2
+ 2012-Nov-02 06:21:21 136.5M application/x-bzip
+ enwiki-latest-pages-articles6.xml-p000185003p000305000.bz2
+ 2012-Nov-02 06:21:05 163.7M application/x-bzip
+ enwiki-latest-pages-articles7.xml-p000305002p000464996.bz2
+ 2012-Nov-02 06:23:11 188.7M application/x-bzip
+ enwiki-latest-pages-articles8.xml-p000465001p000665000.bz2
+ 2012-Nov-02 06:25:02 179.6M application/x-bzip
+ enwiki-latest-pages-articles9.xml-p000665001p000925000.bz2
+ 2012-Nov-02 06:27:04 176.1M application/x-bzip
+ enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2
+ 2012-Nov-02 06:31:30 252.4M application/x-bzip
+ enwiki-latest-pages-articles11.xml-p001325001p001825000.bz2
+ 2012-Nov-02 06:32:20 262.5M application/x-bzip
+ enwiki-latest-pages-articles12.xml-p001825001p002425000.bz2
+ 2012-Nov-02 06:35:38 286.2M application/x-bzip
+ enwiki-latest-pages-articles13.xml-p002425002p003124997.bz2
+ 2012-Nov-02 06:37:45 270.5M application/x-bzip
+ enwiki-latest-pages-articles14.xml-p003125001p003924999.bz2
+ 2012-Nov-02 06:36:39 270.9M application/x-bzip
+ enwiki-latest-pages-articles15.xml-p003925001p004824998.bz2
+ 2012-Nov-02 06:39:35 261.5M application/x-bzip
+ enwiki-latest-pages-articles16.xml-p004825005p006024996.bz2
+ 2012-Nov-02 06:43:34 320.6M application/x-bzip
+ enwiki-latest-pages-articles17.xml-p006025001p007524997.bz2
+ 2012-Nov-02 06:46:14 328.5M application/x-bzip
+ enwiki-latest-pages-articles18.xml-p007525004p009225000.bz2
+ 2012-Nov-02 06:46:56 348.6M application/x-bzip
+ enwiki-latest-pages-articles19.xml-p009225002p011124997.bz2
+ 2012-Nov-02 06:45:55 327.9M application/x-bzip
+ enwiki-latest-pages-articles20.xml-p011125004p013324998.bz2
+ 2012-Nov-02 06:56:06 438.6M application/x-bzip
+ enwiki-latest-pages-articles21.xml-p013325003p015724999.bz2
+ 2012-Nov-02 06:57:01 459.1M application/x-bzip
+ enwiki-latest-pages-articles22.xml-p015725013p018225000.bz2
+ 2012-Nov-02 06:56:06 416.6M application/x-bzip
+ enwiki-latest-pages-articles23.xml-p018225004p020925000.bz2
+ 2012-Nov-02 06:59:46 489.5M application/x-bzip
+ enwiki-latest-pages-articles24.xml-p020925002p023724999.bz2
+ 2012-Nov-02 07:07:38 542.7M application/x-bzip
+ enwiki-latest-pages-articles25.xml-p023725001p026625000.bz2
+ 2012-Nov-02 07:01:45 573.2M application/x-bzip
+ enwiki-latest-pages-articles26.xml-p026625004p029624976.bz2
+ 2012-Nov-02 07:00:55 535.2M application/x-bzip
+ enwiki-latest-pages-articles27.xml-p029625017p037517228.bz2
+ 2012-Nov-02 08:15:33 1.4G application/x-bzip
+
+Python code
+------------
+You'll need the gensim package.
+
+1. parsewiki.py <num> reads the "./a<num>.bz2" Wikipedia file and stores
+   the result in "./a<num>"
+2. countwords.py <num> reads the "./a<num>" file, filters words and stores
+   word occurrences in "./db<num>" (a pickled defaultdict(int))
+3. mergedb.py <from> <to> reads files "./db<from>" through "./db<to>", adds
+   up word occurrences and stores the result in "./db" (pickled) and
+   "./dbfile.py" (Python code, much faster to load)
+
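A minimal driver for the whole pipeline, assuming the 27 dump files listed
above have been renamed to ./a1.bz2 through ./a27.bz2 (the layout the scripts
below expect):

    import subprocess

    # stages 1 and 2: parse and count each of the 27 dump files
    for i in range(1, 28):
        subprocess.check_call(["python", "parsewiki.py", str(i)])
        subprocess.check_call(["python", "countwords.py", str(i)])

    # stage 3: merge the per-file databases into ./db and ./dbfile.py
    subprocess.check_call(["python", "mergedb.py", "1", "27"])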
@@ -0,0 +1,31 @@
+import sys
+import pickle
+from collections import defaultdict
+from gensim.corpora import wikicorpus as wc
+
+def readfile(num=1):
+    """Count word occurrences in the parsed corpus "./a<num>" and
+    store them in "./db<num>" as a pickled defaultdict(int)."""
+    db = defaultdict(int)
+
+    # load the WikiCorpus object saved by parsewiki.py
+    f = open("./a" + str(num))
+    a = pickle.load(f)
+    f.close()
+
+    counter = 0
+    for text in a.get_texts():
+        for word in text:
+            if str.isalpha(word):  # keep purely alphabetic words
+                counter += 1
+                db[word] += 1
+
+    # filter infrequent words (relative frequency < 5e-7, i.e. < 0.00005%)
+    T = int(5e-7 * counter)
+    for k, c in db.items():  # items() returns a list in Python 2, so deleting is safe
+        if c < T:
+            del db[k]
+
+ f = open("./db" + str(num), "w")
+ pickle.dump(db, f)
+ f.close()
+
+if __name__ == "__main__":
+ readfile(int(sys.argv[1]))
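To make the per-file threshold concrete: for a hypothetical input file with
80 million counted words, T = int(5e-7 * 8e7) = 40, so any word seen fewer
than 40 times in that file is dropped before pickling. The 80 million figure
is illustrative only; actual per-file counts are not recorded here.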
@@ -0,0 +1,40 @@
+import sys
+import cPickle as pickle
+from collections import defaultdict
+
+def add_dict(db1, db2):
+    """Merge db2's counts into db1 (db1 is mutated and returned)."""
+    for k, v in db2.items():
+        db1[k] += v
+    return db1
+
+def merge(a=1, z=27):
+ dbs = []
+
+ for i in xrange(a, z+1):
+ print "loading db" + str(i)
+ f = open("./db" + str(i))
+ dbs.append(pickle.load(f))
+ f.close()
+
+ db = reduce(add_dict, dbs)
+
+    # filter really rare words (relative frequency < 5e-8, i.e. < 0.000005%)
+    T = int(5e-8 * sum(db.itervalues()))
+    for k, c in db.items():  # items() returns a list in Python 2, so deleting is safe
+        if c < T:
+            del db[k]
+
+ f = open("./db", "w")
+ pickle.dump(db, f)
+ f.close()
+
+ f = open("./dbfile.py", "w")
+ f.write(str(db))
+ f.close()
+
+ print "done"
+ print "(edit dbfile.py before using)"
+
+if __name__ == "__main__":
+ merge(int(sys.argv[1]), int(sys.argv[2]))
+
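mergedb.py leaves ./dbfile.py as a raw defaultdict repr, so a hand edit is
needed before the data can be imported. A sketch of that step, under the
assumption that the final one-line modules (wikiwords/freqdb.py and
wikiwords/occdb.py) hold a plain "db = {...}" assignment:

    # strip the wrapper: "defaultdict(<type 'int'>, {...})" becomes "{...}"
    raw = open("./dbfile.py").read()
    body = raw[raw.index("{"): raw.rindex("}") + 1]
    open("occdb.py", "w").write("db = " + body + "\n")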
@@ -0,0 +1,10 @@
+import sys
+from gensim.corpora import wikicorpus as wc
+
+def parse(i=1):
+    print "processing wikipedia file a", i
+    # parse the bzipped dump and save the resulting WikiCorpus object
+    wiki = wc.WikiCorpus("./a" + str(i) + ".bz2")
+    wiki.save("./a" + str(i))
+
+if __name__ == "__main__":
+    parse(int(sys.argv[1]))
+
@@ -0,0 +1,25 @@
+from . import occdb
+from . import freqdb
+
+def freq(word, notfound=None):
+    """Return the frequency of the given word.
+    If not found, return notfound(word) or 0.0."""
+    if word in freqdb.db:
+        return freqdb.db[word]
+    elif notfound:
+        return notfound(word)
+    else:
+        return 0.0
+
+def occ(word, notfound=None):
+    """Return the number of occurrences of the given word.
+    If not found, return notfound(word) or 0."""
+    if word in occdb.db:
+        return occdb.db[word]
+    elif notfound:
+        return notfound(word)
+    else:
+        return 0
+
+# vocabulary size and total number of word occurrences in the corpus
+W = 105940.
+N = 2194549780.
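Given the constants above, freqdb presumably stores occurrence counts
normalized by N (the total number of occurrences); a quick consistency check
under that assumption:

    import wikiwords

    # these should agree closely if freq(w) == occ(w) / N
    print wikiwords.freq("python")
    print wikiwords.occ("python") / wikiwords.N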

(The diffs for wikiwords/freqdb.py and wikiwords/occdb.py, one generated line
each holding the word databases, are too large to render.)
