Skip to content

Commit

Permalink
added command line option to specify file, reorganized
Browse files Browse the repository at this point in the history
  • Loading branch information
johnsheehan committed Nov 14, 2011
1 parent 06aeff9 commit 0640e01
Showing 1 changed file with 83 additions and 55 deletions.
138 changes: 83 additions & 55 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,70 @@
# requires simplejson, oauth2
from oauthtwitter import OAuthApi
from optparse import OptionParser
from time import sleep
from config import oauth_config
import codecs
import oauth2 as oauth
import re

# bios loaded from the source file or fetched from twitter; filled by main()
# and fetch_and_store(), consumed by parse_bios()
bios = []
# maps each lower-cased, non-stop word to the number of times it was seen
word_count = {}

def main():
# parse command line options
parser = OptionParser()
parser.add_option("-s", "--source", dest="source",
help="if specified, use source file name")
(options, args) = parser.parse_args()

if options.source:
print "reading from %s" % options.source
with codecs.open(options.source, encoding="utf-8", mode="r") as bios_file:
bios.extend(map(unicode, bios_file.readlines()))

parse_bios()
else:
print "fetching from twitter"
fetch_and_store()
parse_bios()

# TODO
# count most common following word
# count most common preceding word

def parse_bios():
# loop through bio array and process each bio
print "processing %i bios" % len(bios)

# included some symbols to avoid writing more regex
stop_words = ".,\,:,/,a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,b,be,because,been,but,by,c,can,cannot,could,dear,did,do,does,either,e,else,ever,every,f,for,from,g,get,got,h,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,j,just,k,l,least,let,likely,m,may,me,might,most,must,my,neither,n,no,nor,not,o,of,off,often,oh,on,only,or,other,our,own,p,q,r,rather,s,said,say,says,she,should,since,so,some,t,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,u,us,v,w,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,x,y,yet,you,your,z".split(",")
stop_words = "0,1,2,3,4,5,6,7,8,9,_,+,&,@,.,\,:,/,a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,b,be,because,been,but,by,c,can,cannot,could,d,de,dear,did,do,does,either,e,else,ever,every,f,for,from,g,get,got,h,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,j,just,k,l,least,let,likely,m,may,me,might,most,must,my,neither,n,no,nor,not,o,of,off,often,oh,on,only,or,other,our,own,p,q,r,rather,s,said,say,says,she,should,since,so,some,t,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,u,us,v,w,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,x,y,yet,you,your,z".split(",")

for bio in bios:
clean = re.sub(ur"[^a-zA-Z0-9\#\+\.\/\,\-\_\&\:\?]", r" ", bio)
clean = re.sub(ur"(\.|\,|\-)\s", r" ", clean)
clean = re.sub(ur"\S+$", r"", clean)
clean = re.sub(ur"(\.|\,|\s)+", r"\1", clean)

# TODO remove , / \ : ' & - when whitespace on either side
# TODO remove , \ when no whitespace on both sides
clean = clean.strip()
if len(clean) == 0: continue

words = clean.split()
lower_words = map(unicode.lower, words)
for word in lower_words:
# exclude stop words
if word in stop_words: continue
word_count[word] = word_count.get(word, 0) + 1

# print out final counts
with codecs.open("results.txt", encoding="utf-8", mode="w") as results_file:
for w in sorted(word_count, key=word_count.get):
print w, word_count[w]
results_file.write("%s %s\n" % (w, word_count[w]))


def fetch_and_store():
# setup oauth
consumer_key = oauth_config["consumer_key"]
consumer_secret = oauth_config["consumer_secret"]
Expand All @@ -34,60 +90,32 @@ def main():

# put bios into array
# have to retrieve them 100 at a time from twitter
start, end = 0, 99
bios = []
word_count = {}
while end < len(ids): # TODO: will miss remainder if count % 100 != 0
print "fetching bios %s-%s" % (start, end)
current = ids[start:end]

id_list = ",".join(map(str, current))
try:
apiData = twitter.ApiCall("users/lookup", "GET", { 'user_id' : id_list })
except:
print "error! sleeping..."
sleep(5)
continue

# only advance counts if http succeeds
start = start + 100
end = end + 100

for user in apiData:
desc = user["description"]
if desc is None: continue

clean = re.sub(r"[^a-zA-Z0-9\#\+\.\/\\\,\-\_]", " ", desc)
clean = re.sub(r"(\. )|(\, )|(\- )|(\.$)", " ", clean)
clean = re.sub(r"\.+", ".", clean)
clean = re.sub(r" +", " ", clean)
# TODO remove , / \ : ' - when whitespace on either side
# TODO remove , / \ when no whitespace on both sides
clean = clean.strip()
if len(clean) == 0: continue
bios.append(clean)

# write bios to a file for later processing
with open("bios.txt", "w") as bios_file:
for bio in bios: bios_file.write("%s\n" % bio)

# loop through bio array and process each bio
print "processing bios"
for bio in bios:
words = bio.split()
lower_words = map(unicode.lower, words)
for word in lower_words:
# exclude stop words
if word in stop_words: continue
word_count[word] = word_count.get(word, 0) + 1

# print out final counts
for w in sorted(word_count, key=word_count.get):
print w, word_count[w]

# TODO
# count most common following word
# count most common preceding word
start, end = 0, 100
total = len(ids)

with codecs.open("bios.txt", encoding="utf-8", mode="w") as bios_file:
while start < total:
print "fetching bios %s-%s" % (start+1, end)
current = ids[start:end]

id_list = ",".join(map(str, current))
try:
apiData = twitter.ApiCall("users/lookup", "GET", { 'user_id' : id_list })
except:
print "error! sleeping..."
sleep(5)
continue

# only advance counts if http succeeds
start = start + 100
end = end + 100

# write bios to a file for later processing
for user in apiData:
desc = user["description"].strip()
if desc is None: continue
bios_file.write("%s\n" % desc)
bios.append(desc)

# run the script only when this file is executed directly
if __name__ == "__main__":
    main()

0 comments on commit 0640e01

Please sign in to comment.