Skip to content

Commit

Permalink
modified experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
kykamath committed Nov 19, 2011
1 parent c1539e1 commit 6da6d97
Show file tree
Hide file tree
Showing 7 changed files with 1,070 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .pydevproject
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/hashtags_and_geo/src</path>
<path>/hashtags_and_geo</path>
</pydev_pathproperty>
</pydev_project>
3 changes: 3 additions & 0 deletions .settings/org.eclipse.ltk.core.refactoring.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#Sat Nov 19 06:05:28 CST 2011
eclipse.preferences.version=1
org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false
Empty file added analysis/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions analysis/experiments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
'''
Created on Nov 19, 2011
@author: kykamath
'''
56 changes: 56 additions & 0 deletions analysis/mr_experiments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
'''
Created on Nov 19, 2011
@author: kykamath
'''
from library.twitter import getDateTimeObjectFromTweetTimestamp
from library.mrjobwrapper import ModifiedMRJob
import cjson, time, datetime
from collections import defaultdict

# Minimum number of occurrences a hashtag needs (summed over all mappers)
# before the reducer emits an object for it.
MIN_HASHTAG_OCCURENCES = 25

# Time window applied by the reducer: a hashtag is emitted only when its
# earliest occurrence is at or after the start and its latest occurrence is
# at or before the end. Both bounds are naive datetimes at midnight.
HASHTAG_STARTING_WINDOW = datetime.datetime(year=2011, month=2, day=1)
HASHTAG_ENDING_WINDOW = datetime.datetime(year=2011, month=11, day=30)

def iterateHashtagObjectInstances(line):
    '''Yield a (hashtag, [geo, epoch]) pair for every hashtag in one record.

    line is decoded with cjson and must provide the keys 'geo', 't' and 'h'.
    The 't' value is converted by getDateTimeObjectFromTweetTimestamp and then
    turned into seconds since the epoch with time.mktime, which interprets
    the time tuple as local time. A fresh [geo, epoch] list is built for each
    hashtag so callers may store the lists independently.
    '''
    record = cjson.decode(line)
    location = record['geo']
    created_at = getDateTimeObjectFromTweetTimestamp(record['t'])
    epoch = time.mktime(created_at.timetuple())
    for hashtag in record['h']:
        yield hashtag, [location, epoch]

class MRExperiments(ModifiedMRJob):
    '''Map/reduce job that builds one summary object per hashtag.

    The mapper groups [geo, epoch] occurrences by hashtag in memory, the
    mapper-final step emits one partial summary per hashtag, and the reducer
    merges the partial summaries and filters them by occurrence count and
    time window.
    '''
    DEFAULT_INPUT_PROTOCOL = 'raw_value'

    def __init__(self, *args, **kwargs):
        super(MRExperiments, self).__init__(*args, **kwargs)
        # hashtag -> list of [geo, epoch] occurrences seen by this mapper.
        self.hashtags = defaultdict(list)

    # Start: Methods to get hashtag objects

    def parse_hashtag_objects(self, key, line):
        '''Accumulate the occurrences found in line; emits nothing itself.

        key is ignored. Output is deferred to parse_hashtag_objects_final.
        '''
        if False: yield  # never runs; its presence makes this a generator
        for h, d in iterateHashtagObjectInstances(line):
            self.hashtags[h].append(d)

    def parse_hashtag_objects_final(self):
        '''Yield (hashtag, partial summary) for every accumulated hashtag.

        The summary dict holds 'oc' (all occurrences), 'e' (the occurrence
        with the smallest timestamp) and 'l' (the one with the largest).
        '''
        for h, instances in self.hashtags.iteritems():  # e = earliest, l = latest
            yield h, {'oc': instances,
                      'e': min(instances, key=lambda t: t[1]),
                      'l': max(instances, key=lambda t: t[1])}

    def combine_hashtag_instances(self, key, values):
        '''Merge the partial summaries of one hashtag and yield it if it qualifies.

        key is the hashtag; values iterates over dicts shaped like the ones
        produced by parse_hashtag_objects_final. A result is yielded only when
        the total number of occurrences is at least MIN_HASHTAG_OCCURENCES,
        the earliest occurrence is not before HASHTAG_STARTING_WINDOW and the
        latest is not after HASHTAG_ENDING_WINDOW.
        '''
        occurences = []
        e, l = None, None
        for instances in values:
            # e and l are [geo, epoch] pairs, so compare their timestamps
            # (index 1) with each other: keep the smallest as the earliest
            # and the largest as the latest.
            if e is None or instances['e'][1] < e[1]: e = instances['e']
            if l is None or instances['l'][1] > l[1]: l = instances['l']
            occurences += instances['oc']
        numberOfInstances = len(occurences)
        # The count check comes first so that e and l are never indexed when
        # no values were received (they would still be None).
        if numberOfInstances >= MIN_HASHTAG_OCCURENCES and \
                datetime.datetime.fromtimestamp(e[1]) >= HASHTAG_STARTING_WINDOW and \
                datetime.datetime.fromtimestamp(l[1]) <= HASHTAG_ENDING_WINDOW:
            yield key, {'h': key, 't': numberOfInstances, 'e': e, 'l': l, 'oc': occurences}

    # End: Methods to get hashtag objects

    def jobsToGetHastagObjects(self):
        '''Return the single step that turns input lines into hashtag objects.'''
        return [self.mr(mapper=self.parse_hashtag_objects,
                        mapper_final=self.parse_hashtag_objects_final,
                        reducer=self.combine_hashtag_instances)]

    def steps(self):
        '''Return the hashtag-object step followed by the key-counting steps.'''
        # NOTE(review): jobsToCountNumberOfKeys is not defined in this class;
        # presumably it is inherited from ModifiedMRJob - confirm.
        return self.jobsToGetHastagObjects() + self.jobsToCountNumberOfKeys()

# Script entry point: start the job only when this file is executed directly,
# not when it is imported.
if __name__ == '__main__':
    MRExperiments.run()
1,000 changes: 1,000 additions & 0 deletions data/checkins

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
'''
Created on Nov 19, 2011
@author: kykamath
'''

0 comments on commit 6da6d97

Please sign in to comment.