Permalink
Browse files

modified experiments

  • Loading branch information...
1 parent c1539e1 commit 6da6d97770f0e76915aee191a3ca663a9e9b53a6 @kykamath committed Nov 19, 2011
View
@@ -5,6 +5,6 @@
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
-<path>/hashtags_and_geo/src</path>
+<path>/hashtags_and_geo</path>
</pydev_pathproperty>
</pydev_project>
@@ -0,0 +1,3 @@
+#Sat Nov 19 06:05:28 CST 2011
+eclipse.preferences.version=1
+org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false
View
No changes.
View
@@ -0,0 +1,5 @@
+'''
+Created on Nov 19, 2011
+
+@author: kykamath
+'''
View
@@ -0,0 +1,56 @@
+'''
+Created on Nov 19, 2011
+
+@author: kykamath
+'''
+from library.twitter import getDateTimeObjectFromTweetTimestamp
+from library.mrjobwrapper import ModifiedMRJob
+import cjson, time, datetime
+from collections import defaultdict
+
+MIN_HASHTAG_OCCURENCES = 25
+HASHTAG_STARTING_WINDOW = datetime.datetime(2011, 2, 1)
+HASHTAG_ENDING_WINDOW = datetime.datetime(2011, 11, 30)
+
+def iterateHashtagObjectInstances(line):
+ data = cjson.decode(line)
+ l = data['geo']
+ t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
+ for h in data['h']: yield h, [l, t]
+
+class MRExperiments(ModifiedMRJob):
+ DEFAULT_INPUT_PROTOCOL='raw_value'
+ def __init__(self, *args, **kwargs):
+ super(MRExperiments, self).__init__(*args, **kwargs)
+ self.hashtags = defaultdict(list)
+
+ ''' Start: Methods to get hashtag objects
+ '''
+ def parse_hashtag_objects(self, key, line):
+ if False: yield # I'm a generator!
+ for h, d in iterateHashtagObjectInstances(line): self.hashtags[h].append(d)
+ def parse_hashtag_objects_final(self):
+ for h, instances in self.hashtags.iteritems(): # e = earliest, l = latest
+ yield h, {'oc': instances, 'e': min(instances, key=lambda t: t[1]), 'l': max(instances, key=lambda t: t[1])}
+ def combine_hashtag_instances(self, key, values):
+ occurences = []
+ e, l = None, None
+ for instances in values:
+ if e==None or e<instances['e'][1]: e = instances['e']
+ if l==None or l>instances['l'][1]: l = instances['l']
+ occurences+=instances['oc']
+ numberOfInstances=len(occurences)
+ if numberOfInstances>=MIN_HASHTAG_OCCURENCES and \
+ datetime.datetime.fromtimestamp(e[1])>=HASHTAG_STARTING_WINDOW and \
+ datetime.datetime.fromtimestamp(l[1])<=HASHTAG_ENDING_WINDOW:
+ yield key, {'h': key, 't': numberOfInstances, 'e':e, 'l':l, 'oc': occurences}
+ ''' End: Methods to get hashtag objects
+ '''
+
+ def jobsToGetHastagObjects(self): return [self.mr(mapper=self.parse_hashtag_objects, mapper_final=self.parse_hashtag_objects_final, reducer=self.combine_hashtag_instances)]
+
+ def steps(self):
+ return self.jobsToGetHastagObjects() + self.jobsToCountNumberOfKeys()
+
+if __name__ == '__main__':
+ MRExperiments.run()
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -0,0 +1,5 @@
+'''
+Created on Nov 19, 2011
+
+@author: kykamath
+'''

0 comments on commit 6da6d97

Please sign in to comment.