Permalink
Browse files

added analysis

  • Loading branch information...
1 parent a520338 commit 6e0ffbc66fe3d2fd32fb11be045ae2003f08e612 @kykamath committed Feb 13, 2012
Showing with 98 additions and 13 deletions.
  1. +40 −0 .gitignore
  2. +31 −0 hashtags_for_locations/analysis.py
  3. +17 −13 hashtags_for_locations/mr_analysis.py
  4. +10 −0 hashtags_for_locations/settings.py
View
40 .gitignore
@@ -0,0 +1,40 @@
+# Compiled source #
+###################
+*.com
+*.class
+*.dll
+*.exe
+*.o
+*.so
+*.pyc
+*.stream
+*.gr
+# Packages #
+############
+# it's better to unpack these files and commit the raw source
+# git has its own built in compression methods
+*.7z
+*.dmg
+*.iso
+*.jar
+*.rar
+*.tar
+*.zip
+
+# Logs and databases #
+######################
+*.log
+*.sql
+*.sqlite
+*.bbl
+*.aux
+*.blg
+*.pdf
+*.dvi
+
+# OS generated files #
+######################
+.DS_Store?
+ehthumbs.db
+Icon?
+Thumbs.db
View
31 hashtags_for_locations/analysis.py
@@ -0,0 +1,31 @@
+'''
+Created on Nov 19, 2011
+
+@author: kykamath
+'''
+from settings import hdfsInputFolder, hashtagsWithoutEndingWindowFile, hashtagsWithEndingWindowFile
+from datetime import datetime
+from dateutil.relativedelta import relativedelta
+from library.mrjobwrapper import runMRJob
+from mr_analysis import MRAnalysis
+from hashtags_for_locations.mr_analysis import PARAMS_DICT
+from library.file_io import FileIO
+
+def getInputFiles(startTime, endTime, folderType='world'):
+ # Generator: yields one HDFS input path per month from startTime through
+ # endTime (inclusive). The %s in hdfsInputFolder is filled with folderType,
+ # and each monthly file is named '<year>_<month>' (no zero padding on the
+ # month, e.g. '2011_4').
+ current=startTime
+ while current<=endTime:
+ yield hdfsInputFolder%folderType+'%s_%s'%(current.year, current.month)
+ current+=relativedelta(months=1)
+
+def mr_analysis(startTime, endTime, outputFolder):
+ # Run the MRAnalysis job over the monthly input files in [startTime,
+ # endTime], writing results under the run folder named by outputFolder
+ # (300 reduce tasks requested via jobconf).
+ outputFile = hashtagsWithEndingWindowFile%outputFolder
+ runMRJob(MRAnalysis, outputFile, getInputFiles(startTime, endTime), jobconf={'mapred.reduce.tasks':300})
+ # Record the job's parameter settings (PARAMS_DICT) in the same output
+ # file — NOTE(review): presumably appended as a trailing JSON line after
+ # the MR output; confirm FileIO.writeToFileAsJson does not truncate.
+ FileIO.writeToFileAsJson(PARAMS_DICT, outputFile)
+
+if __name__ == '__main__':
+ # Exactly one (startTime, endTime, outputFolder) configuration is active;
+ # the commented alternatives select other experiment windows. These dates
+ # mirror the window pairs hard-coded in mr_analysis.py, so the two files
+ # should be switched together.
+ startTime, endTime, outputFolder = datetime(2011, 4, 1), datetime(2012, 1, 31), 'complete' # Complete duration
+# startTime, endTime, outputFolder = datetime(2011, 5, 1), datetime(2011, 12, 31), 'complete_prop' # Complete propagation duration
+# startTime, endTime, outputFolder = datetime(2011, 5, 1), datetime(2011, 10, 31), 'training' # Training duration
+# startTime, endTime, outputFolder = datetime(2011, 11, 1), datetime(2011, 12, 31), 'testing' # Testing duration
+
+ mr_analysis(startTime, endTime, outputFolder)
View
30 hashtags_for_locations/mr_analysis.py
@@ -7,7 +7,8 @@
from library.mrjobwrapper import ModifiedMRJob
from library.geo import getLatticeLid, getLattice, isWithinBoundingBox,\
getLocationFromLid, getHaversineDistance, getCenterOfMass
-import cjson, time, datetime
+import cjson, time
+from datetime import datetime
from collections import defaultdict
from itertools import groupby
import numpy as np
@@ -23,14 +24,18 @@
MIN_HASHTAG_OCCURENCES = 250
# Time windows.
-HASHTAG_STARTING_WINDOW = time.mktime(datetime.datetime(2011, 4, 1).timetuple())
-HASHTAG_ENDING_WINDOW = time.mktime(datetime.datetime(2012, 1, 31).timetuple())
-#HASHTAG_STARTING_WINDOW = time.mktime(datetime.datetime(2011, 5, 1).timetuple())
-#HASHTAG_ENDING_WINDOW = time.mktime(datetime.datetime(2011, 12, 31).timetuple())
-#HASHTAG_STARTING_WINDOW = time.mktime(datetime.datetime(2011, 5, 1).timetuple())
-#HASHTAG_ENDING_WINDOW = time.mktime(datetime.datetime(2011, 10, 31).timetuple())
-#HASHTAG_STARTING_WINDOW = time.mktime(datetime.datetime(2011, 11, 1).timetuple())
-#HASHTAG_ENDING_WINDOW = time.mktime(datetime.datetime(2011, 12, 31).timetuple())
+# Analysis window: pick one pair. These date pairs mirror the ones selectable
+# in analysis.py's __main__ block, so keep the two files in sync.
+startTime, endTime = datetime(2011, 4, 1), datetime(2012, 1, 31) # Complete duration
+#startTime, endTime = datetime(2011, 5, 1), datetime(2011, 12, 31) # Complete propagation duration
+#startTime, endTime = datetime(2011, 5, 1), datetime(2011, 10, 31) # Training duration
+#startTime, endTime = datetime(2011, 11, 1), datetime(2011, 12, 31) # Testing duration
+# time.mktime converts the naive datetimes to epoch seconds interpreted in
+# the machine's local timezone — NOTE(review): confirm input timestamps use
+# the same timezone convention.
+HASHTAG_STARTING_WINDOW, HASHTAG_ENDING_WINDOW = time.mktime(startTime.timetuple()), time.mktime(endTime.timetuple())
+
+# Parameters for the MR Job that will be logged.
+# The PARAMS_DICT=True entry is presumably a marker so this record can be
+# distinguished from job output when both land in the same file.
+PARAMS_DICT = dict(PARAMS_DICT = True,
+ LATTICE_ACCURACY=LATTICE_ACCURACY,
+ MIN_HASHTAG_OCCURENCES=MIN_HASHTAG_OCCURENCES,
+ HASHTAG_STARTING_WINDOW = HASHTAG_STARTING_WINDOW, HASHTAG_ENDING_WINDOW = HASHTAG_ENDING_WINDOW,
+ )
def iterateHashtagObjectInstances(line):
data = cjson.decode(line)
@@ -39,7 +44,6 @@ def iterateHashtagObjectInstances(line):
else: l = data['bb']
t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
point = getLattice(l, LATTICE_ACCURACY)
-# if isWithinBoundingBox(point, BOUNDING_BOX):
for h in data['h']: yield h.lower(), [point, t]
def getHashtagWithoutEndingWindow(key, values):
@@ -62,10 +66,10 @@ def getHashtagWithEndingWindow(key, values):
if numberOfInstances>=MIN_HASHTAG_OCCURENCES and \
e[1]>=HASHTAG_STARTING_WINDOW and l[1]<=HASHTAG_ENDING_WINDOW: return {'h': key, 't': numberOfInstances, 'e':e, 'l':l, 'oc': sorted(occurences, key=lambda t: t[1])}
-class MRAreaAnalysis(ModifiedMRJob):
+class MRAnalysis(ModifiedMRJob):
DEFAULT_INPUT_PROTOCOL='raw_value'
def __init__(self, *args, **kwargs):
- super(MRAreaAnalysis, self).__init__(*args, **kwargs)
+ super(MRAnalysis, self).__init__(*args, **kwargs)
self.hashtags = defaultdict(list)
''' Start: Methods to get hashtag objects
'''
@@ -93,4 +97,4 @@ def steps(self):
# return self.jobsToGetHastagObjectsWithoutEndingWindow()
if __name__ == '__main__':
- MRAreaAnalysis.run()
+ MRAnalysis.run()
View
10 hashtags_for_locations/settings.py
@@ -0,0 +1,10 @@
+'''
+Created on Feb 12, 2012
+
+@author: kykamath
+
+Path settings shared by analysis.py and mr_analysis.py.
+'''
+# HDFS folder holding raw monthly input; %s is the folder type
+# (e.g. 'world', see getInputFiles in analysis.py).
+hdfsInputFolder = 'hdfs:///user/kykamath/geo/hashtags/%s/'
+
+# Local root for analysis output; %s is the run name (e.g. 'complete').
+analysisFolder = '/mnt/chevron/kykamath/data/geo/hashtags/hashtags_for_locations/%s/'
+hashtagsWithoutEndingWindowFile = analysisFolder+'hashtagsWithoutEndingWindow'
+hashtagsWithEndingWindowFile = analysisFolder+'hashtagsWithEndingWindow'

0 comments on commit 6e0ffbc

Please sign in to comment.