In [1]:
from collections import Counter, OrderedDict
from operator import itemgetter
import tempfile
import numpy as np
from codingChallenge import utils


class UniqueWordsCalculator(object):
    """
    Class container for the functions related to coungint the number of unique
    words as Tweets arrive
    """
    def __init__(self, tweet_iterable, path_to_input_file):
        self.tweet_iterable = tweet_iterable
        self.input_file_path = path_to_input_file

    def count_unique(self):
        """
        This is the function documentation
        """
        count_container = Counter()
        for tweet in self.tweet_iterable:
            # Encapsulte tweet in string call and return strip to
            # escape any strange characters
            count_container = count_container + \
                Counter(utils.clean_tweet(tweet).split(" "))

        sorted_count_dictionary = OrderedDict(sorted(count_container.items(),
                                              key=itemgetter(0)))

        # Remove edge cases of blank string or space string
        sorted_count_dictionary.pop(' ', None)
        sorted_count_dictionary.pop('', None)
        return sorted_count_dictionary

    def counter_on_all_words(self):
        with tempfile.TemporaryFile() as tmpfile:
            # Clean each tweet and write it out to the temporary file, with
            # a trailing newline
            for tweet in self.tweet_iterable:
                for word in utils.clean_tweet(tweet).split(" "):
                    tmpfile.write(word + "\n")
            # Make sure that the file is at the beginning and then create a
            # Counter from it to get the unique items
            tmpfile.seek(0)
            count_container = Counter(tmpfile.read().splitlines())

            sorted_count_dictionary = OrderedDict(sorted(count_container.items(),
                                              key=itemgetter(0)))
            return sorted_count_dictionary.items()

    def numpy_count_unique(self):
        tweet_array = np.genfromtxt(self.input_file_path,dtype=np.string_, comments=False, delimiter="\n")
        print tweet_array
        word_array = np.concatenate([tweet.split() for tweet in tweet_array])
        word_count = Counter(word_array)
        alphabetized_count = OrderedDict(sorted(word_count.items(), key=itemgetter(0)))
        return alphabetized_count.items()


    def run(self):
        """
        Create a generic run method on the object to make reimplemntation
        easier. That way code in the Dispatcher doesn't need to be refactored
        """
        # return self.count_unique()
        return self.numpy_count_unique()



In [10]:
    test_tweets = [
        "is #bigdata finally the answer to end poverty? \
        @lavanyarathnam http://ow.ly/o8gt3 #analytics",
        "interview: xia wang, astrazeneca on #bigdata and the promise of effective \
        healthcare #kdn http://ow.ly/ot2uj",
        "big data is not just for big business. on how #bigdata is being deployed for \
        small businesses: http://bddy.me/1bzukb3 @cxotodayalerts #smb"
    ]

In [20]:
def numpy_count_unique(input_text):
        tweet_array = np.genfromtxt(input_text, comments=False, dtype=np.string_, delimiter="\n")
        word_array = np.concatenate([tweet.split() for tweet in tweet_array])
        word_count = Counter(word_array)
        alphabetized_count = OrderedDict(sorted(word_count.items(), key=itemgetter(0)))
        return alphabetized_count.items()

In [22]:
with open("/Users/Jacob/Documents/Projects/DataInsight/codingChallenge/benchmark/data/5k_tweets.txt") as abc:
    a = numpy_count_unique(abc)
print a

<open file '/Users/Jacob/Documents/Projects/DataInsight/codingChallenge/benchmark/data/5k_tweets.txt', mode 'r' at 0x107316390>
[ "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"
 "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"
 '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'
 ..., '@gjarnling I am fine thanks - tired'
 'trying to keep my eyes open..damn baking' 'why the hell is it snowing']


In [308]:
with open("/Users/Jacob/Documents/Projects/DataInsight/codingChallenge/codingChallenge/tests/fixtures/tweet_input/tweets.txt") as test_file:
    tweet_array = np.genfromtxt(test_file, dtype=np.string_, delimiter=' ')
    print tweet_array
    word_array = np.concatenate([tweet.split() for tweet in tweet_array])
    word_count = Counter(word_array)
    alphabetized_count = OrderedDict(sorted(word_count.items(), key=itemgetter(0)))
    print alphabetized_count


ValueError: Some errors were detected !
    Line #2 (got 5 columns instead of 1)
    Line #3 (got 10 columns instead of 1)

In [84]:
a = np.genfromtxt("/Users/Jacob/Documents/Projects/DataInsight/codingChallenge/benchmark/data/1k_tweets.txt",
             dtype='|S160',
             delimiter="\n",
            )

def split_string(in_string):
    return in_string.split()
vfunc = np.vectorize(split_string, otypes=[np.string_])


In [113]:
b = Counter(a)

In [231]:
a = np.array([['a','b','c'], ['d','e','f','g']])

In [235]:
print a
print a.ravel()
np.concatenate(a)

[['a', 'b', 'c'] ['d', 'e', 'f', 'g']]
[['a', 'b', 'c'] ['d', 'e', 'f', 'g']]


array(['a', 'b', 'c', 'd', 'e', 'f', 'g'], 
      dtype='|S1')

In [233]:
print a

[['a', 'b', 'c'] ['d', 'e', 'f', 'g']]
