In [5]:
import bisect

import numpy as np
from codingChallenge import utils


class MedianCalculator(object):
    """
    Compute the running median of unique-word counts over a series of tweets.

    Each tweet is cleaned with ``utils.clean_tweet``, split on whitespace and
    reduced to its number of distinct words. After every tweet the median of
    all counts seen so far is appended to ``median_list``.
    """
    def __init__(self, tweet_input):
        """
        :param tweet_input: a single tweet given as a string, or an iterable
            of tweet strings.
        """
        # A lone string would otherwise be iterated character by character,
        # so wrap it in a one-element list.
        if isinstance(tweet_input, str):
            self.tweet_iterable = [tweet_input]
        else:
            self.tweet_iterable = tweet_input
        # Unique-word counts of the tweets processed so far, kept ascending.
        self.num_unique_words_sorted = []
        # One running-median entry per processed tweet.
        self.median_list = []

    def format_to_two_decimal(self, input_amount):
        """
        Round ``input_amount`` to two decimal places.

        :param input_amount: number to round.
        :return: the rounded value as a float.
        """
        formatted_string = "{0:.2f}".format(input_amount)
        return float(formatted_string)

    def append_to_unique_word_list(self, tweet):
        """
        Record the number of distinct whitespace-separated words in ``tweet``.

        :param tweet: tweet text (already cleaned by the caller).
        """
        tweet_unique_words = len(set(tweet.split()))
        # Insert at the sorted position instead of re-sorting the whole list.
        bisect.insort(self.num_unique_words_sorted, tweet_unique_words)

    def populate_median_list(self):
        """
        Process every tweet in ``tweet_iterable`` and extend ``median_list``.

        With an odd number of counts the middle count itself (an int) is
        appended; with an even number the mean of the two middle counts
        (a float) is appended.

        :return: ``self.median_list``.
        """
        for dirty_tweet in self.tweet_iterable:
            tweet = utils.clean_tweet(dirty_tweet)
            self.append_to_unique_word_list(tweet)
            counts = self.num_unique_words_sorted
            # Floor division keeps the index an int on Python 3 as well;
            # plain "/" would produce a float there and break indexing.
            middle = len(counts) // 2
            if len(counts) % 2:
                # Odd amount: single middle element.
                self.median_list.append(counts[middle])
            else:
                # Even amount: average the two middle elements.
                average_of_medians = self.format_to_two_decimal(
                    (counts[middle - 1] + counts[middle]) / 2.0)
                self.median_list.append(average_of_medians)

        return self.median_list

    def run(self):
        """
        Run methods necessary to return the array of medians.

        The run method helps abstract away the calling of the methods so that
        less refactoring is needed later on.
        """
        return self.populate_median_list()






In [2]:
# Three sample tweets used as fixtures by the cells below.
# NOTE: each literal is continued with a backslash *inside* the string, so the
# following line's leading indentation becomes part of the tweet text (it shows
# up as a run of spaces when the values are echoed). Whitespace-based split()
# is unaffected by those extra spaces.
test_tweets = [
        "is #bigdata finally the answer to end poverty? \
        @lavanyarathnam http://ow.ly/o8gt3 #analytics",
        "interview: xia wang, astrazeneca on #bigdata and the promise of effective \
        healthcare #kdn http://ow.ly/ot2uj",
        "big data is not just for big business. on how #bigdata is being deployed for \
        small businesses: http://bddy.me/1bzukb3 @cxotodayalerts #smb"
    ]

In [3]:
# Build a calculator over the first two sample tweets (list input).
b = MedianCalculator(test_tweets[0:2])

In [4]:
# Inspect the stored input: a list argument is kept as-is, not wrapped again.
b.tweet_iterable

['is #bigdata finally the answer to end poverty?         @lavanyarathnam http://ow.ly/o8gt3 #analytics',
 'interview: xia wang, astrazeneca on #bigdata and the promise of effective         healthcare #kdn http://ow.ly/ot2uj']

In [188]:
def numpy_calculate_median(tweet_iterable):
    """
    Return the running median of unique-word counts for a series of tweets.

    :param tweet_iterable: an iterable of tweet strings (list, generator,
        open text file, ...), or a string naming a text file that holds one
        tweet per line.
    :return: list with one ``np.median`` result per processed tweet; entry
        ``i`` is the median of the unique-word counts of the tweets processed
        up to and including tweet ``i``.

    Tweets are split on whitespace only; no cleaning step is applied here.
    Entries that contain no words are skipped.
    """
    # A bare string is read as a file path (one tweet per line), which is how
    # the earlier np.genfromtxt-based loader treated string arguments.
    if isinstance(tweet_iterable, str):
        with open(tweet_iterable) as tweet_file:
            tweets = tweet_file.readlines()
    else:
        tweets = tweet_iterable

    running_list_of_uniques = []
    running_list_of_median = []
    for tweet in tweets:
        words = tweet.split()
        if not words:
            # Blank entry: contributes no count.
            continue
        running_list_of_uniques.append(len(set(words)))
        running_list_of_median.append(np.median(running_list_of_uniques))
    return running_list_of_median

In [190]:
# Running medians of unique-word counts over all three sample tweets.
a = numpy_calculate_median(test_tweets)

[ 11.   12.5  14. ]


In [193]:
# Show each running median right-aligned in a 10-character field with two
# decimals. print(...) with a single parenthesised argument behaves the same
# on Python 2 and Python 3, whereas the print statement is a syntax error on 3.
for x in a:
    print("{:10.2f}".format(x))

     11.00
     12.50
     14.00


In [115]:
# Scratch check: append one extra value to the medians via np.concatenate.
# NOTE(review): the recorded output below does not match the medians computed
# above; it appears to come from an earlier value of `a` -- re-run to refresh.
np.concatenate((np.array(a), [6]))

array([5, 6])

In [60]:
def test_func(tweet_input):
    if type(tweet_input == str):
        return [tweet_input]
    else:
        return tweet_input

In [62]:
# Call test_func with a list; a list input should come back unchanged rather
# than nested inside another list.
test_func(test_tweets[0:2])

[['is #bigdata finally the answer to end poverty?         @lavanyarathnam http://ow.ly/o8gt3 #analytics',
  'interview: xia wang, astrazeneca on #bigdata and the promise of effective         healthcare #kdn http://ow.ly/ot2uj']]

In [63]:
# Show the plain two-tweet slice.
test_tweets[0:2]

['is #bigdata finally the answer to end poverty?         @lavanyarathnam http://ow.ly/o8gt3 #analytics',
 'interview: xia wang, astrazeneca on #bigdata and the promise of effective         healthcare #kdn http://ow.ly/ot2uj']

In [66]:
# A list slice is not a str, so this type comparison evaluates to False.
type(test_tweets[0:2]) == str

False