In [42]:
import math
import random
import pandas as pd
import gc
import datetime


# Ex 2.1 -> DGIM

### Creating the DGIM class

In [125]:
class DGIM():
    def __init__(self, N):
        self.N = N
        # the index is the power of the 2 corresponding to the number of ones in the buckets
        # Ex: 0: 2^0 = 1 -> the buckets only have one 1 each.
        self.bucket_list = {0: []}
        # current max power of 2, indicates the biggest current bucket size
        self.max_power = 0
        # helper flag used to discard elements that are N bits old
        self.flag = 0
        self.timestamp = 0
        
        
    def process_bit(self, bit):
        
        if self.timestamp > 0: # to ensure that we don't access a empty list
            if self.bucket_too_old():
                # remove the oldest bit
                self.bucket_list[self.max_power] = self.bucket_list[self.max_power][:-1]
                
                # if the largest (oldest) bucket becomes empty we remove it
                if len(self.bucket_list[self.max_power]) == 0:
                    self.bucket_list.pop(self.max_power)
                    self.max_power -= 1
                    
        if bit == 0:
            return
        
        self.update()
        self.timestamp += 1
        self.flag = self.timestamp % self.N
        
        return
    
    
    def update(self):
        power = 0
        
        # add to the begining of the list (works as a queue)
        self.bucket_list[0] = [[self.timestamp, self.flag]] + self.bucket_list[0]
        
        # when we have more than 2 buckets of the same size we 
        # need to combine them into greater power buckets
        # using while instead of for prevents us from iterating through all of the buckets
        while len(self.bucket_list[power]) > 2:
            # get the 2 oldest buckets to be merged
            bucket = self.bucket_list[power][-2:]
            # the newest element remains in the bucket
            self.bucket_list[power] = self.bucket_list[power][:1]
            power += 1
            
            # check if the next power already has a bucket
            if power in self.bucket_list:
                self.bucket_list[power] = [bucket[0]] + self.bucket_list[power]
            else:
                self.bucket_list[power] = [bucket[0]]
                self.max_power = power
                
        return
        
    
    def get_count(self):
        result = 0
        power = 0
        
        for x in range(self.max_power + 1):
            # number of buckets of size x, varies between 0 and 2
            bucket_len = len(self.bucket_list[x])
            if bucket_len > 0:
                result += bucket_len * math.pow(2, power)
            # increase the power of 2 (ex: 2^1 becomes 2^2)
            power += 1
        # since we only consider half of the last bucket 
        # we need to subtract half of it from the result
        result -= math.floor((2**self.max_power)/2)
            
        return int(result)
    
    def get_in_last_k(self, k):
        timepoint = self.timestamp - k
        result = 0
        power = 0
        # auxiliary parameters to remove half of B
        B_timestamp = math.inf
        B_power = 0
        
        for k in range(self.max_power + 1):
            for v in range(len(self.bucket_list[k])):
                if self.bucket_list[k][v][0] > timepoint:
                    result += math.pow(2, power)
                    # keep track of the earliest bucket that overlaps with k
                    if self.bucket_list[k][v][0] < B_timestamp:
                        B_timestamp = self.bucket_list[k][v][0]
                        B_power = power
                        
            power += 1
            
        # since we already summed all of B now we need to remove half of it
        result -= math.floor((2**B_power)/2)      
        
        return int(result)
    
    def bucket_too_old(self):
        # returns True or False
        # bucket_list[self.max_power][-1] -> last element of the oldest bucket, that is, the oldest element
        # this element is only equal to the value of the flag when N elements have gone through the algorithm
        return (self.flag == self.bucket_list[self.max_power][-1][1])
    
    
    

### Creating a function to simulate a stream

In [119]:
def test_DGIM(N=20000, k=2000, interval=2000):
    dgim = DGIM(N=N)
    counter = 0
    ones = 0
    end = 0
    
    while True:
        
        if (counter % interval) == 0:
            print(f'###################\nReal count: {ones}')
            print(f'DGIM estimate: {dgim.get_count()}')
            print(f'In the last {k} elements: {dgim.get_in_last_k(k)} \n')
        
        if counter == N:
            ones = 0
            counter = 0
        
        x = random.randint(0, 1)
        dgim.process_bit(x)
        
        if x == 1:
            ones += 1
            
        counter += 1   

### Testing the algorithm

In [127]:
test_DGIM(10000, 200, 1000)

###################
Real count: 0
DGIM estimate: 0
In the last 200 elements: 0 

###################
Real count: 506
DGIM estimate: 442
In the last 200 elements: 218 

###################
Real count: 1016
DGIM estimate: 888
In the last 200 elements: 216 

###################
Real count: 1513
DGIM estimate: 1257
In the last 200 elements: 201 

###################
Real count: 2019
DGIM estimate: 1763
In the last 200 elements: 195 

###################
Real count: 2499
DGIM estimate: 1987
In the last 200 elements: 259 

###################
Real count: 3000
DGIM estimate: 2488
In the last 200 elements: 248 

###################
Real count: 3509
DGIM estimate: 2997
In the last 200 elements: 245 

###################
Real count: 3997
DGIM estimate: 3485
In the last 200 elements: 221 

###################
Real count: 4486
DGIM estimate: 3462
In the last 200 elements: 198 

###################
Real count: 4966
DGIM estimate: 3942
In the last 200 elements: 198 

###################
Real count: 

KeyboardInterrupt: 

# Ex 2.2 -> Exponential Decaying Windows

In [65]:
class EDW():
    def __init__(self, c, threshold):
        self.counts = {}
        self.c = c
        self.threshold = threshold
        
        
    def process_stream(self, hashtags):
        
        for item in hashtags:    
            self.counts = {k: v*(1-self.c) for (k,v) in self.counts.items()}
             
            if item in self.counts:
                self.counts[item] += 1
            else:
                self.counts[item] = 1
        
            self.counts = {k: v for (k, v) in self.counts.items() if v > self.threshold}


    def get_top_10(self):
        top_10 = sorted(self.counts.items(), key=lambda x: x[1], reverse=True)[:10]
        print(top_10)
        
        
        

In [17]:
with open('data/mdle_twitter_data/2020.txt') as file:
    data = file.read().splitlines()
    data = [item.split(" ", 1) for item in data]
    data = [tuple((item[0], eval(item[1]))) for item in data]
    
with open('data/mdle_twitter_data/2021.txt') as file:
    data2 = file.read().splitlines()
    data2 = [item.split(" ", 1) for item in data2]
    data2 = [tuple((item[0], eval(item[1]))) for item in data2]
    
final_data = data + data2
del data
del data2
gc.collect()


In [79]:
# Interval should be in hours
# threshold determines how much counts we keep
def test_EDW(c=0.001, threshold=0.5, interval=1):
    
    current_date = datetime.datetime.strptime(final_data[0][0], '%Y-%m-%dT%H:%M:%S.%f%z')
    end_date = current_date + datetime.timedelta(hours=interval)
    
    edw = EDW(c=c, threshold=threshold)
    
    for x in range(len(final_data)):
        
        current_date = datetime.datetime.strptime(final_data[x][0], '%Y-%m-%dT%H:%M:%S.%f%z')
        edw.process_stream(final_data[x][1])
        
        if current_date > end_date:
            print(f'\nTop 10 hashtags at {end_date}: ')
            edw.get_top_10()
            end_date = end_date + datetime.timedelta(hours=interval)

In [80]:
# Interval should be in hours
test_EDW(c=0.001, threshold=0.5, interval=24)


Top 10 hashtags at 2020-02-12 15:14:58+00:00: 

[('COVID19', 73.81177633809325), ('coronavirus', 26.715790462757504), ('coronavírus', 18.54140180986142), ('covid19', 10.941559273570904), ('Coronavirus', 10.43665755191485), ('OMS', 7.419589548632905), ('Coronavírus', 6.09442625376464), ('China', 6.001467398433939), ('G1', 5.732514635784472), ('Covid', 5.379617281581597)]

Top 10 hashtags at 2020-02-13 15:14:58+00:00: 

[('COVID19', 121.78664759706919), ('coronavirus', 49.6044288531308), ('coronavírus', 36.505424958750226), ('covid19', 18.42881715861157), ('Coronavirus', 14.778352668663768), ('COVID', 10.483157426604444), ('China', 10.28428672111158), ('COVID2019', 10.116186533961853), ('Covid19', 8.563966450168566), ('Coronavírus', 7.561197559321979)]

Top 10 hashtags at 2020-02-14 15:14:58+00:00: 

[('COVID19', 135.25980615402838), ('coronavirus', 56.79934164114801), ('coronavírus', 41.53819074552001), ('covid19', 19.617751185134054), ('Coronavirus', 17.337233855274746), ('nCoV2019', 

KeyboardInterrupt: 