In [24]:
# Hashtags to process; each one is used to build a 'data/tweets_<tag>.txt' file name.
hash_tags = [
    '#gohawks',
    '#gopatriots',
    '#nfl',
    '#patriots',
    '#sb49',
    '#superbowl',
]

In [25]:
import json

def getMinAndMaxTs(tag):
    """Return the earliest and latest tweet timestamps stored for a hashtag.

    Reads ``data/tweets_<tag>.txt`` line by line. Every non-blank line is
    parsed as a JSON object and its ``citation_date`` value is compared
    against the running minimum and maximum.

    Args:
        tag: Hashtag string used to build the file name (e.g. ``'#nfl'``).

    Returns:
        A two-element list ``[min_ts, max_ts]``.

    Raises:
        ValueError: If the file contains no tweet lines, so no minimum or
            maximum exists.
    """
    filename = 'data/tweets_'+tag+'.txt'
    # Start from "nothing seen yet" instead of hard-coded sentinel values, so
    # the result is correct whatever range the timestamps fall in.
    min_ts = None
    max_ts = None
    with open(filename) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            timestamp = json.loads(line)['citation_date']
            if min_ts is None or timestamp < min_ts:
                min_ts = timestamp
            if max_ts is None or timestamp > max_ts:
                max_ts = timestamp

    if min_ts is None:
        raise ValueError("No tweets found in " + filename)
    return [min_ts, max_ts]

In [27]:
# Per-hashtag lookup tables: earliest and latest tweet timestamp found in each file.
tagsToMinTs = {}
tagsToMaxTs = {}
for tag in hash_tags:
    # getMinAndMaxTs returns [min_ts, max_ts] for the tag's tweet file.
    earliest_ts, latest_ts = getMinAndMaxTs(tag)
    tagsToMinTs[tag] = earliest_ts
    tagsToMaxTs[tag] = latest_ts

In [28]:
# Run-mode flag. Nothing in the visible cells reads it.
# NOTE(review): presumably chooses between recomputing features and reusing saved pickles - confirm.
MODE = 'FRESH'

import pickle

def save_object(data, fileName):
    """Pickle ``data`` to ``<fileName>.pickle`` using the highest pickle protocol.

    Args:
        data: Any picklable object.
        fileName: Output path without the ``.pickle`` extension.
    """
    target_path = fileName + ".pickle"
    with open(target_path, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_object(fileName):
    """Load and return the object pickled in ``<fileName>.pickle``.

    If the file cannot be opened or read (``IOError``), a message is printed
    and ``None`` is returned instead of raising.

    Args:
        fileName: Input path without the ``.pickle`` extension.

    Returns:
        The unpickled object, or ``None`` when the file could not be read.
    """
    source_path = fileName + ".pickle"
    try:
        with open(source_path, 'rb') as source:
            # NOTE: unpickling can execute arbitrary code; only load files this
            # notebook (or another trusted source) produced.
            return pickle.load(source)
    except IOError:
        print("Could not read file: " + fileName)
    return None

In [33]:
import math
import datetime
import pytz


def getLocalHour(timestamp):
    """Return the hour of day (0-23) in US Pacific time for a Unix timestamp.

    The timestamp is first interpreted as an instant in UTC and then converted
    to the 'America/Los_Angeles' zone. Building a naive datetime with
    ``fromtimestamp(timestamp)`` and calling ``localize`` on it would only
    attach the Pacific label to the *machine's* local wall-clock time, so the
    result would depend on where the kernel runs.

    Args:
        timestamp: Seconds since the Unix epoch.

    Returns:
        Integer hour of the day in Pacific time.
    """
    pst = pytz.timezone('America/Los_Angeles')
    utc_moment = datetime.datetime.fromtimestamp(timestamp, pytz.utc)
    return utc_moment.astimezone(pst).hour

def getWindowNumber(start_ts, curr_ts, window):
    """Return how many windows of length ``window`` are needed to reach ``curr_ts``.

    The elapsed time since ``start_ts`` is divided by the window length and
    rounded up, so ``curr_ts == start_ts`` maps to 0 and any later instant up
    to and including ``start_ts + window`` maps to 1.

    Args:
        start_ts: Timestamp at which window numbering begins.
        curr_ts: Timestamp to classify.
        window: Window length, in the same units as the timestamps.

    Returns:
        The rounded-up number of elapsed windows.
    """
    span = curr_ts - start_ts
    return math.ceil(span / (window * 1.0))

def getFeatures(tag,start_ts,end_ts,window):
    """Build per-window feature rows and next-window tweet-count labels for a hashtag.

    Tweets are read from ``data/tweets_<tag>.txt`` (one JSON object per line).
    Tweets whose ``citation_date`` lies outside ``[start_ts, end_ts]`` are
    ignored. Each remaining tweet is assigned a window number with
    ``getWindowNumber``, i.e. window ``k`` holds timestamps in
    ``(start_ts + (k-1)*window, start_ts + k*window]`` and window 0 holds only
    ``start_ts`` itself.

    One row is produced for every boundary ``period`` in
    ``range(start_ts, end_ts, window)``. For the i-th boundary:

    * the feature row aggregates window ``i`` (the window ending at
      ``period``): tweet count, summed ``metrics.citations.total``, summed
      ``author.followers``, maximum ``author.followers``, and
      ``getLocalHour(period)``;
    * the label is the tweet count of window ``i + 1`` (the window that
      starts at ``period``).

    Aggregates are looked up by the same window numbers they were stored
    under; looking them up by raw timestamp would never match and would
    produce all-zero rows.

    Args:
        tag: Hashtag string used to build the file name.
        start_ts: First window boundary (integer timestamp).
        end_ts: End of the covered range (integer timestamp).
        window: Window length in the same units as the timestamps (integer).

    Returns:
        A tuple ``(features, labels)`` of equal-length lists.
    """
    windowToTweets = {}
    windowToRetweets = {}
    windowToFollowerCount = {}
    windowToMaxFollowers = {}

    filename = 'data/tweets_'+tag+'.txt'
    with open(filename) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            json_object = json.loads(line)
            timestamp = json_object['citation_date']

            if timestamp < start_ts or timestamp > end_ts:
                continue

            key = getWindowNumber(start_ts,timestamp,window)
            retweetCount = json_object['metrics']['citations']['total']
            followerCount = json_object['author']['followers']

            windowToTweets[key] = windowToTweets.get(key, 0) + 1
            windowToRetweets[key] = windowToRetweets.get(key, 0) + retweetCount
            windowToFollowerCount[key] = windowToFollowerCount.get(key, 0) + followerCount
            windowToMaxFollowers[key] = max(windowToMaxFollowers.get(key, 0), followerCount)

    # The file is no longer needed once the per-window aggregates are built.
    features = []
    labels = []
    for key, period in enumerate(range(start_ts,end_ts,window)):
        # `key` is the window number of the boundary `period`: the i-th
        # boundary is exactly i windows after start_ts.
        feature = [
            windowToTweets.get(key, 0),
            windowToRetweets.get(key, 0),
            windowToFollowerCount.get(key, 0),
            windowToMaxFollowers.get(key, 0),
            getLocalHour(period),
        ]
        features.append(feature)

        # Target: number of tweets in the following window.
        labels.append(windowToTweets.get(key + 1, 0))

    return features,labels

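The data is split into three time periods, each aggregated with its own window size (all times are Pacific Standard Time on Feb. 1, 2015):

1. Before Feb. 1, 8:00 a.m. (timestamp 1422806400): 1-hour windows
2. Between Feb. 1, 8:00 a.m. and 8:00 p.m. (timestamps 1422806400 to 1422849600): 5-minute windows
3. After Feb. 1, 8:00 p.m. (timestamp 1422849600): 1-hour windows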


In [None]:
# Time period 1: everything up to the fixed end timestamp, in 1-hour windows.
tp1_window_size = 3600 # 1 hour window size
tp1_end_ts = 1422806400
for tag in hash_tags:
    print("Started {}".format(tag))
    # Round the tag's earliest timestamp down to a whole-window boundary.
    tp1_start_ts = tp1_window_size * math.floor(tagsToMinTs[tag]/(tp1_window_size*1.0))
    features,labels = getFeatures(tag,tp1_start_ts,tp1_end_ts,tp1_window_size)
    print(len(features))
    # Persist both lists so later cells can reload them without recomputing.
    save_object(features, "tp1_features_{}".format(tag))
    save_object(labels, "tp1_labels_{}".format(tag))
    print("Finished {}".format(tag))

Started #gohawks
440
Finished #gohawks
Started #gopatriots
439
Finished #gopatriots
Started #nfl
440
Finished #nfl
Started #patriots
440
Finished #patriots
Started #sb49
436
Finished #sb49
Started #superbowl


In [None]:
# Time period 2 (tp2): fixed start and end timestamps, in 5-minute windows.
tp2_window_size = 300 # 5 minute window size
tp2_start_ts = 1422806400
tp2_end_ts = 1422849600
for tag in hash_tags:
    print("Started {}".format(tag))
    features,labels = getFeatures(tag,tp2_start_ts,tp2_end_ts,tp2_window_size)
    # Persist both lists so later cells can reload them without recomputing.
    save_object(features, "tp2_features_{}".format(tag))
    save_object(labels, "tp2_labels_{}".format(tag))
    print("Finished {}".format(tag))

In [21]:
# Time period 3 (tp3): from the fixed start timestamp onwards, in 1-hour windows.
tp3_window_size = 3600 # 1 hour window size
tp3_start_ts = 1422849600
for tag in hash_tags:
    print("Started {}".format(tag))
    # Round the tag's latest timestamp up to a whole-window boundary.
    tp3_end_ts = tp3_window_size * math.ceil(tagsToMaxTs[tag]/(tp3_window_size*1.0))
    features,labels = getFeatures(tag,tp3_start_ts,tp3_end_ts,tp3_window_size)
    # Persist both lists so later cells can reload them without recomputing.
    save_object(features, "tp3_features_{}".format(tag))
    save_object(labels, "tp3_labels_{}".format(tag))
    print("Finished {}".format(tag))

43200