## Import

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import sys
import json
from datetime import datetime
from tqdm import tqdm
import pytz
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Training and Testing directories
training_dir = os.path.join("Datasets", "Training")
testing_dir = os.path.join("Datasets", "Testing")
if not os.path.isdir(training_dir):
    raise Exception("ERROR: training dataset not found")
if not os.path.isdir(testing_dir):
    raise Exception("ERROR: testing dataset not found")

In [3]:
# iterate over all hashtag files 
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        print(os.path.join(root, file))

Datasets/Training/tweets_#gohawks.txt
Datasets/Training/tweets_#gopatriots.txt
Datasets/Training/tweets_#nfl.txt
Datasets/Training/tweets_#patriots.txt
Datasets/Training/tweets_#sb49.txt
Datasets/Training/tweets_#superbowl.txt


In [4]:
# Initialize dictionaries and Unix times for Feb 1, 8 am and Feb 1, 8 pm. 
# Dictionary keys: hashtag.
# Dictionary values: [time of tweet (Unix), number of retweets for tweet, number of followers for tweeter]
# Each row in dictionary value is an individual tweet.

hashtag_dict_before = {}
hashtag_dict_during = {}
hashtag_dict_after = {}
start_unix_time = 1422806400 # 8 am, Feb 1, PST
end_unix_time = 1422849600 # 8 pm, Feb 1, PST
pst_tz = pytz.timezone('America/Los_Angeles')


In [5]:
""" Parse files to get necessary data """

for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        filename = os.path.splitext(file)[0].replace('tweets_#', '')
        print('Parsing {}...'.format(filename))
        
        hashtag_dict_before[filename] = []
        hashtag_dict_during[filename] = []
        hashtag_dict_after[filename] = []
        
        # open the file and read all lines:
        with open(os.path.join(root, file), "r", encoding="utf-8") as hashtag:
            # read line-by-line
            for line in hashtag:
                json_obj = json.loads(line)
                
                # get desired statistics
                citation_date = json_obj['citation_date'] # Unix time
                num_retweets = json_obj['metrics']['citations']['total'] # Number of retweets for this tweet
                num_followers = json_obj['author']['followers'] # Number of followers for tweeter
                
                # Check when tweet was made and add it to corresponding dictionary
                if citation_date < start_unix_time:
                    hashtag_dict_before[filename].append([citation_date, num_retweets, num_followers])
                elif citation_date > end_unix_time:
                    hashtag_dict_after[filename].append([citation_date, num_retweets, num_followers])
                else:
                    hashtag_dict_during[filename].append([citation_date, num_retweets, num_followers])
    print('done')

Parsing gohawks...
Parsing gopatriots...
Parsing nfl...
Parsing patriots...
Parsing sb49...
Parsing superbowl...
done


## Organize Data

##### Variables:
<span>
key = one of the hashtags <br \> <br \>
</span>

<span>
data_hashtag_before[key] = data before 2/1 8am, split into 1-hour windows (separated by hashtag) <br \>
data_hashtag_during[key] = data between 2/1 8am and 8pm, split into 5-min windows (separated by hashtag) <br \>
data_hashtag_after[key] = data after 2/1 8pm, split into 1-hour windows (separated by hashtag) <br \> <br \>
</span>

<span>
data_aggregate_before = data before 2/1 8am, split into 1-hour windows (all hashtags combined) <br \>
data_aggregate_during = data between 2/1 8am and 8pm, split into 5-min windows (all hashtags combined) <br \>
data_aggregate_after = data after 2/1 8pm, split into 1-hour windows (all hashtags combined) <br \> <br \>
</span>

<span>
data_hashtag_all[key] = all data, split into 1-hour windows (separated by hashtag) <br \> <br \>
</span>

<span>
data_all = all data, split into 1-hour windows (all hashtags combined) <br \>
</span>

In [6]:
# Explicitly list hashtags. 
# Convert each value in dictionary to numpy arrays.

hashtags = ['gohawks', 'gopatriots', 'nfl', 'patriots', 'sb49', 'superbowl']

for key in hashtags:
    hashtag_dict_before[key] = np.array(hashtag_dict_before[key])
    hashtag_dict_during[key] = np.array(hashtag_dict_during[key])
    hashtag_dict_after[key] = np.array(hashtag_dict_after[key])

In [7]:
# Find how many time windows there are

ftt = int(np.min([np.min(hashtag_dict_before[key][:,0]) for key in hashtags])) # first tweet time
ltt = int(np.max([np.max(hashtag_dict_after[key][:,0]) for key in hashtags])) # last tweet time

num_windows_before = int(np.max([((start_unix_time - ftt) // 3600) + 1 for key in hashtags]))
num_windows_during = int(np.max([((end_unix_time - start_unix_time) // 3600 * 12) for key in hashtags]))
num_windows_after = int(np.max([((ltt - end_unix_time) // 3600) + 1 for key in hashtags]))


In [8]:
""" Organize data into specific time periods:
     before 2/1 8am with 1-hour windows, 
     between 2/1 8am and 2/1 8pm with 5-min windows,
     and after 2/1 8pm with 1-hour windows """

# Initialize dictionary for each time frame.
data_hashtag_before = {}
data_hashtag_during = {}
data_hashtag_after = {}

# Iterate through each hashtag.
for key in hashtags:
    print(key)
    
    # Rename the dictionary value for readability
    temp_before = hashtag_dict_before[key]
    temp_during = hashtag_dict_during[key]
    temp_after = hashtag_dict_after[key]
    
    data_hashtag_before[key] = np.zeros((num_windows_before, 5)) # Initialize array: rows = time window, columns = feature
    num_followers_before = {} # Initialize dictionary to count # of followers for each tweet
    
    
    
    # Iterate through all elements before start time
    for i in range(np.shape(temp_before)[0]):
        # Get row number
        item_before = int(num_windows_before - 1 - ((start_unix_time - temp_before[i,0] - 1) // 3600))
        # Update first 3 elements (# of tweets, total # retweets, total # followers)
        data_hashtag_before[key][item_before] += np.array([1, int(temp_before[i, 1]), int(temp_before[i, 2]), 0, 0])
        # Get time of day (hour)
        dt_obj_pst = datetime.fromtimestamp(temp_before[i,0], pst_tz)
        data_hashtag_before[key][item_before][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        # Get number of followers
        if item_before not in num_followers_before.keys():
            num_followers_before[item_before] = []
        num_followers_before[item_before].append(temp_before[i,2])
    for i in num_followers_before.keys():
        data_hashtag_before[key][i][3] = np.max(num_followers_before[i])
        
        
    # Iterate through all elements during time
    data_hashtag_during[key] = np.zeros((num_windows_during, 5))
    num_followers_during = {}
    for i in range(np.shape(temp_during)[0]):
        item_during = int(((temp_during[i,0] - start_unix_time) * 12) // 3600)
        data_hashtag_during[key][item_during] += np.array([1, int(temp_during[i, 1]), int(temp_during[i, 2]), 0, 0])
        dt_obj_pst = datetime.fromtimestamp(temp_during[i,0], pst_tz)
        data_hashtag_during[key][item_during][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_during not in num_followers_during.keys():
            num_followers_during[item_during] = []
        num_followers_during[item_during].append(temp_during[i,2])
    for i in num_followers_during.keys():
        data_hashtag_during[key][i][3] = np.max(num_followers_during[i])
        
    # Iterate through all elements after end time
    data_hashtag_after[key] = np.zeros((num_windows_after, 5))
    num_followers_after = {}
    for i in range(np.shape(temp_after)[0]):
        item_after = int((temp_after[i,0] - end_unix_time) // 3600)
        data_hashtag_after[key][item_after] += np.array([1, int(temp_after[i, 1]), int(temp_after[i, 2]), 0, 0])
        dt_obj_pst = datetime.fromtimestamp(temp_after[i,0], pst_tz)
        data_hashtag_after[key][item_after][4] = int(datetime.strftime(dt_obj_pst, '%H'))
        
        if item_after not in num_followers_after.keys():
            num_followers_after[item_after] = []
        num_followers_after[item_after].append(temp_after[i,2])
    for i in num_followers_after.keys():
        data_hashtag_after[key][i][3] = np.max(num_followers_after[i])
        
print('done')

gohawks
gopatriots
nfl
patriots
sb49
superbowl
done


In [9]:
""" Aggregate data within each time period by combining all hashtags. """

# Initialize aggregated data variables
data_aggregate_before = np.zeros([num_windows_before, 5])
data_aggregate_during = np.zeros([num_windows_during, 5])
data_aggregate_after = np.zeros([num_windows_after, 5])

# Sum the # of tweets, total # of retweets, and # of followers
for key in hashtags:
    data_aggregate_before[:,0:3] += data_hashtag_before[key][:,0:3]
    data_aggregate_during[:,0:3] += data_hashtag_during[key][:,0:3]
    data_aggregate_after[:,0:3] += data_hashtag_after[key][:,0:3]
# Find the max # of followers for each
data_aggregate_before[:,3] = np.amax([data_hashtag_before[key][:,3] for key in hashtags], axis=0)
data_aggregate_during[:,3] = np.amax([data_hashtag_during[key][:,3] for key in hashtags], axis=0)
data_aggregate_after[:,3] = np.amax([data_hashtag_after[key][:,3] for key in hashtags], axis=0)

# Copy over the same time frames
data_aggregate_before[:,4] = data_hashtag_before['superbowl'][:,4]
data_aggregate_during[:,4] = data_hashtag_during['superbowl'][:,4]
data_aggregate_after[:,4] = data_hashtag_after['superbowl'][:,4]

In [10]:
""" Get data for the whole time frame with 1-hour windows, separated by hashtag """

# Initialize dictionary to store data.
# Key: hashtag
# Value: data separated by 1-hour time windows
data_hashtag_all = {}

for key in hashtags: # Iterate through all hashtags
    temp_during = np.zeros([12, 5]) # Initialize array to store data in the middle time period
    # Combine data in the middle time period
    for i in range(np.shape(data_hashtag_during[key])[0]):
        hour = int(data_hashtag_during[key][i,4] - 8)
        temp_during[hour, :3] += data_hashtag_during[key][i, :3]
        if not i % 12:
            temp_during[hour, 3] = np.max(data_hashtag_during[key][i:(i+12), 3])
            temp_during[hour, 4] = data_hashtag_during[key][i,4]
    data_hashtag_all[key] = np.vstack((data_hashtag_before[key], temp_during, data_hashtag_after[key]))

In [11]:
""" Combine data for the whole time frame from all hashtags """

data_all = np.zeros([587, 5])
for key in hashtags:
    data_all += data_hashtag_all[key]

## Neural Network

In [12]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [13]:
# Define layer sizes.
layer_sizes = [(50, 50), (100, 100), (100, 100, 100), (100, 100, 100, 100), 10*(50,), 10*(100,)]

In [14]:
# Function to get MSE for a given neural network model.

def analyze_nn(nn, X, y):
    mses_per_fold = []
    mses_per_fold_train = []
    kf = KFold(10)
    for trainset, testset in kf.split(X):
        X_train, y_train = X[trainset], y[trainset]
        X_test, y_test = X[testset], y[testset]
        nn.fit(X_train, y_train)
        predicted = nn.predict(X_test)
        mses_per_fold.append(mean_squared_error(y_test, predicted))
        mses_per_fold_train.append(mean_squared_error(y_train, nn.predict(X_train)))
    #     print(mean_squared_error(y_test, predicted))
    avg_mse = np.mean(mses_per_fold)
    avg_mse_train = np.mean(mses_per_fold_train)
    print('Layer size {} MSE:\n   val = {} \n train = {}'.format(size, np.around(avg_mse, 2), np.around(avg_mse_train, 2)))
    
    return avg_mse, avg_mse_train

#### No preprocessing

In [15]:
# Analyze tweets before start time.

# Define train data and targets
y = data_all[1:,0] # Number of tweets (except first)
X = np.delete(data_all, -1, 0) # Delete last row

# X = X_before
# y = y_before

print('X shape:', X.shape)
print('y shape:', y.shape)

mses = []
mses_train = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse, avg_mse_train = analyze_nn(nn, X, y)
    mses.append(avg_mse)
    mses_train.append(avg_mse_train)

X shape: (586, 5)
y shape: (586,)
Layer size (50, 50) MSE:
   val = 59745874037.11 
 train = 9108725423.43
Layer size (100, 100) MSE:
   val = 3508811621.05 
 train = 281766647889.0
Layer size (100, 100, 100) MSE:
   val = 10658749042.71 
 train = 30772545538.35
Layer size (100, 100, 100, 100) MSE:
   val = 8591186972.54 
 train = 8205157949.7
Layer size (50, 50, 50, 50, 50, 50, 50, 50, 50, 50) MSE:
   val = 692571491.51 
 train = 8027511120.38
Layer size (100, 100, 100, 100, 100, 100, 100, 100, 100, 100) MSE:
   val = 1038210307.55 
 train = 2809171097.29


In [16]:
print(np.sqrt(mses))

[244429.69139838  59235.22280747 103241.21775098  92688.65611571
  26316.75305792  32221.27104179]


#### Standard scaler preprocessing

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()

In [19]:
y = data_all[1:,0] # Number of tweets (except first)
X = np.delete(data_all, -1, 0) # Delete last row

X = scaler.fit_transform(X) # Transform data
y = y

print('X shape:', X.shape)
print('y shape:', y.shape)

mses = []
mses_train = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse, avg_mse_train = analyze_nn(nn, X, y)
    mses.append(avg_mse)
    mses_train.append(avg_mse_train)

X shape: (586, 5)
y shape: (586,)
Layer size (50, 50) MSE:
   val = 780638931.66 
 train = 644161652.92
Layer size (100, 100) MSE:
   val = 662703464.28 
 train = 376914873.93
Layer size (100, 100, 100) MSE:
   val = 261528210.03 
 train = 231483313.9
Layer size (100, 100, 100, 100) MSE:
   val = 319585942.97 
 train = 189917573.43
Layer size (50, 50, 50, 50, 50, 50, 50, 50, 50, 50) MSE:
   val = 677650603.25 
 train = 109547125.55
Layer size (100, 100, 100, 100, 100, 100, 100, 100, 100, 100) MSE:
   val = 697102371.67 
 train = 70324031.67


In [20]:
print(np.sqrt(mses))

[27939.91645767 25743.02748859 16171.83384873 17876.96682795
 26031.7230173  26402.69629553]


In [21]:
y = data_all[1:,0] # Number of tweets (except first)
X = np.delete(data_all, -1, 0) # Delete last row

X = scaler.fit_transform(X) # Transform data
y = y

layer_sizes = [(50,50,50), (100,100,100), (150,150,150), (200,200,200)]

print('X shape:', X.shape)
print('y shape:', y.shape)

mses = []
mses_train = []
for size in layer_sizes:
    nn = MLPRegressor(hidden_layer_sizes=size, activation='relu', solver='adam', alpha=0.001)

    avg_mse, avg_mse_train = analyze_nn(nn, X, y)
    mses.append(avg_mse)
    mses_train.append(avg_mse_train)

X shape: (586, 5)
y shape: (586,)
Layer size (50, 50, 50) MSE:
   val = 490386407.23 
 train = 246093184.54
Layer size (100, 100, 100) MSE:
   val = 262890774.44 
 train = 230559564.49
Layer size (150, 150, 150) MSE:
   val = 270196558.47 
 train = 219096771.69
Layer size (200, 200, 200) MSE:
   val = 278820576.41 
 train = 207854681.98


In [22]:
print(np.sqrt(mses))

[22144.66995076 16213.90682217 16437.65672081 16697.92131998]


#### Grid search

In [13]:
from sklearn.model_selection import GridSearchCV

In [17]:
y_before = data_aggregate_before[1:,0] # Number of tweets (except first)
X_before = np.delete(data_aggregate_before, -1, 0) # Delete last row

X = scaler.fit_transform(X_before) # Transform data
y = y_before

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,), (300,),
                           2*(50,), 2*(100,), 2*(150,), 2*(200,), 2*(250,), 2*(300,), 
                           3*(50,), 3*(100,), 3*(150,), 3*(200,), 3*(250,), 3*(300,),
                           4*(50,), 4*(100,), 4*(150,), 4*(200,), 4*(250,), 4*(300,),
                           5*(50,), 5*(100,), 5*(150,), 5*(200,), 5*(250,), 5*(300,)]
}

nn = MLPRegressor(activation='relu', solver='adam', alpha=0.001)
clf_before = GridSearchCV(nn, param_grid, cv=5, scoring='neg_mean_squared_error')
clf_before.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,), (300,), (50, 50), (100, 100), (150, 150), (200, 200), (250, 250), (300, 300), (50, 50, 50), (100, 100, 100), (150, 150, 150), (200, 200, 200), (250, 250, 250), (300, 300, 300), (50, 50, 50, 50), (100, 100, 100, 100), (150, 150...0, 150, 150, 150), (200, 200, 200, 200, 200), (250, 250, 250, 250, 250), (300, 300, 300, 300, 300)]},
       pre_dispa

In [18]:
print(clf_before.best_params_)
print(clf_before.best_score_)

{'hidden_layer_sizes': (200, 200)}
-4680478.984374924


In [19]:
y_during = data_aggregate_during[1:,0] # Number of tweets (except first)
X_during = np.delete(data_aggregate_during, -1, 0) # Delete last row

X = scaler.fit_transform(X_during) # Transform data
y = y_during

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,), (300,),
                           2*(50,), 2*(100,), 2*(150,), 2*(200,), 2*(250,), 2*(300,), 
                           3*(50,), 3*(100,), 3*(150,), 3*(200,), 3*(250,), 3*(300,),
                           4*(50,), 4*(100,), 4*(150,), 4*(200,), 4*(250,), 4*(300,),
                           5*(50,), 5*(100,), 5*(150,), 5*(200,), 5*(250,), 5*(300,)]
}

nn = MLPRegressor(activation='relu', solver='adam', alpha=0.001)
clf_during = GridSearchCV(nn, param_grid, cv=5, scoring='neg_mean_squared_error')
clf_during.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,), (300,), (50, 50), (100, 100), (150, 150), (200, 200), (250, 250), (300, 300), (50, 50, 50), (100, 100, 100), (150, 150, 150), (200, 200, 200), (250, 250, 250), (300, 300, 300), (50, 50, 50, 50), (100, 100, 100, 100), (150, 150...0, 150, 150, 150), (200, 200, 200, 200, 200), (250, 250, 250, 250, 250), (300, 300, 300, 300, 300)]},
       pre_dispa

In [20]:
print(clf_during.best_params_)
print(clf_during.best_score_)

{'hidden_layer_sizes': (250, 250, 250, 250, 250)}
-37522544.31218094


In [21]:
y_after = data_aggregate_after[1:,0] # Number of tweets (except first)
X_after = np.delete(data_aggregate_after, -1, 0) # Delete last row

X = scaler.fit_transform(X_after) # Transform data
y = y_after

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,), (300,),
                           2*(50,), 2*(100,), 2*(150,), 2*(200,), 2*(250,), 2*(300,), 
                           3*(50,), 3*(100,), 3*(150,), 3*(200,), 3*(250,), 3*(300,),
                           4*(50,), 4*(100,), 4*(150,), 4*(200,), 4*(250,), 4*(300,),
                           5*(50,), 5*(100,), 5*(150,), 5*(200,), 5*(250,), 5*(300,)]
}

nn = MLPRegressor(activation='relu', solver='adam', alpha=0.001)
clf_after = GridSearchCV(nn, param_grid, cv=5, scoring='neg_mean_squared_error')
clf_after.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,), (300,), (50, 50), (100, 100), (150, 150), (200, 200), (250, 250), (300, 300), (50, 50, 50), (100, 100, 100), (150, 150, 150), (200, 200, 200), (250, 250, 250), (300, 300, 300), (50, 50, 50, 50), (100, 100, 100, 100), (150, 150...0, 150, 150, 150), (200, 200, 200, 200, 200), (250, 250, 250, 250, 250), (300, 300, 300, 300, 300)]},
       pre_dispa

In [22]:
print(clf_after.best_params_)
print(clf_after.best_score_)

{'hidden_layer_sizes': (150, 150, 150, 150, 150)}
-900373.8765929521


## Load test data

In [30]:
testing_dir = os.path.join("Datasets", "Testing")
pst_tz = pytz.timezone('America/Los_Angeles')

# iterate over all hashtag files 
for root, dirs, files in os.walk(testing_dir, topdown=False):
    for file in files:
        print(os.path.join(root, file))

Datasets/Testing/sample0_period1.txt
Datasets/Testing/sample0_period2.txt
Datasets/Testing/sample0_period3.txt
Datasets/Testing/sample1_period1.txt
Datasets/Testing/sample1_period2.txt
Datasets/Testing/sample1_period3.txt
Datasets/Testing/sample2_period1.txt
Datasets/Testing/sample2_period2.txt
Datasets/Testing/sample2_period3.txt


In [31]:
""" Parse files to get necessary data """

data_test_tweets = {}

for root, dirs, files in os.walk(testing_dir, topdown=False):
    for file in files:
        filename = os.path.splitext(file)[0].replace('tweets_#', '')
        print('Parsing {}...'.format(filename))
        
        data_test_tweets[filename] = []
        
        # open the file and read all lines:
        with open(os.path.join(root, file), "r", encoding="utf-8") as hashtag:
            # read line-by-line
            for line in hashtag:
                json_obj = json.loads(line)
                
                # get desired statistics
                citation_date = json_obj['citation_date'] # Unix time
                num_retweets = json_obj['metrics']['citations']['total'] # Number of retweets for this tweet
                num_followers = json_obj['author']['followers'] # Number of followers for tweeter
                
                data_test_tweets[filename].append([citation_date, num_retweets, num_followers])

    print('done')

Parsing sample0_period1...
Parsing sample0_period2...
Parsing sample0_period3...
Parsing sample1_period1...
Parsing sample1_period2...
Parsing sample1_period3...
Parsing sample2_period1...
Parsing sample2_period2...
Parsing sample2_period3...
done


In [32]:
# Explicitly list out file names
filenames = ['sample0_period1', 'sample0_period2', 'sample0_period3', 
             'sample1_period1', 'sample1_period2', 'sample1_period3', 
             'sample2_period1', 'sample2_period2', 'sample2_period3']

# Make each value a numpy array
for key in filenames:
    data_test_tweets[key] = np.array(data_test_tweets[key])

In [33]:
""" Organize the test set by file name into the correct format. Each value is n x 5, 
    where n is the number of time windows, and 5 is the number of features (same as above). """

# Initialize dictionary. Key is file name, value is a numpy array.
data_test = {}
for key in filenames: # Iterate through each test file
    print(key)
    temp = data_test_tweets[key] # Rename temp for simplicity
    data_test[key] = np.zeros((6,5)) # Initialize array as zeros
    
    ftt = np.min(temp[:,0]) # Find (Unix) time of first tweet
    start_time = ftt - (ftt % 3600) # Find the hour of the first tweet.
    num_followers = {} # Initialize dictionary, to be used later to find max number of followers.
    
    for i in range(np.shape(temp)[0]): # Iterate through all tweets in specific file. 
        if int(key[-1]) == 2: # Check if the file is tweets from period 2.
            item = int(((temp[i,0] - start_time) * 12) // 3600 - 6) # Find index number
        else: # Period 1 or 3
            item = int((temp[i,0] - start_time) // 3600)
        data_test[key][item] += np.array([1, temp[i,1], temp[i,2], 0, 0]) # Update first 3 elements

        dt_obj_pst = datetime.fromtimestamp(temp[i,0], pst_tz) 
        data_test[key][item][4] = int(datetime.strftime(dt_obj_pst, '%H')) # Find hour of tweet time
        if item not in num_followers.keys():
            num_followers[item] = []
        num_followers[item].append(temp[i,2])
    for i in num_followers.keys():
        data_test[key][i][3] = np.max(num_followers[i]) # Update max number of followers.
        
print('done')

sample0_period1
sample0_period2
sample0_period3
sample1_period1
sample1_period2
sample1_period3
sample2_period1
sample2_period2
sample2_period3
done


## Analyze test set

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [35]:
""" Perform regression on the test set to predict the number 
    of tweets in the next time window (either 1 hour or 5 minutes) """

for key in filenames: # Iterate through all files
    print(key)
    lr = LinearRegression() # Instantiate a linear regressor
    knr = KNeighborsRegressor(n_neighbors=6) # Instantiate a k nearest neighbors regressor
    
    # Create train set and train labels
    if key[-1] == '1': # Check if file is from period 1
        y = data_aggregate_before[1:, 0] # Get # of tweets for next time window
        X = np.delete(data_aggregate_before, -1, 0) # Get all training points except last row
    elif key[-1] == '2': # Check f file is from period 2
        y = data_aggregate_during[1:, 0]
        X = np.delete(data_aggregate_during, -1, 0)
    else: # Else file is from period 3
        y = data_aggregate_after[1:, 0]
        X = np.delete(data_aggregate_after, -1, 0)
    
    lr.fit(X, y)
    knr.fit(X, y)
    predicted_lr = lr.predict(data_test[key])
    predicted_knr = knr.predict(data_test[key])
    print('True:', data_test[key][:,0])
    print('Linear Regression:', np.around(predicted_lr, 2))
    print('K Neighbors:', np.around(predicted_knr, 2))
    print('LR MSE:', mean_squared_error(predicted_lr, data_test[key][:,0]))
    print('KNR MSE:', mean_squared_error(predicted_knr, data_test[key][:,0]))
    print(' ')

sample0_period1
True: [ 52.  79.  94. 101. 122. 120.]
Linear Regression: [486.95 462.29 504.15 532.9  519.66 446.83]
K Neighbors: [200.   381.33 319.   961.17 415.33 940.33]
LR MSE: 159301.9175395098
KNR MSE: 277135.3935185185
 
sample0_period2
True: [3472. 3834. 2258. 1455. 1235. 1123.]
Linear Regression: [5402.06 5620.64 4021.69 3120.12 2993.65 2834.92]
K Neighbors: [ 8434.5  11071.67  3295.5   2096.5   2096.5   2096.5 ]
LR MSE: 3137326.184178239
KNR MSE: 13364673.004629627
 
sample0_period3
True: [59. 48. 94. 45. 77. 87.]
Linear Regression: [514.26 344.25 355.09 302.99 314.56 290.17]
K Neighbors: [671.    35.83  35.83  35.83 173.17  35.83]
LR MSE: 87910.63718106104
KNR MSE: 65004.24537037036
 
sample1_period1
True: [203. 180. 202. 294. 555. 846.]
Linear Regression: [ 591.83  567.96  576.28  657.88  824.76 1038.6 ]
K Neighbors: [ 216.33  289.33  519.33 1011.83  724.67  972.17]
LR MSE: 114009.27578329499
KNR MSE: 112136.91666666667
 
sample1_period2
True: [960. 995. 870. 960. 861. 903

In [36]:
""" Organize training data to combine 6 time windows. There will be 30 features (6 x original 5). """

# Time period 1 (before Feb 1, 8am)
data_aggregate_before_6x = np.zeros((data_aggregate_before.shape[0] // 6, 30))
y_before_6x = np.zeros(data_aggregate_before.shape[0] // 6)
try: # Catch IndexError, if number of windows is not divisible by 6
    for i in range(data_aggregate_before.shape[0]): # Iterate through all rows in original aggregated set
        data_aggregate_before_6x[i//6, 5*(i%6):5*(i%6)+5] = data_aggregate_before[i] # Update 5 elements 
        y_before_6x[i//6] += data_aggregate_before[i,0] # Get corresponding labels (# of tweets)
except IndexError:
    pass

# Time period 2
data_aggregate_during_6x = np.zeros((data_aggregate_during.shape[0] // 6, 30))
y_during_6x = np.zeros(data_aggregate_during.shape[0] // 6)
try:
    for i in range(data_aggregate_during.shape[0]):
        data_aggregate_during_6x[i//6, 5*(i%6):5*(i%6)+5] = data_aggregate_during[i]
        y_during_6x[i//6] += data_aggregate_during[i,0]
except IndexError:
    pass

# Time period 3
data_aggregate_after_6x = np.zeros((data_aggregate_after.shape[0] // 6, 30))
y_after_6x = np.zeros(data_aggregate_after.shape[0] // 6)
try:
    for i in range(data_aggregate_after.shape[0]):
        data_aggregate_after_6x[i//6, 5*(i%6):5*(i%6)+5] = data_aggregate_after[i]
        y_after_6x[i//6] += data_aggregate_after[i,0]
except IndexError:
    pass

In [37]:
# Reshape test set into corresponding shape
data_test_6x = {}
for key in filenames:
    data_test_6x[key] = np.reshape(data_test[key], (1,30))

In [38]:
""" Perform regression on the test set to predict the total number of tweets in the 6x window. """

for key in filenames:
    print(key)
    lr = LinearRegression()
    knr = KNeighborsRegressor(n_neighbors=6)
        
    if key[-1] == '1':
        lr.fit(data_aggregate_before_6x, y_before_6x)
        knr.fit(data_aggregate_before_6x, y_before_6x)
    elif key[-1] == '2':
        lr.fit(data_aggregate_during_6x, y_during_6x)
        knr.fit(data_aggregate_during_6x, y_during_6x)
    else:
        lr.fit(data_aggregate_after_6x, y_after_6x)
        knr.fit(data_aggregate_after_6x, y_after_6x)
    predicted_lr = lr.predict(data_test_6x[key])
    predicted_knr = knr.predict(data_test_6x[key])

    print('True:', np.sum(data_test[key][:,0]))
    print('Linear Regressor:', np.around(predicted_lr, 2))
    print('K Neighbors:', np.around(predicted_knr, 2))
    print(' ')

sample0_period1
True: 568.0
Linear Regressor: [568.]
K Neighbors: [2800.5]
 
sample0_period2
True: 13377.0
Linear Regressor: [13407.92]
K Neighbors: [21915.67]
 
sample0_period3
True: 410.0
Linear Regressor: [436.2]
K Neighbors: [3258.]
 
sample1_period1
True: 2280.0
Linear Regressor: [2280.]
K Neighbors: [3549.83]
 
sample1_period2
True: 5549.0
Linear Regressor: [5570.54]
K Neighbors: [20490.5]
 
sample1_period3
True: 305.0
Linear Regressor: [432.34]
K Neighbors: [3073.67]
 
sample2_period1
True: 953.0
Linear Regressor: [953.]
K Neighbors: [1810.33]
 
sample2_period2
True: 152.0
Linear Regressor: [187.62]
K Neighbors: [10814.17]
 
sample2_period3
True: 399.0
Linear Regressor: [473.28]
K Neighbors: [3409.]
 
