In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")

import itertools
from collections import Counter

import nltk
import ujson as json
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from numpy import linalg as LA
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline

# Question 9

In [2]:
# Hashtag dump files analysed in this notebook (each line of a file is one JSON tweet).
filenames = [
    "tweets_#gohawks.txt",
    "tweets_#gopatriots.txt",
    "tweets_#nfl.txt",
    "tweets_#patriots.txt",
    "tweets_#sb49.txt",
    "tweets_#superbowl.txt",
]

# Per-file summary statistics, keyed by file name; filled in by get_metrics().
metrics = dict()

In [3]:
def get_metrics(file, data_dir = "ECE219_tweet_data/"):
    """Parse one hashtag dump and record its summary statistics.

    Each non-blank line of the file is decoded as a JSON tweet. The fields read
    are ``author.followers``, ``metrics.citations.total`` and ``citation_date``
    (the latter is treated as a timestamp in seconds when computing hourly rates).

    Side effect: stores
    ``[avg_tweets_hour, avg_followers_tweet, avg_retweets_tweet, max_date, min_date]``
    in the module-level ``metrics`` dict under ``file``.

    Parameters
    ----------
    file : str
        File name inside ``data_dir``.
    data_dir : str
        Directory holding the tweet dumps.

    Returns
    -------
    list of dict
        The parsed tweets, in file order.

    Raises
    ------
    ValueError
        If the file contains no tweets, since the averages are then undefined.
    """
    import os

    path = os.path.join(data_dir, file)
    tweets = []
    total_followers = 0
    total_retweets = 0
    max_date = 0
    min_date = float('inf')

    # Stream the file line by line instead of readlines(), so the raw text of a
    # large dump is never held in memory alongside the parsed tweets.
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            tweet = json.loads(line)
            tweets.append(tweet)
            total_followers += tweet["author"]["followers"]
            total_retweets += tweet["metrics"]["citations"]["total"]
            min_date = min(min_date, tweet["citation_date"])
            max_date = max(max_date, tweet["citation_date"])

    if not tweets:
        raise ValueError("no tweets found in " + path)

    span = max_date - min_date
    if span > 0:
        avg_tweets_hour = len(tweets) / span
        avg_tweets_hour *= 60 * 60
    else:
        # All tweets share one timestamp: an hourly rate is undefined.
        avg_tweets_hour = float("nan")
    avg_followers_tweet = total_followers / len(tweets)
    avg_retweets_tweet = total_retweets / len(tweets)

    # get_hours() reads max_date/min_date back from the last two slots.
    metrics[file] = [avg_tweets_hour, avg_followers_tweet, avg_retweets_tweet, max_date, min_date]
    return tweets

def get_hours(file):
    """Count tweets per one-hour bin for an already-loaded file.

    Reads the parsed tweets from ``tweet_dict[file]`` and the earliest timestamp
    from ``metrics[file][-1]`` (both populated by ``get_metrics``). Bins are
    numbered from 1, with bin 1 starting at the earliest tweet.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (hour bin number) and ``count`` (tweets in that bin),
        sorted by hour. Hours with no tweets have no row.
    """
    tweets = tweet_dict[file]
    min_date = metrics[file][-1]
    seconds_per_hour = 60 * 60
    which_hour = [np.floor((t["citation_date"] - min_date) / seconds_per_hour) + 1 for t in tweets]
    count = Counter(which_hour)
    # Build both columns explicitly so an empty input still yields "index"/"count".
    df = pd.DataFrame({"index": list(count.keys()), "count": list(count.values())})
    # Counter keeps first-seen order; sort so row position follows time, because
    # the plotting cell draws the bars in row order.
    return df.sort_values("index").reset_index(drop = True)

In [4]:
# Parsed tweets per file (file name -> list of tweet dicts), filled by the loading cells.
tweet_dict = dict()

In [6]:
# Load the first three hashtag files; get_metrics() also records their statistics in `metrics`.
for tweet_file in filenames[:3]:
    print("Beginning", tweet_file)
    tweet_dict[tweet_file] = get_metrics(tweet_file)

Beginning tweets_#gohawks.txt
Beginning tweets_#gopatriots.txt
Beginning tweets_#nfl.txt


In [7]:
# Key by file name (not list position): get_hours() and the feature-building
# cell look tweets up by file name and call str.split on the key.
tweet_dict[filenames[3]] = get_metrics(filenames[3])

In [None]:
# Key by file name (not list position): get_hours() and the feature-building
# cell look tweets up by file name and call str.split on the key.
tweet_dict[filenames[4]] = get_metrics(filenames[4])

In [None]:
# Key by file name (not list position): get_hours("tweets_#superbowl.txt") and
# the feature-building cell look tweets up by file name.
tweet_dict[filenames[5]] = get_metrics(filenames[5])

In [None]:
# Each metrics entry is
# [avg_tweets_hour, avg_followers_tweet, avg_retweets_tweet, max_date, min_date].
# Only the three averages are reported, so drop the two trailing timestamps;
# assigning three row labels to five rows would raise a length-mismatch error.
metric_labels = ["avg_tweets_hour", "avg_follower_tweet", "avg_retweets_tweet"]
df_metrics = pd.DataFrame({name: stats[:len(metric_labels)] for name, stats in metrics.items()})
df_metrics.index = metric_labels
df_metrics = df_metrics.T
df_metrics

In [None]:
# Tweets-per-hour histograms for #nfl and #superbowl, side by side.
df_nfl = get_hours("tweets_#nfl.txt")
df_sb = get_hours("tweets_#superbowl.txt")
fig, ax = plt.subplots(1, 2, figsize = (15, 5))

for axis, df_hours, tag in zip(ax, (df_nfl, df_sb), ("#nfl", "#superbowl")):
    # Place each bar at its hour bin (the "index" column) rather than at its
    # row position, so row order and hours without tweets cannot shift bars.
    axis.bar(df_hours["index"], df_hours["count"])
    axis.set_xlabel("Hours")
    axis.set_ylabel("Number of tweets")
    axis.set_title("Tweets per Hour: " + tag)

# Show the plot
plt.show()

# Design Our Own

In [None]:
# Flatten every loaded tweet into one record: its text, the hashtag of the file
# it came from, its retweet count and its timestamp.
# The file-level hashtag is taken from the name, e.g. "tweets_#nfl.txt" -> "nfl".
# (The former per-tweet debug print of the whole tweet list has been removed.)
tweet_data = [
    {"tweet": tweet["tweet"]["text"],
     "hashtags": file_name.split("_")[1][1:-4],
     "retweets": tweet["metrics"]["citations"]["total"],
     "time": tweet["citation_date"]}
    for file_name, tweets in tweet_dict.items()  # one entry per file
    for tweet in tweets                          # one dict per tweet
]

tweet_df = pd.DataFrame(tweet_data) #entire data set to use

In [None]:
# Shared NLP helpers, built once and reused by preprocess_text().
sia = SentimentIntensityAnalyzer()        # source of the neg/neu/pos scores
lemmatizer = WordNetLemmatizer()          # word-level lemmatizer
stop_words = stopwords.words("english")   # English stop-word list

def remove_hashtags(text):
    """Return ``text`` without any whitespace-separated token that contains '#'.

    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if "#" in token:
            continue
        kept.append(token)
    return " ".join(kept)

def preprocess_text(text):
    """Clean a tweet and append its sentiment scores as trailing tokens.

    Steps: score sentiment on the raw text, drop stop words, lemmatize each
    remaining word, then append the ``neg``, ``neu`` and ``pos`` scores
    (as strings) separated by spaces.

    Relies on the module-level ``sia``, ``stop_words`` and ``lemmatizer``.

    Returns
    -------
    str
        ``"<cleaned text> <neg> <neu> <pos>"``.
    """
    # Score the untouched tweet first. The English stop-word list contains
    # negations such as "not" and "no", so scoring after stop-word removal
    # would, for example, turn "not good" into "good".
    sentiment_scores = sia.polarity_scores(text)
    # Compare case-insensitively so capitalised stop words ("The", "And") are
    # removed too, while the kept words retain their original casing.
    words = [word for word in text.split() if word.lower() not in stop_words]
    # Lemmatize the remaining words
    words = [lemmatizer.lemmatize(word) for word in words]
    text = " ".join(words)
    # Add the sentiment scores to the text
    # NOTE(review): these numbers are later tokenized like ordinary words by the
    # vectorizer; confirm they survive tokenization as intended.
    text = text + ' ' + str(sentiment_scores['neg']) + ' ' + str(sentiment_scores['neu']) + ' ' + str(sentiment_scores['pos'])
    return text

In [None]:
#Processed text columns contain the lemmatized text followed by the negative, neutral, and positive scores
tweet_df["no_hashtags"] = tweet_df["tweet"].apply(remove_hashtags)
tweet_df["processed_text_no_hash"] = tweet_df["no_hashtags"].apply(preprocess_text) 
tweet_df["processed_text_w_hash"] = tweet_df["tweet"].apply(preprocess_text) 

# Retweet regression keeps hashtags in the text; hashtag classification must not
# see them, so it uses the hashtag-free column.
feat_cols_rt = "processed_text_w_hash"
feat_cols_ht = "processed_text_no_hash"  # must match the column created above
targ_col_rt = "retweets"
targ_col_ht = "hashtags"
# Fixed seed so the splits (and everything fitted on them) are reproducible.
train_rt, test_rt = train_test_split(tweet_df[[feat_cols_rt] + [targ_col_rt]], test_size = 0.2, random_state = 0)
train_ht, test_ht = train_test_split(tweet_df[[feat_cols_ht] + [targ_col_ht]], test_size = 0.2, random_state = 0)

In [None]:
def _lsi_residual(tfidf, reduced, components):
    """Squared Frobenius norm of ``tfidf - reduced @ components``.

    Uses the expansion
    ||X - RC||_F^2 = ||X||_F^2 - 2 <X C^T, R> + ||RC||_F^2,
    so the sparse ``tfidf`` matrix is never converted to a dense
    (n_samples x n_features) array. The largest dense intermediates are
    n_samples x n_components.
    """
    total = tfidf.multiply(tfidf).sum()
    cross = np.sum(np.asarray(tfidf @ components.T) * reduced)
    gram = components @ components.T
    recon = np.sum((reduced @ gram) * reduced)
    # Clamp tiny negative values caused by floating-point cancellation.
    return max(float(total - 2 * cross + recon), 0.0)

#count vectorization + tfidf transformer
pipe = Pipeline([("count", CountVectorizer(stop_words = "english", min_df = 3)), 
                 ("tfid", TfidfTransformer())]) 

# NOTE: `pipe` and `lsi_mod` are re-fitted for the hashtag task below, so every
# retweet matrix must be computed before the corresponding hashtag fit.
train_rt_pipe = pipe.fit_transform(train_rt[feat_cols_rt]) #fit pipeline and transform with train set
test_rt_pipe = pipe.transform(test_rt[feat_cols_rt]) #only transform test set

train_ht_pipe = pipe.fit_transform(train_ht[feat_cols_ht]) #fit pipeline and transform with train set
test_ht_pipe = pipe.transform(test_ht[feat_cols_ht]) #only transform test set

#Latent Semantic Indexing 
#retweets
lsi_mod = TruncatedSVD(n_components = 1000, random_state = 0)
train_rt_lsi = lsi_mod.fit_transform(train_rt_pipe)
test_rt_lsi = lsi_mod.transform(test_rt_pipe)
# Squared Frobenius reconstruction error on the training matrix.
lsi_rt_mse = _lsi_residual(train_rt_pipe, train_rt_lsi, lsi_mod.components_)

#hashtags
train_ht_lsi = lsi_mod.fit_transform(train_ht_pipe)
test_ht_lsi = lsi_mod.transform(test_ht_pipe)
lsi_ht_mse = _lsi_residual(train_ht_pipe, train_ht_lsi, lsi_mod.components_)

In [None]:
# These values are squared Frobenius norms of the training reconstruction
# residual (a sum over all entries, not a mean), so label them accordingly.
print("Squared Frobenius reconstruction error for the Retweet LSI is", lsi_rt_mse)
print("Squared Frobenius reconstruction error for the Hashtag LSI is", lsi_ht_mse)

# Neural Net Predicting Number of Retweets

In [None]:
#Predicting Retweets Using MLP

# Seed the network so repeated runs of the search give the same result.
mlp = MLPRegressor(max_iter = 1000, verbose = False, random_state = 0)

# Candidate architectures: every combination (with repetition) of the unit
# counts below, for 1 up to len(hidden_layer_sizes) hidden layers.
hidden_layer_sizes = [50, 75, 100]
layer_sizes = [subset
               for depth in range(1, len(hidden_layer_sizes) + 1)
               for subset in itertools.combinations_with_replacement(hidden_layer_sizes, depth)]

param = {"hidden_layer_sizes": layer_sizes, 
         "activation": ["relu", "identity", "logistic", "tanh"], 
         "alpha": [10.0 ** x for x in np.arange(-3 , 3)]}

X = train_rt_lsi
y = train_rt[targ_col_rt]
grid_mlp_twt = GridSearchCV(estimator = mlp, 
                            param_grid = param, 
                            cv = 10, 
                            scoring = "neg_mean_squared_error", 
                            n_jobs = -1)
grid_mlp_twt.fit(X, y)

# best_score_ is a negated MSE, so flip the sign before taking the root.
df_mlp_twt = pd.DataFrame({"Best MLP": [grid_mlp_twt.best_estimator_, 
                                    grid_mlp_twt.best_params_, 
                                    np.sqrt(-1 * grid_mlp_twt.best_score_)]})
df_mlp_twt.index = ["Best Estimator", "Best Parameters", "RMSE"]
df_mlp_twt

In [None]:
# The grid search was created with the default refit behaviour, so
# best_estimator_ is already trained on the full training set. Refitting here
# is unnecessary and would depend on whatever the notebook-global X / y
# currently hold.
best_mlp_twt = grid_mlp_twt.best_estimator_
y_pred = best_mlp_twt.predict(test_rt_lsi)
# Take the root explicitly instead of passing squared=False, which newer
# scikit-learn releases no longer accept.
mlp_rmse_rt = np.sqrt(mean_squared_error(test_rt[targ_col_rt], y_pred))
print("The best MLP Model is", best_mlp_twt)
print("With the test set, the RMSE is", mlp_rmse_rt)

# Predicting Hashtags using Random Forest

In [None]:
#Predicting Hashtags Using Random Forest
# oob_score=True lets the OOB estimate be reported next to the CV accuracy;
# the seed makes the search reproducible.
rf_twt = RandomForestClassifier(oob_score = True, random_state = 0)
param = {"max_depth": [1, 2, 3, 4, 5], 
         "n_estimators": [100, 200, 500, 1000, 2000]}
grid_rf_twt = GridSearchCV(estimator = rf_twt, 
                           param_grid = param, 
                           cv = 10, 
                           scoring = "accuracy", 
                           n_jobs = -1)
X = train_ht_lsi
y = train_ht[targ_col_ht]
# The search must be fitted before best_estimator_ / best_params_ /
# best_score_ exist.
grid_rf_twt.fit(X, y)
df_rf_twt = pd.DataFrame({"Best Random Forest": [grid_rf_twt.best_estimator_, 
                                                 grid_rf_twt.best_params_, 
                                                 grid_rf_twt.best_score_, 
                                                 grid_rf_twt.best_estimator_.oob_score_]})
df_rf_twt.index = ["Best Estimator", "Best Parameters", "Accuracy", "OOB Score"]
df_rf_twt

In [None]:
# With GridSearchCV's default refit behaviour, best_estimator_ is already
# trained on the full training set, so no extra fit is needed here.
best_rf_twt = grid_rf_twt.best_estimator_
y_pred = best_rf_twt.predict(test_ht_lsi)
rf_acc_twt = accuracy_score(test_ht[targ_col_ht], y_pred)
print("The best Random Forest Classifier Model is", best_rf_twt)
print("The OOB score is", best_rf_twt.oob_score_)
print("The test accuracy is", rf_acc_twt)