In [1]:
# Enable the IPython autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec

### Load tokenized tweets

In [3]:
# Read the tokenized tweets from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by this project.
with open('../data/tweets_tokenized.pickle', 'rb') as f:
    tweets_tokenized = pickle.load(f)

### Load word2vec model

In [4]:
# Load the previously trained Word2Vec model from disk.
# NOTE(review): the file name suggests a skip-gram model with window size 4 — confirm.
w2v_model = Word2Vec.load("../models/w2v_skipgram_w4.model")

### Embed tweets

- First, define a function that sums the word vectors of the individual words in each tweet and normalizes the result to unit length, giving a single vector representation of the tweet.

In [5]:
# Embed a tweet as the normalized sum of the vectors of the words it contains
def embed_w2v_sum(tokens, w2v):
    """Embed one tweet as the L2-normalized sum of its word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single tweet. Tokens missing from the model vocabulary
        are ignored.
    w2v : object
        Trained model exposing ``w2v.wv.vectors`` (2-D array with one row per
        vocabulary entry) and, for token lookup, either
        ``w2v.wv.key_to_index`` (gensim >= 4) or ``w2v.wv.vocab``
        (gensim < 4).

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``w2v.wv.vectors.shape[1]``. It is all zeros
        when no token is in the vocabulary or when the summed vector has
        zero norm (so the result never contains NaN from a 0/0 division).
    """
    wv = w2v.wv

    # Map each known token to the row index of its word vector.
    key_to_index = getattr(wv, "key_to_index", None)
    if key_to_index is not None:
        # gensim >= 4: plain dict token -> row index
        idxs = [key_to_index[t] for t in tokens if t in key_to_index]
    else:
        # gensim < 4: dict token -> vocab entry carrying an ``index`` attribute
        entries = (wv.vocab.get(t) for t in tokens)
        idxs = [entry.index for entry in entries if entry is not None]

    # dim is the dimensionality of the word vectors (not the number of tokens)
    dim = wv.vectors.shape[1]
    if len(idxs) < 1:
        return np.zeros(dim)

    # A tweet is represented by the sum of the vectors it contains
    a = np.sum(wv.vectors[idxs, :], axis=0)

    # Scale the tweet vector to unit length; a zero vector cannot be
    # normalized, so return zeros instead of dividing by zero.
    norm = np.linalg.norm(a)
    if norm == 0:
        return np.zeros(dim)
    a /= norm
    return a


In [6]:
# Embed every tokenized tweet; row i of the array belongs to tweets_tokenized[i]
tweets_embedded = np.array(
    [embed_w2v_sum(tokens, w2v_model) for tokens in tweets_tokenized]
)

In [None]:
# Load the full preprocessed tweet data.
# NOTE(review): assumed to contain one row per entry of tweets_tokenized, in the
# same order — the embedding rows are attached via tweet_df.index further down; confirm.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('../data/tweets_preprocessed.pickle', 'rb') as f:
    tweet_df = pickle.load(f)

In [None]:
# Restrict the frame to the account, tweet and engagement columns used below
cols_to_keep = [
    "userid",
    "user_screen_name",
    "user_display_name",
    "follower_count",
    "tweet_text",
    "is_retweet",
    "retweet_count",
    "hashtags",
]

# Copy so the trimmed frame is independent of the originally loaded one
tweet_df = tweet_df.loc[:, cols_to_keep].copy()

In [None]:
# One column per embedding dimension, named "0_dim", "1_dim", ...
cols = ["{}_dim".format(i) for i in range(tweets_embedded.shape[1])]

# Wrap the embeddings in a frame that shares tweet_df's index so the two can be joined
embedding_df = pd.DataFrame(data=tweets_embedded, index=tweet_df.index, columns=cols)

In [None]:
# Drop the standalone reference to the embedding array; from here on the
# values are accessed through embedding_df.
del tweets_embedded

In [None]:
# Attach the embedding columns to the tweet metadata (index-aligned join)
tweets_final = tweet_df.join(embedding_df)

# Persist the combined frame so later analyses can start from this file
with open('../data/tweets_final.pickle', 'wb') as f:
    pickle.dump(tweets_final, f)



## Visualize and Cluster Embedded Tweets

Let's start with taking some very interesting and/or offensive tweets and finding the most similar as well as dissimilar tweets to them.

We should also work on a subset, so let's take tweets with a higher number of retweets.

Let's visualize some tweets first. We pick out two important accounts: KaniJJackson and Pamela_Moore13

In [None]:
from utils.comparison import embedd_tSNE

In [None]:
# Rows belonging to the two accounts of interest; copied so that columns can be
# added to the subset without touching tweets_final
subset = tweets_final.loc[
    tweets_final["user_screen_name"].isin(["KaniJJackson", "Pamela_Moore13"])
].copy()

Let's try to visualize this subset, maybe we already get a separation.

In [None]:
# Get the embedding columns by name ("<i>_dim") rather than by position, so the
# selection works for any embedding size and is unaffected by columns that are
# appended to `subset` later (e.g. plot coordinates).
X = subset.filter(regex=r"^\d+_dim$").values

In [None]:
# Project the tweet embeddings to low dimensions with the project helper.
# NOTE(review): presumably runs PCA down to `pca_n` components and then t-SNE with
# the given metric — confirm in utils.comparison. If the helper is stochastic and
# unseeded, the layout will differ between runs.
d = embedd_tSNE(X,pca_n =10, metric='cosine')

In [None]:
# Store the first two columns of the projection on the subset as plot coordinates
subset["x"] = d[:,0]
subset["y"] = d[:,1]

In [None]:
# Scatter the projected tweets, colored by the account that posted them
sns.scatterplot(data=subset, x= "x", y="y", hue = "user_screen_name")

Well, this is really anything but a separation. On the other hand, projecting these tweets into two dimensions is really hard, and t-SNE is not supposed to be used for clustering but rather for visualization. Further, I think t-SNE picks up the hashtags (as intended), which can be shared across users. Let's verify this quickly:

In [None]:
# Get subsets of tweets by hashtag
# NOTE(review): this compares against the literal strings "[MAGA]" and
# "[BlackLivesMatter]", i.e. it assumes hashtags are stored as strings in exactly
# that form and only matches tweets carrying that single hashtag — confirm.
subset = tweets_final[(tweets_final.hashtags == "[MAGA]") | (tweets_final.hashtags == "[BlackLivesMatter]")].copy()

# Get the embedding columns by name ("<i>_dim") rather than by position, so the
# selection works for any embedding size and never picks up non-embedding columns
X = subset.filter(regex=r"^\d+_dim$").values

# Project the embeddings with the project helper
# NOTE(review): presumably PCA to `pca_n` components followed by t-SNE — confirm in utils.comparison
d = embedd_tSNE(X,pca_n =10, metric='cosine')

# Store the first two columns of the projection as plot coordinates
subset["x"] = d[:,0]
subset["y"] = d[:,1]

# Visualize, colored by hashtag
sns.scatterplot(data=subset, x= "x", y="y", hue = "hashtags")

Interesting, MAGA seems to be one single cluster, whereas BlackLivesMatter consists of many groups. Maybe we can already see polarization inside the BlackLivesMatter topic. It could be that the Russians tried to shape the discussion about race by pushing two sides.

How to continue:

- Cluster all tweets, see whether there are meaningful clusters
- If yes, explore these
- If not, explore single topics, try to find clusters within topics, if there are any, explore these.
