## 1. load data and count

In [None]:
import json
from collections import Counter
from itertools import combinations
from pathlib import Path

from tqdm import tqdm

from joblib import Parallel, delayed

In [None]:
def read_data(f):
    """Collect the screen names retweeted in one newline-delimited JSON file.

    Each non-blank line of ``f`` is parsed as one JSON tweet object.  Only
    tweets that carry a ``retweeted_status`` and whose ``id`` is greater than
    ``first_tweet_hgm`` contribute a name.

    Args:
        f: Path (str or ``Path``) of the file to read.

    Returns:
        A list with the lower-cased screen name of the retweeted user for every
        qualifying tweet (duplicates kept, possibly empty), or ``None`` when
        the file contains no tweets at all.  Callers rely on ``None`` to tell
        empty files apart from files without qualifying retweets.
    """
    # Tweets with an id at or below this threshold are ignored.
    # NOTE(review): presumably the id of hgmaassen's first tweet - confirm.
    first_tweet_hgm = 1121716470856155136

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    lines = Path(f).read_text(encoding='utf-8').split('\n')
    # Skip blank and whitespace-only lines; json.loads would raise on them.
    tweets = [json.loads(line) for line in lines if line.strip()]
    if not tweets:
        return None

    return [
        t['retweeted_status']['user']['screen_name'].lower()
        for t in tweets
        if 'retweeted_status' in t and t['id'] > first_tweet_hgm
    ]

In [None]:
# Parse every tweet dump with 8 workers; rts holds one read_data() result
# (a list of retweeted screen names, or None) per *.json file.
tweet_files = list(Path('/mnt/data/datasets/twitter/tweets-maassen/').glob('*.json'))
rts = Parallel(n_jobs=8)(delayed(read_data)(f) for f in tqdm(tweet_files))

In [None]:
# For every file whose retweets include 'hgmaassen', count each distinct
# retweeted account once.
c = Counter()
for names in tqdm(rts):
    if names is None:
        continue
    distinct_names = set(names)
    if 'hgmaassen' in distinct_names:
        c.update(distinct_names)

In [None]:
# Number of distinct accounts that were counted.
len(c)

In [None]:
# Show the 21 highest counts.  'hgmaassen' is part of every counted set, so it
# holds the maximum count; presumably 21 is meant as hgmaassen plus a top 20.
c.most_common(21)

In [None]:
import pandas as pd

In [None]:
# Build a table of up to 100 accounts with the highest counts, excluding
# 'hgmaassen' itself; 'value' is the account's count divided by hgmaassen's.
# hgmaassen is removed by name rather than by position: on a count tie it is
# not guaranteed to be the first entry returned by most_common().
hgm_total = c['hgmaassen']
top_accounts = [
    (name, count) for name, count in c.most_common(101) if name != 'hgmaassen'
][:100]
df = pd.DataFrame(
    [{'name': name, 'value': count / hgm_total} for name, count in top_accounts]
)

In [None]:
# Write the table to data.csv in the current working directory.
df.to_csv('data.csv')

## 2. create vis with co-occurrence matrix, PPMI and PCA

In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Number of most-common accounts requested when building the vocabulary.
# 'hgmaassen' is filtered out afterwards, so presumably 1001 is chosen to
# leave 1000 tokens.
embd_n = 1001

In [None]:
# Vocabulary: the embd_n most frequently counted accounts without 'hgmaassen',
# in descending count order, plus the reverse name -> row index lookup.
idx2token = [name for name, _count in c.most_common(embd_n) if name != 'hgmaassen']
token2idx = {token: index for index, token in enumerate(idx2token)}
n = len(idx2token)

In [None]:
# Collect (row, col, value) triplets of a symmetric co-occurrence matrix.
# Only files that retweeted 'hgmaassen' contribute.  Every unordered pair of
# distinct in-vocabulary accounts from one file adds 1/k in both directions,
# where k is the number of in-vocabulary accounts in that file.
xs, ys, data = [], [], []
for retweeted in tqdm(rts):
    if retweeted is None or 'hgmaassen' not in retweeted:
        continue
    indices = [token2idx[name] for name in set(retweeted) if name in token2idx]
    if len(indices) < 2:
        # No pairs to emit (also avoids dividing by zero below).
        continue
    weight = 1 / len(indices)
    for row, col in combinations(indices, 2):
        xs.extend((row, col))
        ys.extend((col, row))
        data.extend((weight, weight))

In [None]:
# Counters for files with / without a retweet of 'hgmaassen'.
num_yes = num_no = 0

In [None]:
# Tally non-empty files by whether their retweets include 'hgmaassen';
# entries that are None are not counted at all.
for retweeted in tqdm(rts):
    if retweeted is not None:
        if 'hgmaassen' in retweeted:
            num_yes += 1
        else:
            num_no += 1

In [None]:
# Number of non-None entries of rts that do not contain 'hgmaassen'.
num_no

In [None]:
# Number of entries of rts that contain 'hgmaassen'.
num_yes

In [None]:
# Share of entries without a retweet of 'hgmaassen'.
# NOTE(review): the denominator len(rts) also includes the None entries that
# the counting loop skips, so this is not num_no / (num_yes + num_no) - confirm
# that this is the intended ratio.
num_no / len(rts)

In [None]:
# Assemble the n x n co-occurrence matrix from the collected triplets.
m = coo_matrix((data, (xs, ys)), shape=(n, n), dtype=np.float32)

In [None]:
# Convert to CSR format for the row/column scaling done in calc_pmi.
m = m.tocsr()

In [None]:
def _safe_reciprocal(values):
    """Return 1/values element-wise, with zeros mapped to 0 instead of inf.

    Non-floating input is converted to float64 first, because np.reciprocal
    on integer arrays performs integer division and yields 0 for every value
    greater than 1.  Floating input keeps its dtype.
    """
    values = np.asarray(values)
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)
    result = np.zeros_like(values)
    np.reciprocal(values, out=result, where=values != 0)
    return result


def calc_pmi(counts, cds):
    """
    Calculates e^PMI; PMI without the log().

    Every entry counts[i, j] is divided by the sum of row i and by the
    (smoothed) sum of column j, then multiplied by the total of the smoothed
    column sums.

    Args:
        counts: Sparse co-occurrence matrix.
        cds: Exponent applied to the column sums before they are used
            (context-distribution smoothing); 1 disables the smoothing.

    Returns:
        A sparse matrix with the same shape as ``counts``.  Rows or columns
        whose sum is zero are scaled by 0 rather than by inf.
    """

    sum_w = np.array(counts.sum(axis=1))[:, 0]
    sum_c = np.array(counts.sum(axis=0))[0, :]
    if cds != 1:
        sum_c = sum_c ** cds
    sum_total = sum_c.sum()
    sum_w = _safe_reciprocal(sum_w)
    sum_c = _safe_reciprocal(sum_c)

    pmi = csr_matrix(counts)
    pmi = multiply_by_rows(pmi, sum_w)
    pmi = multiply_by_columns(pmi, sum_c)
    pmi = pmi * sum_total
    return pmi


def multiply_by_rows(matrix, row_coefs):
    """Scale row i of ``matrix`` by ``row_coefs[i]`` (left-multiply by a diagonal)."""
    normalizer = dok_matrix((len(row_coefs), len(row_coefs)))
    normalizer.setdiag(row_coefs)
    return normalizer.tocsr().dot(matrix)


def multiply_by_columns(matrix, col_coefs):
    """Scale column j of ``matrix`` by ``col_coefs[j]`` (right-multiply by a diagonal)."""
    normalizer = dok_matrix((len(col_coefs), len(col_coefs)))
    normalizer.setdiag(col_coefs)
    return matrix.dot(normalizer.tocsr())

In [None]:
# e^PMI of the co-occurrence matrix, with column sums raised to the power 0.75.
mm = calc_pmi(m, 0.75)

In [None]:
# Rescale every column of the dense matrix to the [0, 1] range.
# toarray() yields a plain ndarray; todense() would return an np.matrix, which
# recent scikit-learn versions refuse as estimator input.
res = MinMaxScaler().fit_transform(mm.toarray())

In [None]:
# Number of leading vocabulary rows (most frequent accounts) to visualise.
vis_n = 100

In [None]:
# Keep only the first vis_n rows, with all of their columns.
res_vis = res[:vis_n]

In [None]:
# Sanity check: (rows kept for plotting, vocabulary size).
res_vis.shape

In [None]:
# Project the selected rows onto their first two principal components, then
# rescale both plot coordinates to the [0, 1] range.
projector = PCA(n_components=2)
res_vis = projector.fit_transform(res_vis)
coordinate_scaler = MinMaxScaler()
res_vis = coordinate_scaler.fit_transform(res_vis)

In [None]:
from adjustText import adjust_text

In [None]:
from matplotlib import rcParams
# Use the 'lato' font family for all text in subsequent figures.
rcParams.update({'font.family': 'lato'})

In [None]:
# Scatter plot of the 2-D projection, one labelled point per account.
fig, ax = plt.subplots(figsize=(20, 20))

# Hide the figure background and the axes; only points and labels remain.
fig.patch.set_visible(False)
ax.axis('off')

sc = ax.scatter(res_vis[:, 0], res_vis[:, 1], color='black')

# Pair each plotted row with its account name.  zip() stops at the shorter
# sequence, so a vocabulary smaller than vis_n no longer raises IndexError.
texts = [
    ax.text(x_pos, y_pos, token, weight='regular', size='14')
    for (x_pos, y_pos), token in zip(res_vis, idx2token)
]
# Nudge the labels apart so that they do not overlap.
adjust_text(texts, weight='regular', size='14')

In [None]:
# Save the figure as an SVG file in the current working directory.
fig.savefig("100.svg")