# Notebook for visualisation, running the timeline analysis

Create timeline analyzer object

In [1]:
import datetime
import os
import pickle

import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import numpy as np
import plotnine

from bispec_clustering_eval import BSCresults
from timeline_analysis import TimelineAnalyzer

# Both directories handed to TimelineAnalyzer live under the same collection
# folder, so the base path is spelled out once and the sub-directories joined on.
COLLECTION_DIR = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/collection_results_2021_05_04_16_22'

bscres = TimelineAnalyzer(
    os.path.join(COLLECTION_DIR, 'bsc'),
    os.path.join(COLLECTION_DIR, 'data'),
)


Process the existing clustering data and determine best user and phrase clusters.

In [2]:
# Run the NMI evaluation, then report the first two result entries with their
# leading element dropped.
results = bscres.eval_nmi()
print(f'user index and cluster num: {results[0][1:]}')
print(str(results[1][1:]))

user index and cluster num: (164, 174)
(167, 177)


Plot the evaluation plots for cluster values

In [3]:
# Draw the analyzer's evaluation plots with bins=100 and verbose output switched off.
bscres.plot(bins=100, verbose = False)

Examining the words in a cluster is also available

In [4]:
# Look up the words for the (149, 106) pair.
# NOTE(review): presumably 149 is the user-cluster setting and 106 the topic
# cluster (the returned table has a `topic_cluster` column) — confirm against
# TimelineAnalyzer.examine_cluster_words.
x = bscres.examine_cluster_words(149, 106, show=0)
# Bare name as the last expression so the notebook renders the returned table.
x

Unnamed: 0,hashtag,degree,topic_cluster
45,#balancetonporc,5188,106
47,#espagne,14,106
48,#femmes,112,106
49,#fi,55,106
50,#fn,69,106
...,...,...,...
69201,une mosquée de kaboul,10,106
69202,xiaomi mi mix,10,106
70593,la parole des,18,106
72575,#dénoncelestruies,69,106


Just for the purposes of this notebook, loading in a previously saved vocab object.

In [5]:
# Location of the previously pickled user data for the (149, 106) pair.
user_data_cache = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/collection_results_2021_05_04_16_22/bsc/149_106_user_data.obj'

if os.path.isfile(user_data_cache):
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load cache files that this project wrote itself.
    with open(user_data_cache, 'rb') as f:
        bscres.df = pickle.load(f)
    df = bscres.df
else:
    # The cache file name (149_106_...) and the examine_cluster_words(149, 106)
    # call both refer to 106; the previous fallback passed 6, which would have
    # rebuilt the data for a different cluster than the cached branch loads.
    # NOTE(review): unlike the cached branch, bscres.df is not assigned here —
    # presumably get_user_cluster_data() sets it itself; confirm.
    df = bscres.get_user_cluster_data(149, 106)

In [6]:
# Render the DataFrame bound to `df` by the previous cell (loaded from the cache or rebuilt).
df

Unnamed: 0,author_id,tweet_id,text,created_at,referenced_tw_1,referenced_tw_2,referenced_tw_3,is_retweet,internal_retweet,vocab:#balancetonporc,...,vocab:sur la foune,vocab:sur les produits,vocab:sébastien le fol,vocab:sébastien le fol après,vocab:une mosquée de,vocab:une mosquée de kaboul,vocab:xiaomi mi mix,vocab:la parole des,vocab:#dénoncelestruies,vocab:libérer la parole
0,100496102,922570840696242177,RT @mathieugallard: En vue des européennes de ...,2017-10-23 21:10:03,922465252029157376,,,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,100496102,922570372121206784,RT @WWFFrance: 🐱 Journée mondiale du #léopard ...,2017-10-23 21:08:11,922375797377327104,,,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,100496102,922570075835473920,Une erreur de 10 milliards d'euro commise par ...,2017-10-23 21:07:01,,,,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,100496102,922567680527286272,RT @JCLarsonneur: Retrouvez la tribune cosigné...,2017-10-23 20:57:30,922563966273970176,,,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,100496102,922566329311170560,Harcèlement &amp; agression sexuelle : 40 femm...,2017-10-23 20:52:08,,,,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1373,99086843,917715109094490112,@lithadem Ah bon! J'aurais cru que relation sa...,2017-10-10 11:35:06,917672088055541762,,,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1374,99086843,917671291909496832,@lithadem A priori quand meme pour la majorité...,2017-10-10 08:41:00,917670310266245121,,,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1375,99086843,917660501886324736,@Sou_fak @Ostpolitik C moche visuellemt&amp;pa...,2017-10-10 07:58:07,917457083959840768,,,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1376,99086843,917658700004954112,RT @mistergrosbill: @Ostpolitik Pareil... just...,2017-10-10 07:50:58,917457565193265158,,,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Print the analyzer's summary statistics (tweet and retweet counts).
bscres.print_stats()

Number of tweets in df: 108564
Number of retweets in df: 73498
Percentage Retweets
Number of retweets of other users in the cluster: 0


## Plotting

### First, plotting the overall usage of vocab

In [None]:
# Produce the overall vocab-usage plots via the analyzer's own helper.
bscres.plot_usage_vocab()

In [None]:
# Build a per-day usage table for the first vocab entry only (the [:1] slice).
# `plot_data` and `vocab_entry` are deliberately left in the notebook namespace:
# the following cell plots them.
for vocab_entry in bscres.vocab_colnames[:1]:
    vocab_col = vocab_entry[1]  # element 1 of the entry is used as the DataFrame column name
    # NOTE(review): the save path is computed but nothing in this cell writes to it.
    plot_savename = os.path.join(bscres.image_path, vocab_col + ".png")
    plot_data = bscres.df.loc[:, ['created_at', vocab_col]]
    # Reduce each timestamp to its calendar date. assign() swaps in a brand-new
    # column; the previous `.loc[:, 'created_at'] = ...` wrote into the existing
    # column, which on recent pandas versions can keep the original datetime
    # dtype instead of storing date objects.
    plot_data = plot_data.assign(
        created_at=plot_data['created_at'].apply(lambda ts: ts.date())
    )
    plot_data = plot_data.groupby('created_at').sum()
    if plot_data[vocab_col].sum() == 0:
        # Entry never used: skip it (plot_data keeps 'created_at' as its index here).
        continue
    plot_data = plot_data.reset_index()

In [None]:
# Plot the per-day table left behind by the loop in the previous cell:
# points joined by one line, with the x tick labels rotated by 45 degrees.
(
    plotnine.ggplot(plot_data)
    + plotnine.aes(x="created_at", y=vocab_entry[1])
    + plotnine.geom_point(size=0.4)
    + plotnine.geom_line(group=1)
    + plotnine.theme(axis_text_x=plotnine.element_text(rotation=45, hjust=1))
)

### Then plotting the usage of vocab per user

In [None]:
# Produce the per-user vocab-usage plots via the analyzer's own helper.
bscres.plot_usage_per_user()

### Plotting Full Archive Search Activity Plots

In [8]:
# Build the Full Archive Search (FAS) activity data/plot, passing the text file
# of searched hashtags.
# Long-running: the captured progress bar shows the FAS jsonl files being
# processed one at a time.
bscres.plot_FAS_activity('/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/search_hashtags.txt')


Start Time: 2021-08-13 13:14:11.709259


processing FAS jsonl files:   8%|▊         | 3/36 [01:59<23:10, 42.13s/it]

In [None]:
# List the family name of every font file matplotlib can find on this system.
# `font_manager` comes from the notebook's top import cell.
flist = font_manager.findSystemFonts()
names = []
for font_path in flist:
    try:
        names.append(font_manager.FontProperties(fname=font_path).get_name())
    except RuntimeError as err:
        # Some font files cannot be opened by FreeType; skip them instead of
        # letting a single bad file abort the whole listing.
        print('Skipping unreadable font {}: {}'.format(font_path, err))
print(names)

In [None]:
# Smoke test: draw a title containing CJK characters using an explicitly loaded
# font file. `font_manager` and `plt` come from the notebook's top import cell.
cjk_font_path = '/usr/share/fonts/opentype/NotoSerifTC-Regular.otf'
# A FontProperties object only affects text it is passed to, so it is built
# once and handed to set_title (the earlier discarded instance did nothing).
prop = font_manager.FontProperties(fname=cjk_font_path)

fig, ax = plt.subplots()
plt.rcParams['font.sans-serif'] = ['Noto Serif TC']
ax.plot(range(10))
ax.set_title('妳好This is some random font', fontproperties=prop, size=32)

plt.show()

Trying to get fonts working so that CJK (Chinese, Japanese, Korean) characters render correctly in the plots

In [None]:
# Font families for plot text, in the order matplotlib should try them.
# `plt`, `font_manager` and `os` come from the notebook's top import cell.
font_families = ['Noto Serif TC', 'Noto Sans KR', 'Noto Serif JP', 'Markazi Text', 'Rubik', 'Open Sans', 'STIX Two Text']
for rc_key in ('font.family', 'font.sans-serif', 'font.serif'):
    plt.rcParams[rc_key] = list(font_families)

# Building a FontProperties(fname=...) object and discarding it does not make
# the font known to matplotlib. fontManager.addfont() registers the file so the
# family names configured above can be resolved.
font_files = [
    '/usr/share/fonts/opentype/NotoSerifJP-Regular.otf',
    '/usr/share/fonts/opentype/NotoSansKR-Regular.otf',
    '/usr/share/fonts/truetype/MarkaziText-VariableFont_wght.ttf',
    '/usr/share/fonts/truetype/Rubik-VariableFont_wght.ttf',
    '/usr/share/fonts/truetype/OpenSans-Regular.ttf',
]
for font_file in font_files:
    if os.path.isfile(font_file):
        font_manager.fontManager.addfont(font_file)
    else:
        # A missing file is reported rather than raised so plotting can proceed
        # with whichever fonts are available.
        print('Font file not found, skipping: {}'.format(font_file))

# Assemble the FAS activity line plot (one colour per hashtag, monthly x-axis
# breaks, rotated tick labels) and keep it on the analyzer object.
bscres.FAS_activity_plot = (
    plotnine.ggplot(
        bscres.FAS_activity_df_long,
        plotnine.aes(x='created_at', y='vocab:#', color='hashtag'),
    )
    + plotnine.geom_line(group=1)
    + plotnine.scale_x_datetime(date_breaks='1 month')
    + plotnine.theme(
        text=plotnine.element_text(family=['Noto Sans KR', 'Noto Serif JP', 'STIX Two Text', 'Cairo']),
        axis_text_x=plotnine.element_text(rotation=45, hjust=1),
    )
    + plotnine.ggtitle('Activity Plot for Searched #MeToo Hashtags')
    + plotnine.xlab('Date')
    + plotnine.ylab('Volume of Activity')
)

# Write the figure into the analyzer's image directory.
plot_savename = os.path.join(bscres.image_path, 'FAS_activity.png')
bscres.FAS_activity_plot.save(
    plot_savename,
    width=15,
    height=10,
    dpi=600,
    limitsize=False,
    verbose=False,
)

In [None]:
# Thin-line variant of the FAS activity plot with a Brewer colour palette.
(
    plotnine.ggplot(
        bscres.FAS_activity_df_long,
        plotnine.aes(x='created_at', y='vocab:#', color='hashtag'),
    )
    + plotnine.geom_line(size=0.3)
    + plotnine.theme(axis_text_x=plotnine.element_text(rotation=45, hjust=1))
    # The plot maps `color`, not `fill`, so the palette has to be attached to
    # the colour scale; the previous scale_fill_brewer had no effect on the lines.
    # NOTE(review): Brewer palettes offer a limited number of colours — confirm
    # the number of hashtags fits the "Spectral" palette.
    + plotnine.scale_color_brewer(type='diverging', palette="Spectral")
    # Optional log-scaled y axis:
    # + plotnine.scale_y_continuous(trans="log10")
)


In [None]:
# Re-run the overall vocab-usage plots (same call as in the "Plotting" section above).
bscres.plot_usage_vocab()