In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from bokeh.models import ColumnDataSource, Legend, BoxAnnotation, Rect, LegendItem, Label,Span, Arrow, NormalHead, OpenHead, VeeHead
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.layouts import gridplot, layout, Spacer
import os

Defining functions

In [2]:
def CovidRelated(title_string):
    covid_tags = ['coronavirus', 'covid', 'quarantine','c19','c-19',
                  'pandemic', 'virus', 'epidemic', 'lockdown','sars', 'CoV-2']
    covid_in_title = [x in title_string for x in covid_tags]
    return int(sum(covid_in_title) > 0)

def get_list_of_subreddits(topic, n=3):
    
    topic_path = ('/home/jpre/Documents/DTU/SocialDataViz/Project/lists/List_of_{0}_subreddits.txt').format(topic)
    f = open(topic_path, 'r')
    topic_list = f.readlines()
    f.close()
    topic_list = [x[:-1].split(' ') for x in topic_list]
    topic_list = [item for sublist in topic_list for item in sublist]
    topic_list = [x[n:].lower() for x in topic_list]
    
    return topic_list


def GetData(w,sr,folder='weeks', year = '2020', N = None, categories = None):
    
    column_names = ['time_utc', 'epoch_utc', 'time_local', 'epoch_local', 'id', 'score', 'subreddit', 'ncomments']

    t_i = str(w)
    subreddit = sr
    parent_folder = '/home/jpre/Documents/DTU/SocialDataViz/Project/subreddits/'
    parent_folder = '/home/jpre/Documents/DTU/SocialDataViz/Project/{0}/subreddits/'.format(year)
    titles_file_name = parent_folder + subreddit +'/' + folder +'/'+ t_i +'/titles.txt'
    df_file_name = parent_folder + subreddit + '/' + folder +'/'+ t_i +'/df.txt'

    with open(titles_file_name,'r') as f:
        titles = f.readlines()

    data = pd.read_csv(df_file_name, sep="\t", header=None)

    data = data.rename(columns = dict(zip(range(8) , column_names)))
    data['title'] = [x.lower() for x in titles]

    data['time_utc'] = pd.to_datetime(data['time_utc'], infer_datetime_format=True)
    data['date_char'] = data['time_utc'].map(lambda x: str(x.day) + '/' + str(x.month))
    data = data.rename(columns = dict(zip(range(8) , column_names)))
    
    data['time_utc'] = pd.to_datetime(data['time_utc'], infer_datetime_format=True)
    data['time_local'] = pd.to_datetime(data['time_local'], infer_datetime_format=True)
    data['date_char'] = data['time_utc'].map(lambda x: str(x.day) + '/' + str(x.month))
    data['Month'] = data['time_utc'].map(lambda x: x.month)
    data['Year'] = data['time_utc'].map(lambda x: x.year)
    data['Hour'] = data['time_utc'].map(lambda x: x.hour)
    data['Day'] = data['time_utc'].map(lambda x: x.day)
    data['WeekDay'] = data['time_utc'].map(lambda x: x.dayofweek)
    data['WeekHour'] = (24*(data.WeekDay) + data.Hour)

    data['subreddit'] = [x.lower() for x in data['subreddit'].tolist()]

    # counting the days since the begining of the year
    ini_year = pd.to_datetime('2020-01-01 00:00:00').timestamp()
    daycount = [int((x-ini_year)/(24*60*60)) for x in data.epoch_utc.tolist()]
    data['DayCount'] = daycount
    
    if categories != None:
        data.loc[~data['subreddit'].isin(categories), 'subreddit']= 'other'

    data = data.sort_values(by=['score'], ascending=False)
    if N != None:
        # Trimming to the main 400
        data = data.head(N)
        
    return(data)

def GetDF(sr, folder = 'days', year = '2020'):

    import os

    the_folder = '/home/jpre/Documents/DTU/SocialDataViz/Project/{0}/subreddits/{1}/{2}'.format(year,sr,folder)
    file_names = [str(file) + '.txt' for file in sorted([int(k) for k in os.listdir(the_folder)])]

    # read first
    # create week column
    # read the rest and concatenate
    data_i = GetData(0,sr=sr, folder = folder)
    data_i['F'] = data_i.title.apply(CovidRelated)
    data_i['timestep'] = 0
    n_posts = data_i.shape[0]
    n_corona_posts = data_i['F'].sum()

    TS = [0]
    FF = [n_corona_posts]
    K = [n_posts]
    Fraction = [n_corona_posts/n_posts]
    
    for ts in range(1,len(file_names)):
        data_i = GetData(ts,sr=sr, folder = folder)
        data_i['F'] = data_i.title.apply(CovidRelated)
        data_i['timestep'] = ts
        n_posts = data_i.shape[0]
        n_corona_posts = data_i['F'].sum()

        TS.append(ts)
        FF.append(n_corona_posts)
        K.append(n_posts)
        Fraction.append(n_corona_posts/n_posts)

#         data = pd.concat([data,data_i])

    df_ts = pd.DataFrame(list(zip(TS,FF,K,Fraction)), columns=['timestep', 'F','K', 'f'])
    df_ts['zero'] = 0
    
    return df_ts



def CreateOneTiledPlot(day, folder = 'days', sr = 'all'):
    
    N = 100

    # data_i = GetData(day, 'all', folder='days', categories = dictionaries, N = N)

    data_i = GetData(day, sr, folder=folder, categories = dictionaries)
    data_i = data_i.sample(N)
    
    
    data_i['F'] = data_i.title.apply(CovidRelated)
    data_i.loc[data_i['subreddit'] == 'Coronavirus', 'F'] = 1
    data_i.loc[data_i['subreddit'] == 'other', 'title'] = 'NA'

    all_categories = []
    for c in data_i.subreddit.tolist():
        new_c = c.lower()
        if c.lower() in dictionaries.keys():
            new_c = dictionaries[new_c]
        else:
            new_c = 'other'
        all_categories.append(new_c)

    Titles = data_i.title.tolist()
    date_i = data_i.date_char.tolist()
    time_of_post = data_i.time_utc.tolist()

    df = pd.DataFrame(list(zip(all_categories,Titles, time_of_post)), columns = ['cat','title', 'timepost'])
    df['col'] = [color_dict[cat] for cat in df.cat.tolist()]
    df = df.sort_values(by=['cat'])
    
    df.head()
    
    
    rngsize = int(np.sqrt(N))
    axis_range = [str(n) for n in range(rngsize)]
    X, Y = np.meshgrid(range(rngsize), range(rngsize))
    x_ = [item+0.5 for sublist in X for item in sublist]
    y_ = [item+0.5 for sublist in Y for item in sublist]
    
    df['x'] = x_
    df['y'] = y_
    
    # # NO TITLE
    edgesize = 200
    p_i = figure(x_range=axis_range, y_range=axis_range,
               x_axis_location="above", plot_width=edgesize, plot_height=edgesize,
               tools=TOOLS, toolbar_location='below',tooltips=[('title', '@title')])

    p_i.axis.visible = False
    p_i.rect(x="x", y="y", width=1, height=1,
           source=df,
           fill_color='col',#{'field': 'cat', 'transform': mapper},
           line_color='white', line_width = 1.2)
    
    return(p_i)

## Infection of r/all

First, I read all days and I create a dataframe with the daily activity. Then, I group the subreddits in categories based on lists that I downloaded from reddit (list of subreddits)

In [52]:
lifestyle_list = get_list_of_subreddits('lifestyle', 3)
politics_list = get_list_of_subreddits('politics', 3)
entertainment_list = get_list_of_subreddits('entertainment', 3)
discussion_list = get_list_of_subreddits('discussion', 3)
pan_list = get_list_of_subreddits('pan', 2)
other_list = get_list_of_subreddits('other', 3)
meme_list = get_list_of_subreddits('meme', 3)

soccer_list = get_list_of_subreddits('soccer', 3)
nhl_list = get_list_of_subreddits('nhl', 2)
nfl_list = get_list_of_subreddits('nfl', 2)
nba_list = get_list_of_subreddits('nba', 2)
other_sports_list = get_list_of_subreddits('sport', 3)

educational_list = get_list_of_subreddits('educational', 3)

big_sports_list = nhl_list + nba_list + nfl_list + soccer_list + other_sports_list

#nsfw_list = get_list_of_subreddits('NSFW', 3)
# dict_nsfw = dict.fromkeys(nsfw_list, 'nsfw')
#dict_gaming = dict.fromkeys(gaming_list, 'gaming')
#dict_food = dict.fromkeys(food_list, 'food')
#dict_sports = dict.fromkeys(big_sports_list, 'sports')
#dict_youtube = dict.fromkeys(youtube_list, 'youtube')
# dict_discussion = dict.fromkeys(discussion_list, 'discussion')
#dict_lifestyle = dict.fromkeys(lifestyle_list, 'lifestyle')
#dict_other = dict.fromkeys(other_list, 'other')

dict_pan = dict.fromkeys(pan_list, 'pan')

dict_edu = dict.fromkeys(educational_list, 'educational')
dict_corona = {'coronavirus':'coronavirus'}
dict_politics = dict.fromkeys(politics_list, 'news/politics')
dict_entertainment = dict.fromkeys(entertainment_list + big_sports_list, 'entertainment')
dict_humor = dict.fromkeys(meme_list, 'humor')
dict_other = {'other':'other'}

dictionaries = {**dict_corona, **dict_politics, **dict_entertainment, **dict_edu, **dict_other}

In [53]:
folder = '/home/jpre/Documents/DTU/SocialDataViz/Project/2020/subreddits/all/days/'
column_names = ['time_utc', 'epoch_utc', 'time_local', 'epoch_local', 'id', 'score', 'subreddit', 'ncomments']

ndays = len(os.listdir(folder))

List_of_DFs = [GetData(w,'all',folder='days', categories= dictionaries) for w in range(106)]   

data_all = pd.concat(List_of_DFs)

data5_by_subreddit = pd.pivot_table(data_all, index=['DayCount'], aggfunc='size', columns=['subreddit']).fillna(0)
totals = data5_by_subreddit.sum(axis=1)
data5_by_subreddit = data5_by_subreddit.divide(totals, axis=0) # finding the probabilities

data5_groups = data5_by_subreddit.groupby(dictionaries, axis = 1).sum()
data5_groups['zero'] = 0
data5_groups.head()

Unnamed: 0_level_0,coronavirus,educational,entertainment,news/politics,other,zero
DayCount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0,0.017857,0.155612,0.010204,0.816327,0
1,0.0,0.027708,0.18136,0.020151,0.770781,0
2,0.0,0.055,0.1075,0.0525,0.785,0
3,0.0,0.044828,0.093103,0.051724,0.810345,0
4,0.0,0.017766,0.180203,0.032995,0.769036,0


Now, I download data of contagions to link it to reddit events
http://www.euro.who.int/en/health-topics/health-emergencies/coronavirus-covid-19/news/news/2020/01/2019-ncov-outbreak-first-cases-confirmed-in-europe

In [54]:
data_world = pd.read_csv('./covid_data')
# eurodates = str(data_world['day']) + '/' + str(data_world['month']) + '/' + str(data_world['year'])
data_world['date'] = pd.to_datetime(data_world['dateRep'],  format='%d/%m/%Y')

data_europe = data_world[data_world.continentExp.isin(['Europe'])]
# data_europe = data_world[data_world.countriesAndTerritories.isin(['Italy'])]
data_euro_group = data_europe.groupby(['date']).agg({'cases':'sum','deaths':'sum'})
data_euro_group = data_euro_group.iloc[1:107]

data_euro_group['DayCount'] = range(106)
data_euro_group = data_euro_group.set_index('DayCount')
data_euro_group = data_euro_group.reset_index()
data_euro_group.head()

data_euro_group['deathcum'] = data_euro_group.deaths.cumsum()
data_euro_group['casescum'] = data_euro_group.cases.cumsum()

data_euro_group['deathcum'] = data_euro_group.deathcum.div(max(data_euro_group.deathcum))
data_euro_group['casescum'] = data_euro_group.casescum.div(max(data_euro_group.casescum))

data_euro_group.head()


Events = {'first_EU_case' : 24,'is_pandemic' : 69, 'boris_in_hospital' : 95}
# for event in Events:
#     data_euro_group[event] = [int(x>=Events[event]) for x in range(data_euro_group.shape[0])]
df_Events = pd.DataFrame({'event' : list(Events.keys()),
                          'day' : list(Events.values()),
                          'description' : ['First confirmed case \nin Europe (France)', # January 24
                                           #'WHO names the virus COVID-19', # February 11
                                           'WHO declares COVID-19 a pandemic', # March 10
                                           #'US bans flights from foreign countries', # March 11
                                           #'Lockdown starts in Denmark', # March 13
                                           'Boris Johnson is admitted\nto the Hospital']})
df_Events['y_text'] = [1,1, 1]
df_Events['y_tri_inv'] = 1
df_Events['y_tri'] = [1.5,1.5, 1.5]
df_Events['y_line'] = 1
df_Events

Unnamed: 0,event,day,description,y_text,y_tri_inv,y_tri,y_line
0,first_EU_case,24,First confirmed case \nin Europe (France),1,1,1.5,1
1,is_pandemic,69,WHO declares COVID-19 a pandemic,1,1,1.5,1
2,boris_in_hospital,95,Boris Johnson is admitted\nto the Hospital,1,1,1.5,1


Creating events dataframe

In [58]:
from bokeh.models import BoxAnnotation, Label, InvertedTriangle, Text, Triangle


TOOLS = "hover,save,reset"
source = ColumnDataSource(data=data5_groups)


TOOLTIPS = [('',"@description")]
width_main = 1250

p = figure(plot_width=width_main, plot_height=340, title = None, x_range=(0,105), y_range=(0,1), tools = "")

# https://javier.xyz/cohesive-colors/
days = list(range(106))
mc = ['hotpink', 'slateblue', 'gold', 'seagreen','indianred', 'lightgrey']
mc = ['#F93822', '#BD7F37', '#FAD0C9', '#76528B','#D64161', '#C7D3D4']
mc = ['#F93822', '#ABD1C9','#DFDCE5','#DBB04A', '#97B3D0', '#C7D3D4']
mc = ['#F93822', '#97B3D0','#ABD1C9', '#DBB04A', '#ffd1d7', '#e6eeee']
columns = ['coronavirus','entertainment', 'humor','news/politics', 'educational', 'other']


#CBCE91FF) and Purple Sapphire (#76528BFF
color_dict = dict(zip(columns, mc))

# r = p.varea_stack(columns, x='DayCount', color=mc, source=source, alpha = 1)
agg_days = 1
data_by_week = data5_groups
data_by_week['week'] = data_by_week.reset_index().DayCount.floordiv(agg_days)
data_by_week = data_by_week.groupby(['week']).mean().reset_index()
data_by_week['week'] = data_by_week['week']*agg_days
r = p.varea_stack(columns, x='week', color=mc, source=ColumnDataSource(data=data_by_week), alpha = 1)

 # Adjusting plot parameters
p.grid.visible = False
p.outline_line_color = "white"
p.background_fill_color = color_dict['other']
p.background_fill_alpha = 1


source_eurodata = ColumnDataSource(data=data_euro_group)
c_event = 'black'
w_event = 1.5
dash_event = 'dashed'

for event in Events:
    y_event = df_Events[df_Events.event == event].y_line.values[0]
    day_event = df_Events[df_Events.event == event].day.values[0]
    p.line(x=[day_event, day_event], y = [0,y_event],
           line_color = c_event, line_width = w_event, line_dash = dash_event)


 # Adding event annotations
InvTri = InvertedTriangle(x="day", y="y_tri_inv", size=14, line_color=None, fill_color='black')
p.add_glyph(ColumnDataSource(data=df_Events), InvTri)  

event_font_size = '12pt'
event_line1 = 1.12
event_line2 = 1.04

# Adding event labels
event1 = Label(x=23, y=event_line1, text='First confirmed COVID-19', render_mode='css', text_color = 'black', text_align = 'left',
               text_font_size = event_font_size)
event11 = Label(x=23, y=event_line2, text='case in Europe', render_mode='css', text_color = 'black', text_align = 'left',
               text_font_size = event_font_size)
p.add_layout(event1)
p.add_layout(event11)

event2 = Label(x=68, y=event_line1, text='WHO declares', render_mode='css', text_color = 'black', text_align = 'left',
               text_font_size = event_font_size)
event21 = Label(x=68, y=event_line2, text='COVID-19 is a pandemic', render_mode='css', text_color = 'black', text_align = 'left',
               text_font_size = event_font_size)
p.add_layout(event2)
p.add_layout(event21)

event3 = Label(x=94, y=event_line1, text='UK PM admitted', render_mode='css', text_color = 'black', text_align = 'left',
               text_font_size = event_font_size)
event31 = Label(x=94, y=event_line2, text='to the hospital', render_mode='css', text_color = 'black', text_align = 'left',
               text_font_size = event_font_size)
p.add_layout(event3)
p.add_layout(event31)

# Creating 2020 arrow
start2020 = Label(x=5.5, y=0.85, text='2020', render_mode='css',
                    text_color = 'black', text_align = 'right',
                  text_font_style = 'bold', text_font_size = '18px')
p.add_layout(start2020)


arrow2020 = Arrow(end=NormalHead(size = 12),
                   x_start=5.5, y_start=0.90, x_end=5.5+2, y_end=0.90, line_width=0)
p.add_layout(arrow2020)

# Creating legend
legend_elements = []
for name,col in zip(columns,mc):
    r = p.square(x=[-1],y=[1], fill_color = col, line_color = None)
    legend_elements.append((name, [r]))

# Configuring axis
p.xaxis.visible = True
p.yaxis.visible = True
p.yaxis.axis_line_width = 2
p.yaxis.major_tick_line_width = 2
p.yaxis.axis_label = None
p.yaxis.ticker = [0,1]
p.yaxis.major_label_text_font_size = '16pt'
p.yaxis.axis_label_text_color = 'black'

x_ticks = df_Events.day.tolist()
p.xaxis.ticker = x_ticks
p.xaxis.axis_line_width = 2
p.xaxis.major_tick_line_width = 2
p.xaxis.major_label_overrides = dict(zip(x_ticks, ['24/01/20', '13/03/20', '06/04/20']))
p.xaxis.major_label_text_font_size = '16pt'
p.xaxis.major_label_text_baseline = 'bottom'
p.xaxis.major_label_standoff = 10
p.xaxis.major_label_text_color = 'black'

# legend = Legend(items=legend_elements, location="bottom_left")
# p.add_layout(legend, 'right')
# p.legend.orientation = "vertical"
# p.legend.background_fill_color = None
# p.legend.background_fill_alpha = 0
# p.legend.border_line_width = 0
# p.legend.label_text_font_size = '12pt'
# p.legend.glyph_width = 30
# p.legend.glyph_height = 30
# p.legend.spacing = 0
# p.legend.label_standoff = 0
# p.legend.title = 'Categories'
# p.legend.title_text_font_style = 'bold'
# p.legend.title_text_font_size = '14pt'

title_axis_y = 'Proportion of posts'
title_y1 = Label(x=-15.5, y=0.45, text=title_axis_y,
                render_mode='css', text_color = 'black', text_align = 'left',
               text_font_size = '18pt', angle = 90, angle_units = 'deg')
p.add_layout(title_y1)
title_axis_y2 = 'in reddit main page'
title_y2 = Label(x=-13.5, y=0.45, text=title_axis_y2,
                render_mode='css', text_color = 'black', text_align = 'left',
               text_font_size = '18pt', angle = 90, angle_units = 'deg')
p.add_layout(title_y2)


p.min_border_left = 80
p.min_border_top = 50
p.min_border_bottom = 20

## dummy plot with arrows
p_dummy = figure(plot_width=1250, plot_height=120, title = None, x_range=(0,105), y_range=(0,1), tools = "")
p_dummy.axis.visible = False
p_dummy.grid.visible = False
p_dummy.outline_line_color = None

x1 = 24
x2 = 69
x3 = 95

arrow1 = Arrow(end=NormalHead(size = 12),
                   x_start=x1, y_start=1, x_end=x1, y_end=0.1, line_width=2, line_color = 'black')
p_dummy.add_layout(arrow1)
p_dummy.line(x=[x1,x1], y = [1,0.1], line_width=2, line_color = 'black')

arrow2 = Arrow(end=NormalHead(size = 12),
                   x_start=x2, y_start=1, x_end=x2, y_end=0.1, line_width=2)
p_dummy.add_layout(arrow2)
p_dummy.line(x=[x2,x2], y = [1,0.1], line_width=2, line_color = 'black')

arrow3 = Arrow(end=NormalHead(size = 12),
                   x_start=x3, y_start=1, x_end=x3, y_end=0.1, line_width=2)
p_dummy.add_layout(arrow3)
p_dummy.line(x=[x3,x3], y = [1,0.1], line_width=2, line_color = 'black')

explanation = 'We could extract a random sample of\n100 posts from any of these days\nand get something like this...'
text1 = Text(x='x', y='y', text = 'text', text_font_size = '14pt', text_font = 'Helvetica')
p_dummy.add_glyph(ColumnDataSource(data=pd.DataFrame({'x':[(x1+x2)/3], 'y':[0.24], 'text':[explanation]})), text1)

p1 = CreateOneTiledPlot(24)
p2 = CreateOneTiledPlot(70)
p3 = CreateOneTiledPlot(96)

space1 = Spacer(width=280, height=1)
space2 = Spacer(width=290, height=1)
space3 = Spacer(width=85, height=1)

first_plot = layout([[gridplot([p,p_dummy], ncols=1)],
             [space1, p1, space2, p2, space3, p3]])
show(first_plot)

# show(gridplot([p,p_dummy], ncols=1))

In [18]:
from bokeh.embed import components

script_ts, div_ts = components(first_plot)
f1 = open('/home/jpre/Documents/DTU/SocialDataViz/Project/final/first_plot.html', 'w')
f1.write(script_ts)
f1.close()

f2 = open('/home/jpre/Documents/DTU/SocialDataViz/Project/final/first_plot_div.html', 'w')
f2.write(div_ts)
f2.close()

## Creating a tile plot for one day to show

In [None]:
data_i = GetData(0, 'all', folder='days')
data_i['F'] = data_i.title.apply(CovidRelated)
'other' in data_i.subreddit.unique()

This plot can be misleading, as it appears that Corona is well locked in its community, and that takes __only__ 20% of the posts. That is as False as it gets fo two different reasons:

- first of all, 20% of r/all is quite a chunk, considering that those posts before were full with other content. Corona did not exist before that, so now it has its own subreddit and takes 20% ish of the relevance.

- Second, the other categories are talking corona separately as well, as it affected all layers of our internet culture.

So let's address this step by step:

## Infection by coummunity

In [59]:
output_notebook()

items = [] # items for legend
lines ={} # bar-information for plotting
days = list(range(106))

rngsize = 10
TOOLS = "hover,save,reset"

List_of_plots = []
entertainment_communities = ['movies', 'television', 'sports']
news_communities = ['worldnews', 'news', 'politics']
edu_communities = ['science', 'technology', 'economics']
# discussion_communities = ['askreddit', 'ama', 'askscience']
humor_communities = ['memes','funny', 'humor']
#country_communities = ['spain', 'denmark', 'italy']

communities_to_plot = entertainment_communities + news_communities + edu_communities # + humor_communities

for indx,i in enumerate(communities_to_plot):
    
    p = figure(plot_width=int(width_main/3), plot_height=130,x_range=(0,105), y_range=(0,1),
               tools = TOOLS, title = 'r/'+ str(i))
    
    df_ts = GetDF(sr=i,folder='days')
    
    df_ts['first_EU_case'] = [3*int(x>=24)-1 for x in range(df_ts.shape[0])]
    df_ts['is_pandemic'] = [3*int(x>=69)-1 for x in range(df_ts.shape[0])]
    df_ts['boris_in_hospital'] = [3*int(x>=95)-1 for x in range(df_ts.shape[0])]
    
    source = ColumnDataSource(data=df_ts)
    
    p.varea(y1='f',y2='zero', x='timestep', color=mc[0], source=source, alpha = 1)
    
    for event in Events:
        y_event = df_Events[df_Events.event == event].y_line.values[0]
        day_event = df_Events[df_Events.event == event].day.values[0]
        p.line(x=[day_event, day_event], y = [0,y_event],
               line_color = c_event, line_width = w_event, line_dash = dash_event)
        
    highlight_communities = ['worldnews', 'economics', 'science', 'politics']

    if i not in highlight_communities:
        high_box = BoxAnnotation(bottom=0, top = 1,  fill_alpha=0.0, fill_color='white')
        p.add_layout(high_box)

    
    # Adjusting plot parameters
    #p.x_range.range_padding = 0.05
    p.xaxis.axis_label = ""
    p.yaxis.axis_label = ""
    #p.yaxis.visible = False
    p.xaxis.visible = False
    p.yaxis.visible = True
    p.yaxis.axis_line_width = 2
    p.yaxis.ticker = []
    p.title.text_font_size = '16pt'
    p.grid.visible = False
    p.background_fill_color = color_dict[dictionaries[i]]
    p.background_fill_alpha = 1
    
    InvTri = InvertedTriangle(x="day", y="y_tri_inv", size=12, line_color=None, fill_color='black')
    p.add_glyph(ColumnDataSource(data=df_Events), InvTri)  
        
    List_of_plots.append(p)
    
    
p_all = gridplot(List_of_plots, ncols=3)

show(p_all)

In [118]:
script_ts, div_ts = components(p_all)
f1 = open('/home/jpre/Documents/DTU/SocialDataViz/Project/final/second_plot.html', 'w')
f1.write(script_ts)
f1.close()

f2 = open('/home/jpre/Documents/DTU/SocialDataViz/Project/final/second_plot_div.html', 'w')
f2.write(div_ts)
f2.close()

## Infection detailed matrix

In [186]:
from bokeh.io import show
from bokeh.models import BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter
from bokeh.plotting import figure, output_notebook, show

#https://jakevdp.github.io/PythonDataScienceHandbook/04.08-multiple-subplots.html

TOOLS = "hover,save,reset"
#TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

List_of_plots = []
N = 100
for week_counter in range(105):

    data_i = GetData(week_counter, 'worldnews', folder='days', categories = dictionaries)
    data_i = data_i.sample(N)

    all_categories = []
    for c in data_i.subreddit.tolist():
        new_c = c.lower()
        if c.lower() in dictionaries.keys():
            new_c = dictionaries[new_c]
        else:
            new_c = 'other'
        all_categories.append(new_c)

    Titles = data_i.title.tolist()
    date_i = data_i.date_char.tolist()
    time_of_post = data_i.time_utc.tolist()

    df = pd.DataFrame(list(zip(all_categories,Titles, time_of_post)), columns = ['cat','title', 'timepost'])

    df['col'] = [color_dict[cat] for cat in df.cat.tolist()]
    df['F'] = df.title.apply(CovidRelated)
    df.loc[df.F > 0,'col'] = color_dict['coronavirus']
    df = df.sort_values(by=['cat'])

    df.head()


    rngsize = int(np.sqrt(N))
    axis_range = [str(n) for n in range(rngsize)]
    X, Y = np.meshgrid(range(rngsize), range(rngsize))
    x_ = [item+0.5 for sublist in X for item in sublist]
    y_ = [item+0.5 for sublist in Y for item in sublist]

    df['x'] = x_
    df['y'] = y_

    # # NO TITLE
    edgesize = 170
    p_i = figure(x_range=axis_range, y_range=axis_range,
               x_axis_location="above", plot_width=edgesize, plot_height=edgesize,
               tools=TOOLS, toolbar_location='below',tooltips=[('title', '@title')])

    p_i.axis.visible = False
    p_i.rect(x="x", y="y", width=1, height=1,
           source=df,
           fill_color='col',#{'field': 'cat', 'transform': mapper},
           line_color='white', line_width = 1.2)


    List_of_plots.append(p_i)


output_notebook()

p_all = gridplot(List_of_plots, ncols=10)

show(p_all)

In [187]:
script_ts, div_ts = components(p_all)
f1 = open('/home/jpre/Documents/DTU/SocialDataViz/Project/final/third_plot.html', 'w')
f1.write(script_ts)
f1.close()

f2 = open('/home/jpre/Documents/DTU/SocialDataViz/Project/final/third_plot_div.html', 'w')
f2.write(div_ts)
f2.close()

## Weekly analysis

In [157]:
df1 = data5_groups[:14]
df1['dummy'] = 1
mean_categories = df1.groupby(['dummy']).mean().to_dict()
mean_categories
# columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


{'coronavirus': {1: 0.0},
 'educational': {1: 0.023939082113893128},
 'entertainment': {1: 0.13579605345293014},
 'humor': {1: 0.26687683527520295},
 'news/politics': {1: 0.02980125142443829},
 'other': {1: 0.5435867777335356},
 'zero': {1: 0},
 'week': {1: 3}}

In [188]:
from sklearn import preprocessing
from statsmodels.nonparametric.smoothers_lowess import lowess

weeks_wo_covid = list(range(14))
# data_by_week = data_by_week.drop(columns=['coronavirus'])

df1 = data5_groups[:14]
df1['dummy'] = 1
mean_categories = df1.groupby(['dummy']).mean().to_dict()

df2 = data5_groups

for column in ['entertainment', 'humor', 'news/politics', 'educational']:
    
    data3 = df2.reset_index()
    data3[column] = data3[column].div(mean_categories[column][1])
    
    x1 = data3.DayCount.tolist()
    y1 = data3[column].tolist()

    smooth_points = lowess(exog = x1, endog = y1, frac = 0.4)

    x1 = np.transpose(smooth_points)[0]
    y1_smooth = np.transpose(smooth_points)[1]
    
    smooth_column = column + '_sm'
    df2[column] = data3[column]
    df2[smooth_column] = y1_smooth

df2 = df2.reset_index()
# x = data_by_week.values #returns a numpy array
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# df = pd.DataFrame(x_scaled)

# df = df.rename(columns = dict(zip(range(4) , data_by_week.columns.tolist()))).reset_index()
# df
# df.plot()

output_notebook()

TOOLS = "hover,save,reset"

p3 = figure(plot_width=900, plot_height=300, title = 'Trend', x_range=(0,105), y_range=(-1.5,3.5), tools = TOOLS)

items = [] # items for legend
lines = {} # bar-information for plotting

source = ColumnDataSource(data=df2)

for column,color in zip(list(color_dict.keys()),list(color_dict.values())):
    if(column not in ['coronavirus','other']):
        #p3.line(x='week', y = column,color=color, source=source, alpha = 1,  line_width = 2)
        r = p3.circle(x='DayCount', y = column, color=color, source=source,
                      line_width = 2, alpha = 0.3)
        colname = column + '_sm'
        lines[colname] = p3.line(x='DayCount', y = colname, source=source,
                           color=color, alpha = 0.9, line_width = 3)
        items.append((colname, [lines[colname]]))
# hatch_pattern = '/'

for event in Events:
    y_event = df_Events[df_Events.event == event].y_line.values[0]
    day_event = df_Events[df_Events.event == event].day.values[0]
    p3.line(x=[day_event, day_event], y = [-2,4],
           line_color = c_event, line_width = w_event, line_dash = dash_event)

df_Events_plus = df_Events
df_Events_plus.y_tri_inv = 3.5
InvTri = InvertedTriangle(x="day", y="y_tri_inv", size=16, line_color=None, fill_color='black')
p3.add_glyph(ColumnDataSource(data=df_Events_plus), InvTri)  
        

 # Adjusting plot parameters
p3.xaxis.axis_label = "Day"
p3.yaxis.axis_label = "Topics evolution"
p3.axis.visible = False
p3.grid.visible = False
p3.background_fill_color = "white"
p3.background_fill_alpha = 1

 # Adding horizontal line in 1
hline = Span(location=1, dimension='width', line_color='grey', line_width=1)
p3.renderers.extend([hline])

legend = Legend(items=items)
p3.add_layout(legend, 'right')
p3.legend.orientation = "vertical"
p3.legend.background_fill_color = None
p3.legend.background_fill_alpha = 0
p3.legend.border_line_width = 0
p3.legend.click_policy="hide"


show(p3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# from bokeh.embed import components

# script, div = components(p_all)
# f1 = open('/home/jpre/Documents/DTU/SocialDataViz/Project/worldnews_daily_script.html', 'w')
# f1.write(script)
# f1.close()

# f2 = open('/home/jpre/Documents/DTU/SocialDataViz/Project/worldnews_daily_div.html', 'w')
# f2.write(div)
# f2.close()

In [49]:
List_of_DFs_2019 = [GetData(w,'all',folder='days', categories= dictionaries, year='2019') for w in range(70)]   
data_all_2019 = pd.concat(List_of_DFs_2019)
data_by_subreddit_2019 = pd.pivot_table(data_all_2019, index=['DayCount'], aggfunc='size', columns=['subreddit']).fillna(0)
totals2019 = data_by_subreddit_2019.sum(axis=1)
data_by_subreddit_2019 = data_by_subreddit_2019.divide(totals2019, axis=0) # finding the probabilities
data_groups2019 = data_by_subreddit_2019.groupby({**dict_pan, **dict_humor}, axis = 1).sum()
means2019 = {'humor' : data_groups2019.humor.mean(), 'pan' : 0}
means2019

{'humor': 0.23353077566965602, 'pan': 0}

In [51]:
output_notebook()
from sklearn import preprocessing
from statsmodels.nonparametric.smoothers_lowess import lowess
TOOLS = "hover,save,reset"


List_of_DFs_2019 = [GetData(w,'all',folder='days', categories= dictionaries, year='2019') for w in range(70)]   
data_all_2019 = pd.concat(List_of_DFs_2019)
data_by_subreddit_2019 = pd.pivot_table(data_all_2019, index=['DayCount'], aggfunc='size', columns=['subreddit']).fillna(0)
totals2019 = data_by_subreddit_2019.sum(axis=1)
data_by_subreddit_2019 = data_by_subreddit_2019.divide(totals2019, axis=0) # finding the probabilities
data_groups2019 = data_by_subreddit_2019.groupby({**dict_pan, **dict_humor}, axis = 1).sum()
means2019 = {'humor' : data_groups2019.humor.mean(), 'pan' : 0}


List_of_DFs = [GetData(w,'all',folder='days') for w in range(106)]   
data_all = pd.concat(List_of_DFs)

data_by_subreddit = pd.pivot_table(data_all, index=['DayCount'], aggfunc='size', columns=['subreddit']).fillna(0)
totals = data_by_subreddit.sum(axis=1)
data_by_subreddit = data_by_subreddit.divide(totals, axis=0) # finding the probabilities
data_groups = data_by_subreddit.groupby({**dict_pan, **dict_humor}, axis = 1).sum()
data_groups['zero'] = 0
data_groups.head()

df2 = data_groups

for column in ['humor', 'pan']:
    
    data3 = df2.reset_index()
    data3[column] = data3[column] - means2019[column]
    
    x1 = data3.DayCount.tolist()
    y1 = data3[column].tolist()

    smooth_points = lowess(exog = x1, endog = y1, frac = 0.4)

    x1 = np.transpose(smooth_points)[0]
    y1_smooth = np.transpose(smooth_points)[1]
    
    smooth_column = column + '_sm'
    df2[column] = data3[column]
    df2[smooth_column] = y1_smooth

df2 = df2.reset_index()


p4 = figure(plot_width=900, plot_height=300, title = 'Trend', x_range=(0,105), y_range=(-1,1), tools = TOOLS)

items = [] # items for legend
lines = {} # bar-information for plotting

source = ColumnDataSource(data=df2)

r = p4.circle(x='DayCount', y = 'humor', color=color_dict['humor'], source=source,line_width = 2, alpha = 0.5)
colname = 'humor_sm'
lines[colname] = p4.line(x='DayCount', y = colname, source=source, color=color_dict['humor'], alpha = 0.9, line_width = 3)
items.append((colname, [lines[colname]]))
r = p4.circle(x='DayCount', y = 'pan', color='purple', source=source, line_width = 2, alpha = 0.8)

for event in Events:
    y_event = df_Events[df_Events.event == event].y_line.values[0]
    day_event = df_Events[df_Events.event == event].day.values[0]
    p4.line(x=[day_event, day_event], y = [-2,4],
           line_color = c_event, line_width = w_event, line_dash = dash_event)

df_Events_plus = df_Events
df_Events_plus.y_tri_inv = 3.5
InvTri = InvertedTriangle(x="day", y="y_tri_inv", size=16, line_color=None, fill_color='black')
p4.add_glyph(ColumnDataSource(data=df_Events_plus), InvTri)  
        

 # Adjusting plot parameters
p4.xaxis.axis_label = "Day"
p4.yaxis.axis_label = "Topics evolution"
p4.axis.visible = False
p4.grid.visible = False
p4.background_fill_color = "white"
p4.background_fill_alpha = 1

 # Adding horizontal line in 1
hline = Span(location=0, dimension='width', line_color='grey', line_width=1)
p4.renderers.extend([hline])

legend = Legend(items=items)
p4.add_layout(legend, 'right')
p4.legend.orientation = "vertical"
p4.legend.background_fill_color = None
p4.legend.background_fill_alpha = 0
p4.legend.border_line_width = 0
p4.legend.click_policy="hide"


show(p4)

## Trial with the 1,1 line

In [179]:

df1 = data5_groups[:14]
df1['dummy'] = 1
mean_categories = df1.groupby(['dummy']).mean().to_dict()

df2 = data5_groups

for column in ['entertainment', 'humor', 'news/politics', 'educational']:
    
    data3 = df2.reset_index()
    data3[column] = data3[column].div(mean_categories[column][1])
    
    x1 = data3.DayCount.tolist()
    y1 = data3[column].tolist()

    smooth_points = lowess(exog = x1, endog = y1, frac = 0.4)

    x1 = np.transpose(smooth_points)[0]
    y1_smooth = np.transpose(smooth_points)[1]
    
    smooth_column = column + '_sm'
    df2[column] = data3[column]
    df2[smooth_column] = y1_smooth

df2 = df2.reset_index()

TOOLS = "hover,save,reset"

p3 = figure(plot_width=900, plot_height=300, title = 'Trend', x_range=(0,105), y_range=(-1.5,3.5), tools = TOOLS)

items = [] # items for legend
lines = {} # bar-information for plotting

source = ColumnDataSource(data=df2)

for column,color in zip(list(color_dict.keys()),list(color_dict.values())):
    if(column not in ['coronavirus','other']):
        #p3.line(x='week', y = column,color=color, source=source, alpha = 1,  line_width = 2)
        r = p3.circle(x='DayCount', y = column, color=color, source=source,
                      line_width = 2, alpha = 0.3)
        colname = column + '_sm'
        lines[colname] = p3.line(x='DayCount', y = colname, source=source,
                           color=color, alpha = 0.9, line_width = 3)
        items.append((colname, [lines[colname]]))
# hatch_pattern = '/'

for event in Events:
    y_event = df_Events[df_Events.event == event].y_line.values[0]
    day_event = df_Events[df_Events.event == event].day.values[0]
    p3.line(x=[day_event, day_event], y = [-2,4],
           line_color = c_event, line_width = w_event, line_dash = dash_event)

df_Events_plus = df_Events
df_Events_plus.y_tri_inv = 3.5
InvTri = InvertedTriangle(x="day", y="y_tri_inv", size=16, line_color=None, fill_color='black')
p3.add_glyph(ColumnDataSource(data=df_Events_plus), InvTri)  
        

 # Adjusting plot parameters
p3.xaxis.axis_label = "Day"
p3.yaxis.axis_label = "Topics evolution"
p3.axis.visible = False
p3.grid.visible = False
p3.background_fill_color = "white"
p3.background_fill_alpha = 1

 # Adding horizontal line in 1
hline = Span(location=1, dimension='width', line_color='grey', line_width=1)
p3.renderers.extend([hline])

legend = Legend(items=items)
p3.add_layout(legend, 'right')
p3.legend.orientation = "vertical"
p3.legend.background_fill_color = None
p3.legend.background_fill_alpha = 0
p3.legend.border_line_width = 0
p3.legend.click_policy="hide"


show(p3)

73

In [181]:
past_year = 2019

folder = ('/home/jpre/Documents/DTU/SocialDataViz/Project/{0}/subreddits/all/days/').format(past_year)
column_names = ['time_utc', 'epoch_utc', 'time_local', 'epoch_local', 'id', 'score', 'subreddit', 'ncomments']

ndays = len(os.listdir(folder))

List_of_DFs = [GetData(w,'all',folder='days', categories= dictionaries, year='2019') for w in range(ndays)]   

data_all_past = pd.concat(List_of_DFs)

data5_by_subreddit_past = pd.pivot_table(data_all_past, index=['DayCount'], aggfunc='size', columns=['subreddit']).fillna(0)
totals = data5_by_subreddit_past.sum(axis=1)
data5_by_subreddit_past = data5_by_subreddit_past.divide(totals, axis=0) # finding the probabilities

data5_groups_past = data5_by_subreddit_past.groupby(dictionaries, axis = 1).sum()
data5_groups_past['zero'] = 0
data5_groups_past.head()


List_of_plots4 = []
categories_to_plot = ['educational', 'entertainment', 'humor', 'news/politics']
for cat in categories_to_plot:
    p4 = figure(plot_width=200, plot_height=200, title = cat, x_range=(0,0.4), y_range=(0,0.4))
    data2020 = data5_groups[cat].tolist()
    data2019 = data5_groups_past[cat].tolist()
    p4.circle(data2019,data2020[:len(data2019)])
    
    List_of_plots4.append(p4)
    
show(gridplot(List_of_plots4, ncols = 2))

In [173]:
data5_groups.humor.tolist

<bound method IndexOpsMixin.tolist of DayCount
0      0.125000
1      0.345088
2      0.422500
3      0.403448
4      0.431472
         ...   
101    0.325815
102    0.372500
103    0.342500
104    0.362500
105    0.347500
Name: humor, Length: 106, dtype: float64>

## OTHER TRIALS

In [None]:
from bokeh.models import BoxAnnotation, Label, InvertedTriangle, Text

output_notebook()
TOOLS = "hover,save,reset"
data_test = data5_groups
data_test['zero'] = 0
source = ColumnDataSource(data=data_test)


TOOLTIPS = [('',"@description")]

p = figure(plot_width=1000, plot_height=300, title = None, x_range=(0,105), y_range=(0,1), tools = "")

# https://javier.xyz/cohesive-colors/
days = list(range(106))
mc = ['hotpink', 'slateblue', 'gold', 'seagreen','indianred']
columns = ['coronavirus','entertainment', 'humor','news/politics', 'educational']

color_dict = dict(zip(columns, mc))

items = [] # items for legend
line ={} # bar-information for plotting

for category in columns:
    #data_test[category] = data_test[category].cumsum()
    #data_test[category] = data_test[category].div(max(data_test[category].values))
    source = ColumnDataSource(data=data_test)
     # Selecting hour and probabilities for plotting category i
     # adding bar for category i
    line[category] = p.varea(y1 = 'zero', y2 = category, x='DayCount', source=source,
                     muted_color = color_dict[category], muted_alpha = 0.5, muted = True,
                     color = color_dict[category], alpha = 0.9)    
    items.append((category, [line[category]]))

legend = Legend(items=items, location=(0, 10))
p.add_layout(legend, 'right')
p.legend.click_policy="hide"


show(p)