#  GTA News 50 Clusters Chart Using Packed Circles Rendering v.2

## Load 50 cluster dataframe

In [1]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from collections import Counter

# directories
dir_path = os.getcwd()
#print('Working dir: ' + dir_path)

# Build the data folder with os.path.join instead of a hand-written
# backslash literal: the old string contained the invalid escape '\d' and only
# resolved on Windows. The trailing '' component keeps the trailing separator,
# so file names can still be appended by plain concatenation.
local_path = os.path.join(dir_path, '..', 'gta-news', 'doc2vec', 'data', '')

# NOTE(review): read_pickle unpickles arbitrary objects - only load files
# from a trusted source.
df = pd.read_pickle(local_path+'backup'+'-gta.50'+'.pickle')

In [2]:
# Preview the first rows of the loaded cluster dataframe.
df.head()

Unnamed: 0,date,title,url,cluster,vector
0,2019-04-20,"""this is why we can't have nice things in nyc,...",http://dagblog.com/reader-blogs/why-we-cant-ha...,13,"[-0.47769657, 0.3672163, 0.23536347, 0.5756423..."
1,2019-04-20,rural book borrowing in peril as libraries sla...,http://easternontarionetwork.com/2019/04/20/ru...,13,"[-0.5722261, -0.26479113, -0.1152498, 0.664171..."
2,2019-04-20,"""discussing canada's new us-focused cannabis e...",http://www.benzinga.com/markets/cannabis/19/04...,11,"[0.54588157, -0.26139393, -0.18809983, -0.0384..."
3,2019-04-20,serea restaurant slated to open in hotel del c...,http://www.coronadonewsca.com/news/coronado_ho...,25,"[-0.28711024, -0.2106441, -0.61289483, 0.04533..."
4,2019-04-20,federal trial of vernon man accused of abducti...,http://www.courant.com/news/connecticut/hc-new...,46,"[0.062113207, 0.0157832, 0.23044105, 0.4720517..."


## Get cluster word counts from titles

In [3]:
import os
import sys
# Resolve the sibling doc2vec folder to an absolute path and register it on
# sys.path exactly once (presumably the location of the d2v_utils module
# imported right after this).
module_path = os.path.abspath('../gta-news/doc2vec')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import d2v_utils
# Words excluded from the per-cluster counts.
skip_terms =['toronto','canada','canadian','ontario']
# Set copy of skip_terms so the membership test below is O(1) per token.
skip_lookup = set(skip_terms)

# cluster_descr is positional: entry i holds the (word, count) pairs of the
# i-th cluster in groupby (sorted) order. Later cells index it with the cluster
# id, which assumes the ids are the contiguous integers 0..N-1.
cluster_descr = []
clusters = df.groupby(['cluster'])['title']
for cluster, titles in clusters:
    #print("\nCluster: ", cluster)
    filtered_words = []
    for title in titles:
        # Drop the last four characters of the title.
        # NOTE(review): the reason for trimming is not visible here - confirm.
        t = title[0:-4]
        #print(">>>", t)
        tokens = d2v_utils.prepare_text_for_lda(t)
        tokens = [word for word in tokens if word not in skip_lookup and not word.isdigit()]
        #print("  >", tokens)
        # extend() appends in place; the previous "list + list" re-copied the
        # whole accumulated list for every title (quadratic in cluster size).
        filtered_words.extend(tokens)
    count = Counter(filtered_words)
    # most_common(10) returns the same pairs as most_common()[:10] without
    # sorting the full vocabulary.
    current_clust_descr = count.most_common(10)
    cluster_descr.append(current_clust_descr)

clust_num = len(cluster_descr)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ibaranov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ibaranov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Show the five most frequent words of the first six clusters. Slicing
# replaces six copy-pasted print calls and does not raise if fewer than six
# clusters exist.
for descr in cluster_descr[:6]:
    print(descr[0:5])

[('announce', 195), ('result', 47), ('cannabis', 44), ('project', 43), ('update', 43)]
[('refugee', 54), ('asylum', 47), ('snowden', 46), ('shelter', 45), ('grant', 30)]
[('weather', 22), ('winter', 21), ('storm', 16), ('school', 11), ('sweep', 9)]
[('cannabis', 69), ('store', 26), ('legal', 17), ('medical', 14), ('illegal', 14)]
[('woman', 21), ('survivor', 18), ('family', 16), ('holocaust', 13), ('years', 13)]
[('research', 19), ('university', 18), ('innovation', 14), ('better', 13), ('company', 13)]


## Load cluster coordinates generated by circle packing

In [5]:
# Folder holding the circle-packing output. os.path.join keeps the path
# portable (the previous literal hard-coded Windows backslashes); the trailing
# '' component preserves the trailing separator for the concatenation below.
local_path = os.path.join(dir_path, '..', 'dev', 'packing', 'circle_pack', '')
dfCPack = pd.read_csv(local_path+'frameSequence.3.csv')
dfCPack.head()

Unnamed: 0,day,date,cluster,x,y,radius,transition
0,21,2019-01-21,0,950.6415,495.21707,162.16217,0
1,21,2019-01-21,14,1036.7328,639.83203,162.16217,0
2,21,2019-01-21,42,1118.3309,493.97827,162.16217,0
3,21,2019-01-21,18,792.79425,362.9673,146.62161,0
4,21,2019-01-21,20,640.2112,498.38596,146.62161,0


## Sort

In [6]:
# Order the packing rows by date, then transition step, then radius
# (ascending, the sort_values default).
sort_columns = ['date', 'transition', 'radius']
dfChart = dfCPack.sort_values(by=sort_columns)
dfChart.head()

Unnamed: 0,day,date,cluster,x,y,radius,transition
37,21,2019-01-21,4,869.0561,856.90643,107.77027,0
38,21,2019-01-21,16,1038.5347,832.351,107.77027,0
39,21,2019-01-21,17,1446.1097,654.7138,107.77027,0
40,21,2019-01-21,19,1484.6389,436.51022,107.77027,0
41,21,2019-01-21,22,1385.3837,314.13232,107.77027,0


## Plot

In [7]:
# defines wordcloud circular mask positioned on image
def create_circular_mask(w, h, center=None, radius=None):
    """Build a circular mask array for an image of width ``w`` and height ``h``.

    Args:
        w: image width in pixels.
        h: image height in pixels.
        center: ``(x, y)`` of the circle centre; defaults to the image middle.
        radius: circle radius; defaults to the distance from the centre to the
            nearest image edge.

    Returns:
        Integer array of shape ``(h, w)`` holding 255 for pixels strictly
        outside the circle and 0 for pixels inside or on its boundary.
    """
    if center is None:
        # Default to the middle of the image.
        center = [int(w/2), int(h/2)]
    cx, cy = center[0], center[1]

    if radius is None:
        # Largest circle around the centre that still fits inside the image.
        radius = min(cx, cy, w - cx, h - cy)

    rows, cols = np.ogrid[:h, :w]
    dist_sq = (rows - cy) ** 2 + (cols - cx) ** 2
    outside = dist_sq > radius ** 2
    return 255 * outside.astype(int)

In [106]:
import colorsys
import random
# Fixed seed so the generated palette is the same on every run.
random.seed(111)

def get_colors(n):
    """Return ``n`` random dark, saturated colours as ``(r, g, b)`` int tuples.

    Each colour draws, in this order, a random hue, a lightness in
    [0.30, 0.35) and a saturation in [0.9, 1.0) from the global ``random``
    generator, then converts HLS to RGB.

    Args:
        n: number of colours to generate.

    Returns:
        List of ``n`` tuples whose channels lie in 0..255.
    """
    ret = []
    for _ in range(n):
        hue = random.random()
        lightness = 0.3 + random.random() * 0.05
        saturation = 0.9 + random.random() * 0.1
        rgb = colorsys.hls_to_rgb(hue, lightness, saturation)
        # int(c * 256) maps [0, 1) onto 0..255 but yields 256 for c == 1.0;
        # clamp so every channel is a valid 8-bit value.
        ret.append(tuple(min(255, int(channel * 256)) for channel in rgb))
    return ret

# One colour per cluster, looked up by position.
colors = get_colors(clust_num)

# Spot-check a few palette entries.
for sample_idx in (0, 1, 2, 30):
    print(colors[sample_idx])

(152, 0, 158)
(3, 160, 154)
(151, 153, 4)
(169, 122, 7)


## Render each cluster's word cloud once, at its largest size

### Max radius for each cluster

In [84]:
# Largest radius each cluster reaches, considering only rows where
# transition == 0.
dfMaxRadius = (
    dfCPack.loc[dfCPack.transition == 0, ['cluster', 'radius']]
    .groupby('cluster')
    .max()
    .reset_index()
)
dfMaxRadius['cluster'] = dfMaxRadius['cluster'].astype(int)
dfMaxRadius.head()

Unnamed: 0,cluster,radius
0,0,239.86487
1,1,302.02704
2,2,193.24324
3,3,154.39189
4,4,154.39189


### Build the list of maximum-size word-cloud images, one per cluster

In [107]:
from wordcloud import WordCloud
import numpy as np
import math

# Target frame size in pixels (4K UHD: 3840 x 2160).
img_pix_x = 3840
img_pix_y = 2160

# Pixel size of the canvas used by the circle-packing step.
# NOTE(review): presumably the coordinate space of the packing CSV - confirm.
pack_img_pix_x = 1778
pack_img_pix_y = 1000

# Scaling is disabled here: the clouds are rendered at packing resolution.
x_scale = 1.0 #* img_pix_x / pack_img_pix_x
y_scale = 1.0 #* img_pix_y / pack_img_pix_y

# images[i] is the word cloud of the i-th row of dfMaxRadius; later cells index
# this list with the cluster id, which assumes ids are contiguous from 0.
images = []

# Create the output folder up front so img.save() cannot fail on a missing
# directory.
circles_dir = "wc_circles"
os.makedirs(circles_dir, exist_ok=True)

scale = x_scale
for index, row in dfMaxRadius.iterrows():

    cluster = int(row.cluster)
    # The (rounded-up) maximum packing radius is used as the side length of
    # the square cloud image, i.e. as the diameter of the circular mask.
    radius = math.ceil(row.radius) * scale
    print("...preparing image for cluster: ", cluster, ", diameter: ", radius)

    # make the cloud
    mask = create_circular_mask(radius, radius)
    # The colour is bound as a default argument so the callback keeps this
    # cluster's colour even if it were invoked after the loop has moved on.
    wc = WordCloud(background_color="white", random_state=33,
                   mask=mask,
                   color_func=lambda *args, cluster_color=colors[cluster], **kwargs: cluster_color)

    wc.generate_from_frequencies(dict(cluster_descr[cluster]))

    # make white pix transparent: pure-white pixels get alpha 0, all other
    # pixels are kept unchanged
    img = wc.to_image().convert('RGBA')
    datas = img.getdata()
    newData = [
        (255, 255, 255, 0) if item[:3] == (255, 255, 255) else item
        for item in datas
    ]
    img.putdata(newData)

    # store; os.path.join replaces the "wc_circles\circle..." literal, which
    # contained the invalid escape '\c' and only worked on Windows
    img.save(os.path.join(circles_dir, "circle.{0}.png".format(index)), "PNG")
    images.append(img)


...preparing image for cluster:  0 , diameter:  240.0
...preparing image for cluster:  1 , diameter:  303.0
...preparing image for cluster:  2 , diameter:  194.0
...preparing image for cluster:  3 , diameter:  155.0
...preparing image for cluster:  4 , diameter:  155.0
...preparing image for cluster:  5 , diameter:  194.0
...preparing image for cluster:  6 , diameter:  334.0
...preparing image for cluster:  7 , diameter:  186.0
...preparing image for cluster:  8 , diameter:  287.0
...preparing image for cluster:  9 , diameter:  217.0
...preparing image for cluster:  10 , diameter:  217.0
...preparing image for cluster:  11 , diameter:  240.0
...preparing image for cluster:  12 , diameter:  202.0
...preparing image for cluster:  13 , diameter:  256.0
...preparing image for cluster:  14 , diameter:  256.0
...preparing image for cluster:  15 , diameter:  225.0
...preparing image for cluster:  16 , diameter:  225.0
...preparing image for cluster:  17 , diameter:  202.0
...preparing image f

In [78]:
# check
import matplotlib.pyplot as plt
# Visual sanity check: display the last rendered cluster image without axes.
fig, ax = plt.subplots()
ax.imshow(images[49])
ax.axis('off')
plt.show()

<Figure size 640x480 with 1 Axes>

## Render Frames and Transitions

In [109]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
import numpy.ma as ma

from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw 

# Scale factors from circle-packing coordinates to output-frame pixels.
x_scale = 1.0 * img_pix_x / pack_img_pix_x
y_scale = 1.0 * img_pix_y / pack_img_pix_y

# Make sure the frame output folder exists before the first save.
os.makedirs("wc5", exist_ok=True)

# The date font is the same for every frame, so load it once.
font = ImageFont.truetype("arial.ttf", 72)

# group by days
grouped = dfChart.groupby(['date', 'transition'])
for date, group in grouped:
    # The group key is the (date, transition) pair.
    date_str = date[0]
    transition = date[1]

    # skip odd ones, too jerky otherwise
    if transition%2 != 0:
        continue

    print (date_str, ", transition ", transition)

    # generate combined image of one day
    # The white background is created once per frame, before the rows are
    # visited. Previously it was created on the first visible row followed by
    # `continue`, which silently dropped that circle from every frame and left
    # `background` undefined (or stale) when a frame had no visible circle.
    background = Image.new('RGB', (img_pix_x,img_pix_y), (255,255,255))
    for row_index, row in group.iterrows():
        radius = x_scale * row.radius
        if radius <= 0:
            # nothing to draw for this cluster in this frame
            continue

        # resize current image to current radius
        # (Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter)
        img = images[row.cluster]
        r = math.ceil(radius)
        img = img.resize((r, r), resample=Image.LANCZOS)

        # place: top-left corner so that the image is centred on (x, y)
        pos = (math.ceil(x_scale*row.x - r/2), math.ceil(y_scale*row.y - r/2))

        # combine images; the image itself is passed as the paste mask so its
        # transparent pixels leave the background untouched
        background.paste(img, pos, img)

    # print date
    draw = ImageDraw.Draw(background)
    draw.text((math.ceil(x_scale*50), math.ceil(x_scale*50)),date_str,(55,55,55), font=font)

    # save one day combine image
    background.save("wc5/wc.{0}.{1}.jpg".format(date_str,transition), "JPEG")


2019-01-21 , transition  0
2019-01-21 , transition  2
2019-01-21 , transition  4
2019-01-21 , transition  6
2019-01-21 , transition  8
2019-01-22 , transition  0
2019-01-22 , transition  2
2019-01-22 , transition  4
2019-01-22 , transition  6
2019-01-22 , transition  8
2019-01-23 , transition  0
2019-01-23 , transition  2
2019-01-23 , transition  4
2019-01-23 , transition  6
2019-01-23 , transition  8
2019-01-24 , transition  0
2019-01-24 , transition  2
2019-01-24 , transition  4
2019-01-24 , transition  6
2019-01-24 , transition  8
2019-01-25 , transition  0
2019-01-25 , transition  2
2019-01-25 , transition  4
2019-01-25 , transition  6
2019-01-25 , transition  8
2019-01-26 , transition  0
2019-01-26 , transition  2
2019-01-26 , transition  4
2019-01-26 , transition  6
2019-01-26 , transition  8
2019-01-27 , transition  0
2019-01-27 , transition  2
2019-01-27 , transition  4
2019-01-27 , transition  6
2019-01-27 , transition  8
2019-01-28 , transition  0
2019-01-28 , transition  2
2

2019-03-22 , transition  6
2019-03-22 , transition  8
2019-03-23 , transition  0
2019-03-23 , transition  2
2019-03-23 , transition  4
2019-03-23 , transition  6
2019-03-23 , transition  8
2019-03-24 , transition  0
2019-03-24 , transition  2
2019-03-24 , transition  4
2019-03-24 , transition  6
2019-03-24 , transition  8
2019-03-25 , transition  0
2019-03-25 , transition  2
2019-03-25 , transition  4
2019-03-25 , transition  6
2019-03-25 , transition  8
2019-03-26 , transition  0
2019-03-26 , transition  2
2019-03-26 , transition  4
2019-03-26 , transition  6
2019-03-26 , transition  8
2019-03-27 , transition  0
2019-03-27 , transition  2
2019-03-27 , transition  4
2019-03-27 , transition  6
2019-03-27 , transition  8
2019-03-28 , transition  0
2019-03-28 , transition  2
2019-03-28 , transition  4
2019-03-28 , transition  6
2019-03-28 , transition  8
2019-03-29 , transition  0
2019-03-29 , transition  2
2019-03-29 , transition  4
2019-03-29 , transition  6
2019-03-29 , transition  8
2