#  GTA News 50 Clusters Chart Using Packed Circles Rendering v.3

## Load 50 clusters dataframe

In [None]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from collections import Counter

# directories
dir_path = os.getcwd()
#print('Working dir: ' + dir_path)

# Build the data folder with os.path.join instead of hand-written
# backslashes: the original literal contained the invalid escape "\d" and
# only worked with Windows separators. The trailing '' keeps the trailing
# separator, so file names can still be appended to local_path.
local_path = os.path.join(dir_path, '..', 'gta-news', 'doc2vec', 'data', '')

# NOTE(review): read_pickle executes whatever the pickle contains, so this
# file must come from a trusted source.
df = pd.read_pickle(local_path+'backup'+'-gta.50'+'.pickle')

In [None]:
df.head()

## Get cluster word counts from titles

In [None]:
import os
import sys
module_path = os.path.abspath(os.path.join('../gta-news/doc2vec'))
if module_path not in sys.path:
    sys.path.append(module_path)

import d2v_utils
# Words dropped from every title before counting.
skip_terms = ['toronto', 'canada', 'canadian', 'ontario']

# cluster_descr[i] holds up to ten (word, count) pairs for the i-th group
# produced by the groupby below, most frequent first.
# NOTE(review): later cells index this list by cluster label, which assumes
# the labels are exactly 0..N-1 -- confirm against the dataframe.
cluster_descr = []
clusters = df.groupby(['cluster'])['title']
for cluster, titles in clusters:
    # Count words incrementally instead of concatenating ever-growing lists
    # (the concatenation made the work quadratic in the number of words).
    word_counts = Counter()
    for title in titles:
        # Drop the last four characters of the title
        # (presumably a file extension such as ".txt" -- TODO confirm).
        t = title[0:-4]
        tokens = d2v_utils.prepare_text_for_lda(t)
        word_counts.update(
            word for word in tokens
            if word not in skip_terms and not word.isdigit()
        )
    # Ties keep first-seen order, the same as sorting the full list.
    cluster_descr.append(word_counts.most_common(10))

clust_num = len(cluster_descr)

In [None]:
# Preview the five most frequent words of the first six clusters.
for preview_idx in range(6):
    print(cluster_descr[preview_idx][0:5])

## Load cluster coordinates generated by circle packing

In [None]:
# Folder with the circle-packing output. os.path.join replaces the
# hand-written backslashes so the cell is not tied to Windows separators;
# the trailing '' keeps the trailing separator for the concatenation below.
local_path = os.path.join(dir_path, '..', 'dev', 'packing', 'circle_pack', '')
dfCPack = pd.read_csv(local_path+'frameSequence.3.csv')
dfCPack.head()

## Sort frames by date, transition and radius

In [None]:
# Order rows by date, then transition, then radius (all ascending), so that
# within one frame the smallest circles come first.
sort_columns = ['date', 'transition', 'radius']
dfChart = dfCPack.sort_values(by=sort_columns)
dfChart.head()

## Plot

In [None]:
def create_circular_mask(w, h, center=None, radius=None):
    """Build a circular word-cloud mask for an image of size w x h.

    The returned array has shape (h, w); cells strictly outside the circle
    are 255 and cells inside or on the circle are 0.

    center -- (x, y) of the circle; defaults to the middle of the image.
    radius -- circle radius; defaults to the largest radius that still fits
              between the center and the image borders.
    """
    if center is None:
        center = [int(w/2), int(h/2)]
    cx, cy = center[0], center[1]

    if radius is None:
        radius = min(cx, cy, w - cx, h - cy)

    # Open grids of row and column coordinates that broadcast to (h, w).
    rows, cols = np.ogrid[:h, :w]
    outside = (rows - cy) ** 2 + (cols - cx) ** 2 > radius ** 2
    return 255 * outside.astype(int)

In [None]:
import colorsys
import random
random.seed(111)  # fixed seed so the palette is the same on every run


def get_colors(n):
    """Return a list of n random dark, saturated colors as (r, g, b) tuples.

    Each color is drawn in HLS space with a random hue, lightness in
    [0.30, 0.35) and saturation in [0.9, 1.0), converted to RGB, and each
    component is scaled by 256 and truncated to an int.
    """
    palette = []
    for _ in range(n):
        # The order of the three draws determines the resulting sequence.
        hue = random.random()
        lightness = 0.3 + random.random() * 0.05
        saturation = 0.9 + random.random() * 0.1
        red, green, blue = colorsys.hls_to_rgb(hue, lightness, saturation)
        palette.append((int(red * 256), int(green * 256), int(blue * 256)))
    return palette

# One color per cluster.
colors = get_colors(clust_num)

# Print a few sample entries of the palette.
for sample_idx in (0, 1, 2, 30):
    print(colors[sample_idx])

## Render the largest-size word cloud of each cluster separately

### Max radius for each cluster

In [None]:
# Largest radius each cluster reaches, looking only at rows with transition 0.
dfMaxRadius = (
    dfCPack.loc[dfCPack.transition == 0, ['cluster', 'radius']]
    .groupby(['cluster'])
    .agg('max')
    .reset_index()
)
# Store the cluster labels as integers.
dfMaxRadius['cluster'] = dfMaxRadius['cluster'].astype(int)
dfMaxRadius.head()

### Array of max-size WordCloud images for each cluster

In [None]:
from wordcloud import WordCloud
import numpy as np
import math

# Output frame size in pixels (4K: 3840 x 2160).
img_pix_x = 3840
img_pix_y = 2160

# Size of the circle-packing canvas (presumably the coordinate space of the
# x / y / radius columns of the frame sequence -- TODO confirm).
pack_img_pix_x = 1778
pack_img_pix_y = 1000

# The master images are rendered unscaled here; the frame-rendering cell
# recomputes both factors from the sizes above and resizes the images.
x_scale = 1.0
y_scale = 1.0

# Rendered images, appended in dfMaxRadius row order.
images = []

# Folder for the individual PNGs; created up front so img.save cannot fail
# on a missing directory.
circles_dir = "wc_circles"
os.makedirs(circles_dir, exist_ok=True)

scale = x_scale
for index, row in dfMaxRadius.iterrows():

    # iterrows may hand the label back as a float, so cast it explicitly.
    cluster = int(row.cluster)
    # Side length in pixels of the square image holding the circle; forced
    # to an int so the mask has exact pixel dimensions for any scale value.
    diameter = math.ceil(math.ceil(row.radius) * scale)
    print("...preparing image for cluster: ", cluster, ", diameter: ", diameter)

    # make the cloud
    mask = create_circular_mask(diameter, diameter)
    # The color is bound as a default argument so the callback keeps this
    # cluster's color even if it is invoked after the loop has moved on.
    wc = WordCloud(background_color="white", random_state=33,
                   mask=mask,
                   color_func=lambda *args, _color=colors[cluster], **kwargs: _color)

    wc.generate_from_frequencies(dict(cluster_descr[cluster]))

    # make white pix transparent (alpha 0); every other pixel is kept as is
    img = wc.to_image().convert('RGBA')
    img.putdata([
        (255, 255, 255, 0) if pixel[:3] == (255, 255, 255) else pixel
        for pixel in img.getdata()
    ])

    # store: os.path.join replaces the original "wc_circles\circle..."
    # literal, which held an invalid "\c" escape and a Windows-only separator
    img.save(os.path.join(circles_dir, "circle.{0}.png".format(index)), "PNG")
    images.append(img)


In [None]:
# Sanity check: display the word-cloud image stored at index 49 of `images`
# (presumably the last of the 50 clusters) with the axes hidden.
import matplotlib.pyplot as plt
fig = plt.figure()
plt.imshow(images[49])
plt.axis('off')
plt.show()

## Render Frames and Transitions

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
import numpy.ma as ma

from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw 

# Scale factors from circle-packing coordinates to output pixels.
x_scale = 1.0 * img_pix_x / pack_img_pix_x
y_scale = 1.0 * img_pix_y / pack_img_pix_y

# Output folder for the frames; created up front so saving cannot fail on a
# missing directory.
frames_dir = "wc5"
os.makedirs(frames_dir, exist_ok=True)

# Loaded once instead of once per frame.
font = ImageFont.truetype("arial.ttf", 72)
date_pos = (math.ceil(x_scale*50), math.ceil(x_scale*50))

# group by days and transition steps: one output frame per group
grouped = dfChart.groupby(['date', 'transition'])
for (date_str, transition), group in grouped:

    print (date_str, ", transition ", transition)

    # Start every frame from a fresh white canvas. The original created the
    # canvas lazily on the first visible circle and then skipped that circle,
    # so one cluster was missing from every frame, and a frame without any
    # visible circle reused (or lacked) the previous canvas.
    background = Image.new('RGB', (img_pix_x,img_pix_y), (255,255,255))

    # combine images of one day
    for row_index, row in group.iterrows():
        radius = x_scale * row.radius
        # Skip circles that are not visible (also filters out NaN radii).
        if not radius > 0:
            continue

        # resize current image; the label is cast to int because it may
        # arrive as a float. Image.ANTIALIAS was removed in Pillow 10;
        # LANCZOS is the same filter under its current name.
        r = math.ceil(radius)
        img = images[int(row.cluster)].resize((r, r), resample=Image.LANCZOS)

        # place: top-left corner so the image is centered on (x, y)
        pos = (math.ceil(x_scale*row.x - r/2), math.ceil(y_scale*row.y - r/2))

        # combine images, using the image's own alpha channel as paste mask
        background.paste(img, pos, img)

    # imprint the date
    draw = ImageDraw.Draw(background)
    draw.text(date_pos, date_str, (55,55,55), font=font)

    # save combined image
    background.save(
        os.path.join(frames_dir, "wc.{0}.{1}.jpg".format(date_str, transition)),
        "JPEG")
