# GTA News 50 Clusters Chart Using Packed Circles

## Load 50 cluster dataframe

In [None]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from collections import Counter

# directories
dir_path = os.getcwd()

# Build the data directory with os.path.join so the path works on any OS and
# contains no invalid backslash escapes. The empty last component keeps the
# trailing separator on local_path.
local_path = os.path.join(dir_path, '..', 'gta-news', 'doc2vec', 'data', '')

# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(os.path.join(local_path, 'backup-gta.50.pickle'))

In [None]:
# Show the first rows of the loaded dataframe.
df.head()

## Get cluster descriptions from titles

In [None]:
import os
import sys

# Make the project-local doc2vec helpers importable from this notebook.
module_path = os.path.abspath(os.path.join('../gta-news/doc2vec'))
if module_path not in sys.path:
    sys.path.append(module_path)

import d2v_utils

# Words that are never counted towards a cluster description.
skip_terms = ['toronto', 'canada', 'canadian', 'ontario']
skip_lookup = frozenset(skip_terms)

# cluster_descr gets one entry per cluster, in groupby (ascending cluster id)
# order: a list of up to ten (word, count) pairs, most frequent first.
# NOTE(review): later cells index this list by cluster id, which assumes the
# ids are exactly 0..N-1 with no gaps -- confirm against the clustering step.
cluster_descr = []
clusters = df.groupby('cluster')['title']
for _, titles in clusters:
    # Count words incrementally instead of concatenating one ever-growing list.
    word_counts = Counter()
    for title in titles:
        # Drop the last four characters of the title before tokenizing
        # (presumably a 4-character suffix such as a file extension -- TODO confirm).
        tokens = d2v_utils.prepare_text_for_lda(title[0:-4])
        word_counts.update(
            word for word in tokens
            if word not in skip_lookup and not word.isdigit()
        )
    cluster_descr.append(word_counts.most_common(10))

In [None]:
# Print the five most frequent (word, count) pairs of the first six clusters.
for cluster_idx in range(6):
    print(cluster_descr[cluster_idx][0:5])

In [None]:
# For every cluster keep only the words of its top terms, dropping the counts.
decriptions = [[pair[0] for pair in top_terms] for top_terms in cluster_descr]

# `data` is just another name for `df`, so the new column lands on `df` itself.
data = df

# Look up the word list of each row's cluster.
description = [decriptions[cluster_id] for cluster_id in df['cluster']]

data['description'] = description

## Prepare the dataset

In [None]:
# Show the first rows of df again to inspect its current columns.
df.head()

In [None]:
# Build dfAggr with one row per (date, cluster) and the columns:
# date, cluster, count, description, x, y
dfAggr = df[['date','cluster','description']].groupby(['date','cluster'])\
    .first().sort_values(['date','cluster']).reset_index()

# Number of rows of df per (date, cluster), sorted the same way as dfAggr so
# that the positional indexes line up when the column is assigned below.
dfCount = df.groupby(['date','cluster'])['cluster']\
    .agg('count').to_frame('count')\
    .sort_values(['date','cluster']).reset_index()['count']

dfAggr['count'] = dfCount

# create topic string: the first two description words, comma-separated
topics = [', '.join(words[0:2]) for words in dfAggr['description']]
dfAggr['topic'] = topics


# temporary RANDOM positions: one (x, y) pair per cluster id
import random
import numpy as np
random.seed(11)
# The samples below are drawn from NumPy's global generator, which
# random.seed() does not affect; seed it as well so the layout is the same on
# every run.
np.random.seed(11)
sampl_x = np.random.uniform(low=0.0, high=50.0, size=(50,))
sampl_y = np.random.uniform(low=0.0, high=50.0, size=(50,))

# Every row takes the position sampled for its cluster id.
cluster_ids = dfAggr['cluster'].to_numpy()
dfAggr['x'] = sampl_x[cluster_ids]
dfAggr['y'] = sampl_y[cluster_ids]

dfAggr.head(10)

## Cleaning dataset

In [None]:
# Keep only the columns the chart needs, ordered by date and then by cluster.
chart_columns = ['date','cluster','count','topic','x','y']
dfChart = dfAggr[chart_columns].sort_values(['date','cluster'], ascending=[True,True])
#dfChart = dfChart[dfChart['count'] > 2]

# add 'day' column: whole days since 2019-01-01, with that date counted as day 1
new_year_day = pd.Timestamp(year=2019, month=1, day=1)
dfChart['day'] = pd.Series(
    [(date - new_year_day).days + 1 for date in dfChart['date']],
    index=dfChart.index,
    dtype='int64',
)


dfChart[dfChart.day == 110].head(50)

## Add missing rows (the chart does not work well without them)

In [None]:
num_clusters = 50


def _make_missing_row(date, day, cl_num):
    """Return a one-row DataFrame with count 0 for cluster `cl_num` on `day`.

    The topic is built from the first two words of `decriptions[cl_num]` and
    the position is taken from `sampl_x` / `sampl_y`, the same sources used
    for the rows that already exist. The created row is also reported on
    stdout.
    """
    desc = ', '.join(decriptions[cl_num][0:2])
    print ("Created missing row: day:" + str(day) + ", cluster:" + str(cl_num) + ", topic:" + desc)
    return pd.DataFrame(
        {'date':[date],'cluster':[cl_num],'count':[0],'topic':[desc],
         'x':[sampl_x[cl_num]],'y':[sampl_y[cl_num]],
         'day':[day]}
    )


# missing_rows collects one single-row DataFrame for every (day, cluster)
# combination that has no row in dfChart, ordered by day and then by cluster.
missing_rows = []
len_total = len(dfChart.index)
print(len_total)

# Work day by day and compare against the full set of cluster ids. Unlike
# peeking at the row labelled i+1, this does not depend on dfChart having a
# gap-free 0..n-1 index, so it keeps working after rows have been filtered out.
for day, day_rows in dfChart.groupby('day', sort=False):
    present_clusters = set(day_rows['cluster'])
    # Use the date of the day's first row for every row filled in for that day.
    day_date = day_rows['date'].iloc[0]
    for cl_num in range(num_clusters):
        if cl_num not in present_clusters:
            missing_rows.append(_make_missing_row(day_date, day, cl_num))

In [None]:
# Combine the existing rows with the zero-count filler rows.
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job and
# is available in older versions as well.
dfChart2 = pd.concat([dfChart] + list(missing_rows), sort=True)
dfChart2 = dfChart2.sort_values(['day','cluster'], ascending=[True,True])
dfChart2 = dfChart2.reset_index(drop=True)
dfChart2[dfChart2.day == 110].head()

## TODO: Project clustering to 2D using PCA

In [None]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=2)
#principalComponents = pca.fit_transform(x)
#principalDf = pd.DataFrame(data = principalComponents
#             , columns = ['principal component 1', 'principal component 2'])

## Plot

In [None]:
import plotly_express as px

# Animated scatter of dfChart2: one frame per 'day', markers grouped, coloured
# and labelled by 'topic', and sized by the 'count' column.
scatter_options = dict(
    x="x",
    y="y",
    animation_frame="day",
    animation_group="topic",
    size="count",
    color="topic",
    hover_name="topic",
    size_max=150,
    range_x=[-5,55],
    range_y=[-5,55],
)
px.scatter(dfChart2, **scatter_options)