#  GTA News 50 Clusters Chart

## Load the 50-cluster dataframe

In [1]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from collections import Counter

# directories
dir_path = os.getcwd()
#print('Working dir: ' + dir_path)

# Build the data directory with os.path.join rather than hand-written
# backslashes: the original literal contained the invalid escape '\d' and only
# resolved on Windows. The trailing '' keeps the separator at the end of
# local_path, as before.
local_path = os.path.join(dir_path, '..', 'gta-news', 'doc2vec', 'data', '')

# NOTE(review): unpickling runs arbitrary code; only load a pickle you produced
# yourself or otherwise trust.
df = pd.read_pickle(os.path.join(local_path, 'backup-gta.50.pickle'))

In [2]:
# Preview the first rows of the loaded frame as a sanity check.
df.head()

Unnamed: 0,date,title,url,cluster,vector
0,2019-04-20,"""this is why we can't have nice things in nyc,...",http://dagblog.com/reader-blogs/why-we-cant-ha...,13,"[-0.47769657, 0.3672163, 0.23536347, 0.5756423..."
1,2019-04-20,rural book borrowing in peril as libraries sla...,http://easternontarionetwork.com/2019/04/20/ru...,13,"[-0.5722261, -0.26479113, -0.1152498, 0.664171..."
2,2019-04-20,"""discussing canada's new us-focused cannabis e...",http://www.benzinga.com/markets/cannabis/19/04...,11,"[0.54588157, -0.26139393, -0.18809983, -0.0384..."
3,2019-04-20,serea restaurant slated to open in hotel del c...,http://www.coronadonewsca.com/news/coronado_ho...,25,"[-0.28711024, -0.2106441, -0.61289483, 0.04533..."
4,2019-04-20,federal trial of vernon man accused of abducti...,http://www.courant.com/news/connecticut/hc-new...,46,"[0.062113207, 0.0157832, 0.23044105, 0.4720517..."


## Get cluster descriptions from titles

In [3]:
import os
import sys
# Put the doc2vec helper directory on the import path (once) so that its
# modules can be imported from this notebook.
module_path = os.path.abspath('../gta-news/doc2vec')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import d2v_utils
# Terms that are excluded from the cluster descriptions.
skip_terms = ['toronto', 'canada', 'canadian', 'ontario']

# cluster_descr[k] holds the ten most frequent (word, count) pairs for the k-th
# cluster group. groupby yields groups in ascending cluster order, so position k
# equals the cluster id only while the ids are contiguous and start at 0; a
# later cell indexes the derived list by cluster id and relies on that.
cluster_descr = []
clusters = df.groupby(['cluster'])['title']
for cluster, titles in clusters:
    # Count tokens incrementally; the original rebuilt one ever-growing list
    # with `list + list` for every title, which is quadratic in the group size.
    count = Counter()
    for title in titles:
        # The last four characters of each title are dropped before tokenising.
        # NOTE(review): presumably this removes a fixed suffix - confirm.
        t = title[0:-4]
        tokens = d2v_utils.prepare_text_for_lda(t)
        count.update(
            word for word in tokens
            if word not in skip_terms and not word.isdigit()
        )
    # most_common(10) selects the top ten without sorting the full vocabulary.
    cluster_descr.append(count.most_common(10))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ibaranov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ibaranov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Show the five most frequent (word, count) pairs for the first six clusters.
for cluster_idx in range(6):
    print(cluster_descr[cluster_idx][0:5])

[('announce', 195), ('result', 47), ('cannabis', 44), ('update', 43), ('project', 43)]
[('refugee', 54), ('asylum', 47), ('snowden', 46), ('shelter', 45), ('grant', 30)]
[('weather', 22), ('winter', 21), ('storm', 16), ('school', 11), ('sweep', 9)]
[('cannabis', 69), ('store', 26), ('legal', 17), ('medical', 14), ('illegal', 14)]
[('woman', 21), ('survivor', 18), ('family', 16), ('years', 13), ('holocaust', 13)]
[('research', 19), ('university', 18), ('innovation', 14), ('better', 13), ('company', 13)]


In [5]:
# Keep only the words from each cluster's (word, count) pairs.
# The misspelled name `decriptions` is kept on purpose: later cells read it.
decriptions = [[entry[0] for entry in row] for row in cluster_descr]

# `data` is an alias of df, not a copy, so the new column is added to df itself.
data = df

# Look up every document's word list by its cluster id. Iterating the column
# directly replaces the much slower row-by-row iterrows() loop.
description = [decriptions[cluster_id] for cluster_id in df['cluster']]

data['description'] = description

## Prepare the dataset

In [6]:
# Preview the frame again; it now carries the per-cluster `description` column.
df.head()

Unnamed: 0,date,title,url,cluster,vector,description
0,2019-04-20,"""this is why we can't have nice things in nyc,...",http://dagblog.com/reader-blogs/why-we-cant-ha...,13,"[-0.47769657, 0.3672163, 0.23536347, 0.5756423...","[health, transit, government, funding, subway,..."
1,2019-04-20,rural book borrowing in peril as libraries sla...,http://easternontarionetwork.com/2019/04/20/ru...,13,"[-0.5722261, -0.26479113, -0.1152498, 0.664171...","[health, transit, government, funding, subway,..."
2,2019-04-20,"""discussing canada's new us-focused cannabis e...",http://www.benzinga.com/markets/cannabis/19/04...,11,"[0.54588157, -0.26139393, -0.18809983, -0.0384...","[cannabis, company, rogen, launch, growth, can..."
3,2019-04-20,serea restaurant slated to open in hotel del c...,http://www.coronadonewsca.com/news/coronado_ho...,25,"[-0.28711024, -0.2106441, -0.61289483, 0.04533...","[restaurant, first, announce, store, market, d..."
4,2019-04-20,federal trial of vernon man accused of abducti...,http://www.courant.com/news/connecticut/hc-new...,46,"[0.062113207, 0.0157832, 0.23044105, 0.4720517...","[police, charge, suspect, arrest, student, mur..."


In [7]:
# need dfAggr:
# date, cluster, count, description, x, y
# One row per (date, cluster) pair, carrying that cluster's description.
dfAggr = df[['date','cluster','description']].groupby(['date','cluster'])\
    .first().sort_values(['date','cluster']).reset_index()

# Number of documents per (date, cluster). It is grouped, sorted and re-indexed
# exactly like dfAggr, so the positional assignment below lines up row by row.
dfCount = df.groupby(['date','cluster'])['cluster']\
    .agg('count').to_frame('count')\
    .sort_values(['date','cluster']).reset_index()['count']

dfAggr['count'] = dfCount

# create topic string: the first two description words, comma-separated
topics = [', '.join(descr[0:2]) for descr in dfAggr['description']]
dfAggr['topic'] = topics


# temporary RANDOM
# Placeholder layout: every cluster gets a fixed random (x, y) position.
import random
import numpy as np
random.seed(11)
# The coordinates are drawn from NumPy's generator, which random.seed() does not
# touch; seed NumPy as well so the layout is the same on every run.
np.random.seed(11)
sampl_x = np.random.uniform(low=0.0, high=50.0, size=(50,))
sampl_y = np.random.uniform(low=0.0, high=50.0, size=(50,))

# Index the sample arrays by cluster id for the whole column at once.
cluster_ids = dfAggr['cluster'].values
dfAggr['x'] = sampl_x[cluster_ids]
dfAggr['y'] = sampl_y[cluster_ids]

dfAggr.head(10)

Unnamed: 0,date,cluster,description,count,topic,x,y
0,2019-01-21,0,"[announce, result, cannabis, update, project, ...",8,"announce, result",13.079144,35.752241
1,2019-01-21,1,"[refugee, asylum, snowden, shelter, grant, wom...",2,"refugee, asylum",1.800527,35.000123
2,2019-01-21,2,"[weather, winter, storm, school, sweep, contin...",3,"weather, winter",8.189021,0.808575
3,2019-01-21,3,"[cannabis, store, legal, medical, illegal, fir...",2,"cannabis, store",6.968602,45.099509
4,2019-01-21,4,"[woman, survivor, family, years, holocaust, fu...",1,"woman, survivor",25.714853,28.595907
5,2019-01-21,5,"[research, university, innovation, better, com...",2,"research, university",48.215966,19.941933
6,2019-01-21,6,"[crash, ethiopian, plane, airline, family, vic...",2,"crash, ethiopian",33.204812,42.048249
7,2019-01-21,7,"[court, company, million, crypto, report, laws...",3,"court, company",10.520804,28.999689
8,2019-01-21,9,"[study, researcher, amazon, could, award, arti...",2,"study, researcher",8.027741,40.080224
9,2019-01-21,10,"[health, measles, vaccine, cancer, study, trea...",5,"health, measles",45.93363,44.639564


## Cleaning dataset

In [89]:
# Columns needed by the chart, ordered by date and then cluster.
dfChart = dfAggr[['date','cluster','count','topic','x','y']].sort_values(['date','cluster'], ascending=[True,True])
#dfChart = dfChart[dfChart['count'] > 2]

#add 'day' column: 1-based number of days since 2019-01-01
new_year_day = pd.Timestamp(year=2019, month=1, day=1)
# Vectorized date arithmetic replaces the per-row iterrows()/.at loop.
dfChart['day'] = (pd.to_datetime(dfChart['date']) - new_year_day).dt.days + 1


dfChart[dfChart.day == 110].head(50)

Unnamed: 0,date,cluster,count,topic,x,y,day
3812,2019-04-20,0,2,"announce, result",13.079144,35.752241,110
3813,2019-04-20,3,7,"cannabis, store",6.968602,45.099509,110
3814,2019-04-20,4,1,"woman, survivor",25.714853,28.595907,110
3815,2019-04-20,5,1,"research, university",48.215966,19.941933,110
3816,2019-04-20,8,1,"flight, boeing",49.389714,23.021889,110
3817,2019-04-20,10,1,"health, measles",45.93363,44.639564,110
3818,2019-04-20,11,2,"cannabis, company",30.078601,48.720327,110
3819,2019-04-20,13,5,"health, transit",14.006227,46.163174,110
3820,2019-04-20,14,5,"carbon, government",25.857871,8.86682,110
3821,2019-04-20,17,4,"trump, venezuela",39.968534,9.855437,110


## Add missing rows (the animated chart does not render well unless every cluster has a row for every day)

In [113]:
num_clusters = 50


def _missing_row(date, day, cl_num):
    """Return a one-row, zero-count frame for cluster `cl_num` on `day`.

    The topic and the x/y position are taken from the same per-cluster lookups
    (`decriptions`, `sampl_x`, `sampl_y`) that the existing rows use. A log line
    is printed for every row that is created.
    """
    desc = ', '.join(decriptions[cl_num][0:2])
    print ("Created missing row: day:" + str(day) + ", cluster:" + str(cl_num) + ", topic:" + desc)
    return pd.DataFrame(
        {'date':[date],'cluster':[cl_num],'count':[0],'topic':[desc],
         'x':[sampl_x[cl_num]],'y':[sampl_y[cl_num]],
         'day':[day]}
    )


len_total = len(dfChart.index)
print(len_total)

# For every day present in dfChart, add a zero-count row for each cluster id in
# [0, num_clusters) that has no row on that day. Working per day with a set
# difference does not depend on the row order or on the index labels of dfChart
# (the previous row-walking version required both to be sorted and positional).
# Days that have no rows at all are still not filled in.
missing_rows = []
for day, day_rows in dfChart.groupby('day', sort=True):
    present = set(day_rows['cluster'])
    date = day_rows['date'].iloc[0]
    for cl_num in range(num_clusters):
        if cl_num not in present:
            missing_rows.append(_missing_row(date, day, cl_num))

3842
Created missing row: day:21, cluster:8, topic:flight, boeing
Created missing row: day:21, cluster:15, topic:boycott, vehicle
Created missing row: day:21, cluster:26, topic:attack, mosque
Created missing row: day:21, cluster:43, topic:mcarthur, bruce
Created missing row: day:21, cluster:48, topic:trudeau, lavalin
Created missing row: day:22, cluster:4, topic:woman, survivor
Created missing row: day:22, cluster:6, topic:crash, ethiopian
Created missing row: day:22, cluster:22, topic:autism, program
Created missing row: day:22, cluster:40, topic:condo, building
Created missing row: day:22, cluster:43, topic:mcarthur, bruce
Created missing row: day:22, cluster:48, topic:trudeau, lavalin
Created missing row: day:23, cluster:11, topic:cannabis, company
Created missing row: day:23, cluster:19, topic:police, charge
Created missing row: day:23, cluster:48, topic:trudeau, lavalin
Created missing row: day:24, cluster:6, topic:crash, ethiopian
Created missing row: day:24, cluster:11, topic:ca

Created missing row: day:41, cluster:0, topic:announce, result
Created missing row: day:41, cluster:1, topic:refugee, asylum
Created missing row: day:41, cluster:2, topic:weather, winter
Created missing row: day:41, cluster:3, topic:cannabis, store
Created missing row: day:41, cluster:5, topic:research, university
Created missing row: day:41, cluster:6, topic:crash, ethiopian
Created missing row: day:41, cluster:8, topic:flight, boeing
Created missing row: day:41, cluster:9, topic:study, researcher
Created missing row: day:41, cluster:10, topic:health, measles
Created missing row: day:41, cluster:11, topic:cannabis, company
Created missing row: day:41, cluster:14, topic:carbon, government
Created missing row: day:41, cluster:16, topic:sidewalk, project
Created missing row: day:41, cluster:18, topic:digital, announce
Created missing row: day:41, cluster:25, topic:restaurant, first
Created missing row: day:41, cluster:37, topic:trump, white
Created missing row: day:41, cluster:39, topic:

Created missing row: day:57, cluster:6, topic:crash, ethiopian
Created missing row: day:57, cluster:26, topic:attack, mosque
Created missing row: day:58, cluster:25, topic:restaurant, first
Created missing row: day:58, cluster:43, topic:mcarthur, bruce
Created missing row: day:58, cluster:44, topic:violence, handgun
Created missing row: day:59, cluster:6, topic:crash, ethiopian
Created missing row: day:59, cluster:7, topic:court, company
Created missing row: day:59, cluster:15, topic:boycott, vehicle
Created missing row: day:59, cluster:26, topic:attack, mosque
Created missing row: day:59, cluster:40, topic:condo, building
Created missing row: day:60, cluster:15, topic:boycott, vehicle
Created missing row: day:60, cluster:17, topic:trump, venezuela
Created missing row: day:61, cluster:6, topic:crash, ethiopian
Created missing row: day:61, cluster:10, topic:health, measles
Created missing row: day:61, cluster:11, topic:cannabis, company
Created missing row: day:61, cluster:17, topic:tru

Created missing row: day:76, cluster:27, topic:peterson, jordan
Created missing row: day:76, cluster:33, topic:china, huawei
Created missing row: day:76, cluster:39, topic:leaf, return
Created missing row: day:76, cluster:43, topic:mcarthur, bruce
Created missing row: day:76, cluster:47, topic:commissioner, police
Created missing row: day:76, cluster:48, topic:trudeau, lavalin
Created missing row: day:77, cluster:3, topic:cannabis, store
Created missing row: day:77, cluster:5, topic:research, university
Created missing row: day:77, cluster:6, topic:crash, ethiopian
Created missing row: day:77, cluster:16, topic:sidewalk, project
Created missing row: day:77, cluster:33, topic:china, huawei
Created missing row: day:77, cluster:39, topic:leaf, return
Created missing row: day:77, cluster:42, topic:police, woman
Created missing row: day:77, cluster:43, topic:mcarthur, bruce
Created missing row: day:77, cluster:44, topic:violence, handgun
Created missing row: day:78, cluster:33, topic:china,

Created missing row: day:91, cluster:43, topic:mcarthur, bruce
Created missing row: day:91, cluster:44, topic:violence, handgun
Created missing row: day:91, cluster:49, topic:student, education
Created missing row: day:92, cluster:25, topic:restaurant, first
Created missing row: day:92, cluster:33, topic:china, huawei
Created missing row: day:93, cluster:3, topic:cannabis, store
Created missing row: day:93, cluster:4, topic:woman, survivor
Created missing row: day:93, cluster:22, topic:autism, program
Created missing row: day:93, cluster:24, topic:chinese, student
Created missing row: day:94, cluster:1, topic:refugee, asylum
Created missing row: day:94, cluster:2, topic:weather, winter
Created missing row: day:94, cluster:8, topic:flight, boeing
Created missing row: day:94, cluster:22, topic:autism, program
Created missing row: day:94, cluster:43, topic:mcarthur, bruce
Created missing row: day:95, cluster:2, topic:weather, winter
Created missing row: day:95, cluster:22, topic:autism, p

Created missing row: day:110, cluster:12, topic:bombardier, service
Created missing row: day:110, cluster:15, topic:boycott, vehicle
Created missing row: day:110, cluster:16, topic:sidewalk, project
Created missing row: day:110, cluster:18, topic:digital, announce
Created missing row: day:110, cluster:22, topic:autism, program
Created missing row: day:110, cluster:29, topic:award, community
Created missing row: day:110, cluster:31, topic:artist, indigenous
Created missing row: day:110, cluster:33, topic:china, huawei
Created missing row: day:110, cluster:35, topic:black, pride
Created missing row: day:110, cluster:36, topic:meghan, markle
Created missing row: day:110, cluster:40, topic:condo, building
Created missing row: day:110, cluster:42, topic:police, woman
Created missing row: day:110, cluster:47, topic:commissioner, police
Created missing row: day:110, cluster:48, topic:trudeau, lavalin
Created missing row: day:110, cluster:49, topic:student, education


In [119]:
# Combine the real rows with the generated zero-count rows. pd.concat replaces
# DataFrame.append, which was deprecated and then removed in pandas 2.0, and it
# also works when missing_rows is empty. sort=True keeps the alphabetical column
# order that the append call produced.
dfChart2 = pd.concat([dfChart] + list(missing_rows), sort=True)
dfChart2 = dfChart2.sort_values(['day','cluster'], ascending=[True,True])
dfChart2 = dfChart2.reset_index(drop=True)
dfChart2[dfChart2.day == 110].head()

Unnamed: 0,cluster,count,date,day,topic,x,y
4450,0,2,2019-04-20,110,"announce, result",13.079144,35.752241
4451,1,0,2019-04-20,110,"refugee, asylum",1.800527,35.000123
4452,2,0,2019-04-20,110,"weather, winter",8.189021,0.808575
4453,3,7,2019-04-20,110,"cannabis, store",6.968602,45.099509
4454,4,1,2019-04-20,110,"woman, survivor",25.714853,28.595907


## TODO: Project clustering to 2D using PCA

In [9]:
# Sketch only, not executed: `x` is not defined anywhere in this notebook.
# NOTE(review): presumably `x` would be built from the `vector` column - confirm.
#from sklearn.decomposition import PCA
#pca = PCA(n_components=2)
#principalComponents = pca.fit_transform(x)
#principalDf = pd.DataFrame(data = principalComponents
#             , columns = ['principal component 1', 'principal component 2'])

## Plot

In [120]:
import plotly_express as px
# Animated bubble chart: one frame per day, one bubble per cluster, sized by the
# number of documents that day.
# animation_group is the cluster id rather than the topic string: the topic is
# only the first two description words and is not unique per cluster (two
# clusters both start with "police, charge"), so grouping by topic would make
# different clusters share one identity across frames.
px.scatter(dfChart2, x="x", y="y", animation_frame="day", animation_group="cluster",
           size="count", color="topic", hover_name="topic",
           size_max=150, range_x=[-5,55], range_y=[-5,55])