#  GTA News 50 Clusters Chart

## Load 50 cluster dataframe

In [1]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from collections import Counter

# Directories: the data folder is resolved relative to the current working directory.
dir_path = os.getcwd()

# Build the path with os.path.join instead of hand-written '\\' separators:
# the old literal only worked on Windows and contained the invalid escape '\d'.
# The trailing '' keeps a trailing separator on local_path, as before.
local_path = os.path.join(dir_path, '..', 'gta-news', 'doc2vec', 'data', '')

# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle(os.path.join(local_path, 'backup' + '-gta.50' + '.pickle'))

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,date,title,url,cluster
0,2019-04-20,"""this is why we can't have nice things in nyc,...",http://dagblog.com/reader-blogs/why-we-cant-ha...,20
1,2019-04-20,rural book borrowing in peril as libraries sla...,http://easternontarionetwork.com/2019/04/20/ru...,12
2,2019-04-20,"""discussing canada's new us-focused cannabis e...",http://www.benzinga.com/markets/cannabis/19/04...,5
3,2019-04-20,serea restaurant slated to open in hotel del c...,http://www.coronadonewsca.com/news/coronado_ho...,31
4,2019-04-20,federal trial of vernon man accused of abducti...,http://www.courant.com/news/connecticut/hc-new...,46


## Get cluster descriptions from titles

In [18]:
import os
import sys
# Register the sibling doc2vec folder on sys.path (once) so that the
# project-local helper module can be imported below.
module_path = os.path.abspath('../gta-news/doc2vec')
if module_path not in sys.path:
    sys.path.append(module_path)

import d2v_utils
# Describe every cluster by the ten most frequent title words.
# Words listed in skip_terms and purely numeric tokens are ignored.
skip_terms = ['toronto', 'canada', 'canadian', 'ontario']
cluster_descr = []
# Group on a scalar key so the group key stays a plain value on every pandas
# version; groups are visited in sorted cluster order, so cluster_descr[k] is
# the description of the k-th smallest cluster id.
clusters = df.groupby('cluster')['title']
for cluster, titles in clusters:
    filtered_words = []
    for title in titles:
        # Drop the last four characters of the title before tokenizing.
        # NOTE(review): presumably strips a fixed trailing suffix - confirm.
        t = title[0:-4]
        tokens = d2v_utils.prepare_text_for_lda(t)
        # extend() grows the list in place; the previous `list + list` copied
        # the whole accumulated list for every title.
        filtered_words.extend(
            word for word in tokens
            if word not in skip_terms and not word.isdigit()
        )
    count = Counter(filtered_words)
    # most_common(10) returns the same (word, count) pairs as most_common()[:10]
    # without sorting the complete vocabulary first.
    current_clust_descr = count.most_common(10)
    cluster_descr.append(current_clust_descr)

In [68]:
# Show the five most frequent (word, count) pairs of the first six clusters.
for cluster_pos in range(6):
    top_pairs = cluster_descr[cluster_pos]
    print(top_pairs[0:5])

[('announce', 132), ('cannabis', 45), ('board', 29), ('update', 23), ('technology', 23)]
[('refugee', 52), ('asylum', 47), ('snowden', 46), ('shelter', 44), ('grant', 30)]
[('announce', 68), ('project', 39), ('resource', 36), ('result', 27), ('mining', 21)]
[('weather', 23), ('winter', 22), ('storm', 16), ('school', 11), ('continue', 10)]
[('court', 42), ('decision', 18), ('appeal', 17), ('supreme', 16), ('government', 15)]
[('cannabis', 156), ('company', 37), ('store', 24), ('rogen', 22), ('growth', 20)]


In [129]:
# Keep only the words of each cluster description (drop the counts).
decriptions = [[pair[0] for pair in row] for row in cluster_descr]

# cluster_descr holds one entry per cluster, in the sorted order in which the
# groupby over 'cluster' yielded them. Key the descriptions by the real cluster
# id instead of using the id as a list position: with a gap in the ids the
# positional lookup would attach the wrong description or raise IndexError.
cluster_ids = sorted(df['cluster'].dropna().unique())
if len(cluster_ids) != len(decriptions):
    raise ValueError(
        'cluster_descr has %d entries but df contains %d clusters; '
        're-run the cluster description cell' % (len(decriptions), len(cluster_ids))
    )
descr_by_cluster = dict(zip(cluster_ids, decriptions))

# `data` is an alias of `df` (not a copy), so the new column lands on df too.
data = df
description = [descr_by_cluster[cluster_id] for cluster_id in df['cluster']]

data['description'] = description

## Prepare the dataset

In [130]:
# Preview the frame again; it now carries the per-row 'description' column.
df.head()

Unnamed: 0,date,title,url,cluster,description
0,2019-04-20,"""this is why we can't have nice things in nyc,...",http://dagblog.com/reader-blogs/why-we-cant-ha...,20,"[transit, sidewalk, project, subway, plan, sma..."
1,2019-04-20,rural book borrowing in peril as libraries sla...,http://easternontarionetwork.com/2019/04/20/ru...,12,"[health, government, budget, funding, public, ..."
2,2019-04-20,"""discussing canada's new us-focused cannabis e...",http://www.benzinga.com/markets/cannabis/19/04...,5,"[cannabis, company, store, rogen, growth, mari..."
3,2019-04-20,serea restaurant slated to open in hotel del c...,http://www.coronadonewsca.com/news/coronado_ho...,31,"[restaurant, store, first, announce, location,..."
4,2019-04-20,federal trial of vernon man accused of abducti...,http://www.courant.com/news/connecticut/hc-new...,46,"[police, charge, arrest, assault, woman, suspe..."


In [221]:
# Build dfAggr with one row per (date, cluster) and the columns:
# date, cluster, description, count, topic, x, y
import random
import numpy as np

by_day_cluster = df.groupby(['date', 'cluster'])

# groupby sorts its keys, so rows come out ordered by date, then cluster.
dfAggr = by_day_cluster['description'].first().to_frame()
# Assign the counts while the (date, cluster) index is still in place, so they
# are matched by key rather than by row position of a separately built frame.
dfAggr['count'] = by_day_cluster.size()
dfAggr = dfAggr.reset_index()
# Kept for backward compatibility with cells that still read dfCount.
dfCount = dfAggr['count']

# create topic string: the first two description words, comma separated
topics = [', '.join(descr[0:2]) for descr in dfAggr['description']]
dfAggr['topic'] = topics


# temporary RANDOM placement: every cluster gets one fixed (x, y) position.
random.seed(11)  # kept for any later use of the stdlib `random` module
# The coordinates are drawn with NumPy, which random.seed() does not affect;
# use an explicitly seeded NumPy generator so the layout is reproducible.
rng = np.random.RandomState(11)
# One position per possible cluster id: at least 50 as before, more if the
# data contains larger ids (which previously raised IndexError).
n_positions = 50
if len(dfAggr):
    n_positions = max(n_positions, int(dfAggr['cluster'].max()) + 1)
sampl_x = rng.uniform(low=0.0, high=50.0, size=(n_positions,))
sampl_y = rng.uniform(low=0.0, high=50.0, size=(n_positions,))

# Look up every row's position by its cluster id in one vectorized step.
cluster_idx = dfAggr['cluster'].values.astype(int)
dfAggr['x'] = sampl_x[cluster_idx]
dfAggr['y'] = sampl_y[cluster_idx]

dfAggr.head(10)

Unnamed: 0,date,cluster,description,count,topic,x,y
0,2019-01-21,0,"[announce, cannabis, board, update, technology...",5,"announce, cannabis",40.770015,6.339483
1,2019-01-21,1,"[refugee, asylum, snowden, shelter, grant, wom...",2,"refugee, asylum",1.262558,35.072014
2,2019-01-21,2,"[announce, project, resource, result, mining, ...",4,"announce, project",9.427444,48.594354
3,2019-01-21,3,"[weather, winter, storm, school, continue, swe...",4,"weather, winter",0.647707,19.914938
4,2019-01-21,4,"[court, decision, appeal, supreme, government,...",5,"court, decision",6.005125,5.644537
5,2019-01-21,5,"[cannabis, company, store, rogen, growth, mari...",4,"cannabis, company",22.003997,25.330922
6,2019-01-21,6,"[woman, survivor, family, cancer, holocaust, f...",2,"woman, survivor",28.415228,25.735026
7,2019-01-21,8,"[research, company, university, better, drivin...",2,"research, company",1.490165,25.98081
8,2019-01-21,9,"[study, award, could, researcher, science, sci...",3,"study, award",28.691782,46.102821
9,2019-01-21,10,"[crash, ethiopian, plane, airline, family, vic...",2,"crash, ethiopian",46.349804,18.963932


## Cleaning dataset

In [222]:
# For every date keep the (at most) 20 rows with the highest count ...
dfChart = (
    dfAggr
    .sort_values(['date', 'count'], ascending=[True, False])
    .groupby('date')
    .head(20)
)
# ... and of those only the rows with more than two articles.
dfChart = dfChart[dfChart['count'] > 2]
# reset_index() without drop=True keeps the old row labels in an 'index' column.
dfChart = dfChart.reset_index()

# date not working in chart: use an integer day number instead, counted from
# 1 January 2019 (so 2019-01-01 is day 1).
new_year_day = pd.Timestamp(year=2019, month=1, day=1)
# Vectorized replacement for the row-by-row loop; pd.to_datetime is a no-op on
# a datetime column and makes the `.dt` accessor safe for object-dtype dates.
dfChart['day'] = (pd.to_datetime(dfChart['date']) - new_year_day).dt.days + 1

dfChart.head()

Unnamed: 0,index,date,cluster,description,count,topic,x,y,day
0,27,2019-01-21,30,"[trudeau, liberal, carbon, climate, government...",8,"trudeau, liberal",33.169944,49.66042,21
1,40,2019-01-21,45,"[founder, deciem, brandon, truaxe, reporter, p...",8,"founder, deciem",28.522375,3.542355,21
2,13,2019-01-21,14,"[announce, business, launch, digital, award, m...",7,"announce, business",35.54984,12.777205,21
3,24,2019-01-21,27,"[barrick, newmont, dollar, company, investor, ...",7,"barrick, newmont",40.561493,28.097267,21
4,11,2019-01-21,12,"[health, government, budget, funding, public, ...",6,"health, government",6.301551,45.360687,21


## TODO: Project clustering to 2D using PCA

In [None]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=2)
#principalComponents = pca.fit_transform(x)
#principalDf = pd.DataFrame(data = principalComponents
#             , columns = ['principal component 1', 'principal component 2'])

## Plot

In [223]:
import plotly_express as px
# Animated bubble chart: one frame per 'day', one bubble per 'topic',
# bubble area driven by 'count'. Axis ranges are fixed so frames are comparable.
scatter_options = dict(
    x="x",
    y="y",
    animation_frame="day",
    animation_group="topic",
    size="count",
    color="topic",
    hover_name="topic",
    size_max=150,
    range_x=[-5, 65],
    range_y=[-5, 65],
)
px.scatter(dfChart, **scatter_options)