## Topic t-SNE map

In [1]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import os 
import json
import time 
import pandas as pd 
import numpy as np

from bokeh.io import output_notebook
import matplotlib.colors as mcolors
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label, ColumnDataSource, Text, Legend
#from bokeh.models import LegendItem
#from bokeh.io import export_svgs


In [2]:
# Load the topic-model outputs: the JSON bundle of topic scores and the CSV
# table of topic names / aggregated meta topics.
# NOTE(review): absolute, user-specific path; consider a configurable data dir.
folder = "/home/linyu.linyu/health.research/health/data/cls_output/topic_out/"

# A context manager closes the file handle deterministically; the previous
# `open(...).read()` left it open until garbage collection.
with open(os.path.join(folder, "topic40_output.json"), "r") as scores_file:
    topic_scores = json.load(scores_file)

topic_names = pd.read_csv(os.path.join(folder, "topic_models.csv"))
topic_scores.keys()

dict_keys(['word_dic', 'doc2topic', 'topic_key_words', 'topic_word_weights', 'rel_topic_preference_score', 'topic2words', 'topic_preference_score'])

In [3]:
def _topic_index(prevalence_label):
    """Return the integer that follows the first space in a label such as 'topic 23'."""
    return int(prevalence_label.strip().split(" ")[1])


# Integer topic index parsed from the 'Topic_prevalence' label, plus
# whitespace-trimmed meta-topic names.
topic_names['Topic_idx'] = topic_names['Topic_prevalence'].map(_topic_index)
topic_names['Aggregated_meta_topic'] = topic_names['Aggregated_meta_topic'].map(
    lambda meta_name: meta_name.strip()
)
topic_names.head(40)

Unnamed: 0.1,Unnamed: 0,Aggregated_meta_topic,Aggregated_topic,Topic_name,Top_3_stemmed_keywords,Topic_prevalence,Topic_idx,Prevalence_score
0,0,Natural disasters,Natural disaster,Natural disaster,"disaster, natural, review",topic 23,23,1.78154
1,1,Natural disasters,Heat waves,Heat waves,"heat, wave, temperature",topic 2,2,0.781942
2,2,Natural disasters,Earthquake,Nepal and Aquila earthquake,"earthquake, nepal, aquila",topic 10,10,1.665045
3,3,Natural disasters,Nuclear disaster,Fukushima nuclear disaster,"fukushima, evacuation, nuclear",topic 15,15,0.897653
4,4,Natural disasters,Tsunami,Japan tsunami,"tsunami, Japan, east",topic 8,8,1.396157
5,5,Natural disasters,Dust storms,Dust storms,"dust, storm, admission",topic 17,17,0.732289
6,6,Natural disasters,Tornado,Joplin tornado,"tornado, outbreak, joplin",topic 24,24,0.48381
7,7,Natural disasters,Extreme weather,Extreme weather,"extreme, weather, climate",topic 32,32,0.888863
8,8,Natural disasters,Flood,Flood,"flood, dysentery, flooding",topic 6,6,1.119432
9,9,Natural disasters,Hurricane,Hurricane Sandy,"sandy, hurricane, new",topic 26,26,0.769397


In [5]:
# Number of rows in topic_names per aggregated meta topic.
topic_names['Aggregated_meta_topic'].value_counts()

Health risks and impacts    15
Coping and responses        13
Natural disasters           12
Name: Aggregated_meta_topic, dtype: int64

In [6]:
# Lookup tables keyed by topic index: display name and aggregated meta topic.
topic_idx2name = dict()
topic_idx2metaname = dict()
topic_rows = zip(
    topic_names['Topic_idx'].values,
    topic_names['Topic_name'].values,
    topic_names['Aggregated_meta_topic'].values,
)
for row_topic_idx, row_topic_name, row_meta_name in topic_rows:
    topic_idx2name[row_topic_idx] = row_topic_name.strip()
    topic_idx2metaname[row_topic_idx] = row_meta_name.strip()

doc2topic = np.array(topic_scores['doc2topic'])

# Name of the highest-scoring topic for every row of doc2topic.
doc_topic_names = [
    topic_idx2name[dominant_idx] for dominant_idx in np.argmax(doc2topic, axis=1)
]

X = doc2topic
Y = doc_topic_names

# Project the rows of doc2topic to 2-D with t-SNE and report the wall-clock time.
start = time.time()
tsne_model = TSNE(n_components=2, init='pca', random_state=0, verbose=1)
result_2D = tsne_model.fit_transform(X)
print("tsne process : ", time.time()-start)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 6106 samples in 0.051s...
[t-SNE] Computed neighbors for 6106 samples in 2.917s...
[t-SNE] Computed conditional probabilities for sample 1000 / 6106
[t-SNE] Computed conditional probabilities for sample 2000 / 6106
[t-SNE] Computed conditional probabilities for sample 3000 / 6106
[t-SNE] Computed conditional probabilities for sample 4000 / 6106
[t-SNE] Computed conditional probabilities for sample 5000 / 6106
[t-SNE] Computed conditional probabilities for sample 6000 / 6106
[t-SNE] Computed conditional probabilities for sample 6106 / 6106
[t-SNE] Mean sigma: 0.040994
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.772247
[t-SNE] KL divergence after 1000 iterations: 1.087993
tsne process :  27.929288387298584


In [7]:

# Scatter every row of the t-SNE embedding, coloured by its dominant topic
# (colours cycle through the Tableau palette), then write each topic's name
# at the centroid of the points assigned to it.
output_notebook()
n_colors = 10 #8 # 40
# One label per named topic; replaces the literal 40 that was repeated below.
n_topics = len(topic_idx2name)
topic_num = np.argmax(doc2topic, axis=1)   # dominant topic index per row
topic_num_ = topic_num % n_colors          # palette slot per row

mycolors = np.array(list(mcolors.TABLEAU_COLORS.values()))
plot = figure(
    title="t-SNE Clustering of {} Topics".format(n_topics),
    plot_width=900,
    plot_height=700
    )

plot.scatter(
    x=result_2D[:,0],
    y=result_2D[:,1],
    color=mycolors[topic_num_],
    alpha=0.5,
    )

topic_xs = list()
topic_ys = list()
topic_texts = list()
topic_colors = list()
# Collect one label (position, text, colour) per topic.
for topic_i in range(n_topics):
    related_doc = topic_num == topic_i
    if not related_doc.any():
        # No row has this topic as its dominant one: the mean of an empty
        # slice would be NaN (with a RuntimeWarning), so skip the label.
        continue
    topic_data = result_2D[related_doc]
    topic_x = np.mean(topic_data[:,0])
    topic_y = np.mean(topic_data[:,1])

    topic_name = topic_idx2name[topic_i].strip()
    topic_xs.append(topic_x)
    topic_ys.append(topic_y)
    topic_texts.append(topic_name.lower())
    topic_colors.append( mycolors[topic_i%n_colors] )

source = ColumnDataSource(dict(x=topic_xs, y=topic_ys, text=topic_texts, color=topic_colors))
glyph = Text(
    x="x",
    y="y",
    text="text",
    text_align = 'right',
    text_baseline = 'top',
    text_font_size = {'value': '12px'},
    # Labels are drawn in black; the per-topic "color" column stays in the
    # source so text_color="color" can be switched on if desired.
    text_color="#000000",
    )

plot.add_glyph(source, glyph)

show(plot)

## Color the points by meta topic, then tune the t-SNE hyperparameters

In [11]:

def show_tsn(result_2D, 
             topic_colors,
             topic_labels,
            Aggregated_meta_topic_map,
            ):
    """Show the t-SNE embedding as a Bokeh scatter plot with one legend entry per meta topic.

    Parameters
    ----------
    result_2D : array-like
        2-D coordinates, indexed as ``result_2D[mask, 0]`` (x) and
        ``result_2D[mask, 1]`` (y).
    topic_colors : array-like of int
        Per-point index into the Tableau palette; also used to select the
        points that belong to each meta topic.
    topic_labels : array-like
        Per-point label, stored in the "label" column of each data source.
    Aggregated_meta_topic_map : dict
        Maps meta-topic name (used as legend text) to the palette index that
        identifies its points in ``topic_colors``.

    Returns
    -------
    None
        Calls ``output_notebook()``, prints each meta-topic name with its
        palette index, and displays the figure.
    """
    output_notebook()
    # NOTE(review): `plot_width`/`plot_height` and `legend=` are the spellings
    # this notebook was run with; newer Bokeh releases use `width`/`height`
    # and `legend_label=` instead -- confirm against the installed version.
    plot2 = figure(
        title="t-SNE Clustering of 3 Meta topics",
        plot_width=900, 
        plot_height=700
        )
    palette = np.array(list(mcolors.TABLEAU_COLORS.values()))

    # Accept plain lists as well as arrays: the boolean masks below need
    # element-wise comparison and fancy indexing.
    result_2D = np.asarray(result_2D)
    topic_colors = np.asarray(topic_colors)
    topic_labels = np.asarray(topic_labels)

    for topic_text, color_index in Aggregated_meta_topic_map.items():
        picked_data = topic_colors == color_index
        print(topic_text, color_index, )

        # Keep the actual colour strings in the data source and reference the
        # column by name: mixing `source=` with a literal colour array is
        # rejected by newer Bokeh releases, and the column previously held
        # palette indices rather than colours.
        source = ColumnDataSource(
            dict(x=result_2D[picked_data,0], 
                 y=result_2D[picked_data,1], 
                 color=palette[topic_colors[picked_data]], 
                 label=topic_labels[picked_data]
                )
            )
        plot2.scatter(
            x="x", 
            y="y", 
            source=source,
            color="color",
            legend=topic_text,
            )
    show(plot2)


In [14]:
"""
tsne_model = TSNE(
    n_components=2, 
    verbose=1, 
    random_state=0, 
    #perplexity=30, ## default value = 30  # not sensitive
    #angle=.2,   ## balance between time and error
    init='pca' ,
    #learning_rate=10, ## default 1000, 100~1000
    )
result_2D = tsne_model.fit_transform(X)
"""

n_colors = 3 #8 # 40

# Tableau palette slot for each meta topic: 0 = blue, 1 = orange, 3 = red.
Aggregated_meta_topic_map = {
    "Natural disasters": 0,
    "Health risks and impacts": 1,
    "Coping and responses": 3,
}

# Dominant topic per row of doc2topic, mapped to its meta topic.
topic_num = np.argmax(doc2topic, axis=1)
doc_meta_topics = [topic_idx2metaname[dominant_idx] for dominant_idx in topic_num]

# Per-row palette index and lower-cased meta-topic label, as arrays.
topic_colors = np.array(
    [Aggregated_meta_topic_map[meta_name] for meta_name in doc_meta_topics]
)
topic_labels = np.array(
    [meta_name.lower().strip() for meta_name in doc_meta_topics]
)
print("finish meta topic coloring")
  

finish meta topic coloring


In [12]:
# Inspect the topic index -> aggregated meta topic lookup.
topic_idx2metaname

{0: 'Health risks and impacts',
 1: 'Coping and responses',
 2: 'Natural disasters',
 3: 'Coping and responses',
 4: 'Coping and responses',
 5: 'Natural disasters',
 6: 'Natural disasters',
 7: 'Coping and responses',
 8: 'Natural disasters',
 9: 'Coping and responses',
 10: 'Natural disasters',
 11: 'Coping and responses',
 12: 'Health risks and impacts',
 13: 'Health risks and impacts',
 14: 'Health risks and impacts',
 15: 'Natural disasters',
 16: 'Health risks and impacts',
 17: 'Natural disasters',
 18: 'Coping and responses',
 19: 'Coping and responses',
 20: 'Natural disasters',
 21: 'Health risks and impacts',
 22: 'Health risks and impacts',
 23: 'Natural disasters',
 24: 'Natural disasters',
 25: 'Health risks and impacts',
 26: 'Natural disasters',
 27: 'Health risks and impacts',
 28: 'Health risks and impacts',
 29: 'Coping and responses',
 30: 'Health risks and impacts',
 31: 'Coping and responses',
 32: 'Natural disasters',
 33: 'Coping and responses',
 34: 'Health ris

In [13]:
# Inspect the topic index -> topic name lookup.
topic_idx2name

{0: 'PTSD',
 1: 'Mental health services',
 2: 'Heat waves',
 3: 'Hospital and patients',
 4: 'Parent-child',
 5: 'Hurricane Maria',
 6: 'Flood',
 7: 'Posttraumatic growth',
 8: 'Japan tsunami',
 9: 'Resilience',
 10: 'Nepal and Aquila earthquake',
 11: 'Social support',
 12: 'Suicide',
 13: 'Sleep disturbance',
 14: 'Injury and fracture',
 15: 'Fukushima nuclear disaster',
 16: 'Birth and pregnancy',
 17: 'Dust storms',
 18: 'Posttraumatic growth',
 19: 'Visits to healthcare facilities',
 20: 'Hurricane Katrina',
 21: 'Psychological distress',
 22: 'Anxiety',
 23: 'Natural disaster',
 24: 'Joplin tornado',
 25: 'PTSD',
 26: 'Hurricane Sandy',
 27: 'Chronic dieseases',
 28: 'Cholera and leptospirosis',
 29: 'Adolescent',
 30: 'Depression',
 31: 'Group',
 32: 'Extreme weather',
 33: 'Religious coping',
 34: 'Quality of life',
 35: 'Survivor',
 36: 'Infectious diseases',
 37: 'Worker',
 38: 'Mortality',
 39: 'Pain'}

In [15]:
                
# Draw the t-SNE scatter plot coloured by meta topic.
show_tsn(result_2D, topic_colors, topic_labels,  Aggregated_meta_topic_map)

Natural disasters 0
Coping and responses 3
Health risks and impacts 1


In [39]:
# Inspect the Tableau palette (name, hex) pairs that the plots index into.
mcolors.TABLEAU_COLORS.items()

odict_items([('tab:blue', '#1f77b4'), ('tab:orange', '#ff7f0e'), ('tab:green', '#2ca02c'), ('tab:red', '#d62728'), ('tab:purple', '#9467bd'), ('tab:brown', '#8c564b'), ('tab:pink', '#e377c2'), ('tab:gray', '#7f7f7f'), ('tab:olive', '#bcbd22'), ('tab:cyan', '#17becf')])

## Appendix: color names

In [56]:
# Reference table: named colour -> upper-case hex code.
# 'sandybrown' corrected from '#FAA460' to the standard '#F4A460'.
cnames = {
'aliceblue':            '#F0F8FF',
'antiquewhite':         '#FAEBD7',
'aqua':                 '#00FFFF',
'aquamarine':           '#7FFFD4',
'azure':                '#F0FFFF',
'beige':                '#F5F5DC',
'bisque':               '#FFE4C4',
'black':                '#000000',
'blanchedalmond':       '#FFEBCD',
'blue':                 '#0000FF',
'blueviolet':           '#8A2BE2',
'brown':                '#A52A2A',
'burlywood':            '#DEB887',
'cadetblue':            '#5F9EA0',
'chartreuse':           '#7FFF00',
'chocolate':            '#D2691E',
'coral':                '#FF7F50',
'cornflowerblue':       '#6495ED',
'cornsilk':             '#FFF8DC',
'crimson':              '#DC143C',
'cyan':                 '#00FFFF',
'darkblue':             '#00008B',
'darkcyan':             '#008B8B',
'darkgoldenrod':        '#B8860B',
'darkgray':             '#A9A9A9',
'darkgreen':            '#006400',
'darkkhaki':            '#BDB76B',
'darkmagenta':          '#8B008B',
'darkolivegreen':       '#556B2F',
'darkorange':           '#FF8C00',
'darkorchid':           '#9932CC',
'darkred':              '#8B0000',
'darksalmon':           '#E9967A',
'darkseagreen':         '#8FBC8F',
'darkslateblue':        '#483D8B',
'darkslategray':        '#2F4F4F',
'darkturquoise':        '#00CED1',
'darkviolet':           '#9400D3',
'deeppink':             '#FF1493',
'deepskyblue':          '#00BFFF',
'dimgray':              '#696969',
'dodgerblue':           '#1E90FF',
'firebrick':            '#B22222',
'floralwhite':          '#FFFAF0',
'forestgreen':          '#228B22',
'fuchsia':              '#FF00FF',
'gainsboro':            '#DCDCDC',
'ghostwhite':           '#F8F8FF',
'gold':                 '#FFD700',
'goldenrod':            '#DAA520',
'gray':                 '#808080',
'green':                '#008000',
'greenyellow':          '#ADFF2F',
'honeydew':             '#F0FFF0',
'hotpink':              '#FF69B4',
'indianred':            '#CD5C5C',
'indigo':               '#4B0082',
'ivory':                '#FFFFF0',
'khaki':                '#F0E68C',
'lavender':             '#E6E6FA',
'lavenderblush':        '#FFF0F5',
'lawngreen':            '#7CFC00',
'lemonchiffon':         '#FFFACD',
'lightblue':            '#ADD8E6',
'lightcoral':           '#F08080',
'lightcyan':            '#E0FFFF',
'lightgoldenrodyellow': '#FAFAD2',
'lightgreen':           '#90EE90',
'lightgray':            '#D3D3D3',
'lightpink':            '#FFB6C1',
'lightsalmon':          '#FFA07A',
'lightseagreen':        '#20B2AA',
'lightskyblue':         '#87CEFA',
'lightslategray':       '#778899',
'lightsteelblue':       '#B0C4DE',
'lightyellow':          '#FFFFE0',
'lime':                 '#00FF00',
'limegreen':            '#32CD32',
'linen':                '#FAF0E6',
'magenta':              '#FF00FF',
'maroon':               '#800000',
'mediumaquamarine':     '#66CDAA',
'mediumblue':           '#0000CD',
'mediumorchid':         '#BA55D3',
'mediumpurple':         '#9370DB',
'mediumseagreen':       '#3CB371',
'mediumslateblue':      '#7B68EE',
'mediumspringgreen':    '#00FA9A',
'mediumturquoise':      '#48D1CC',
'mediumvioletred':      '#C71585',
'midnightblue':         '#191970',
'mintcream':            '#F5FFFA',
'mistyrose':            '#FFE4E1',
'moccasin':             '#FFE4B5',
'navajowhite':          '#FFDEAD',
'navy':                 '#000080',
'oldlace':              '#FDF5E6',
'olive':                '#808000',
'olivedrab':            '#6B8E23',
'orange':               '#FFA500',
'orangered':            '#FF4500',
'orchid':               '#DA70D6',
'palegoldenrod':        '#EEE8AA',
'palegreen':            '#98FB98',
'paleturquoise':        '#AFEEEE',
'palevioletred':        '#DB7093',
'papayawhip':           '#FFEFD5',
'peachpuff':            '#FFDAB9',
'peru':                 '#CD853F',
'pink':                 '#FFC0CB',
'plum':                 '#DDA0DD',
'powderblue':           '#B0E0E6',
'purple':               '#800080',
'red':                  '#FF0000',
'rosybrown':            '#BC8F8F',
'royalblue':            '#4169E1',
'saddlebrown':          '#8B4513',
'salmon':               '#FA8072',
'sandybrown':           '#F4A460',
'seagreen':             '#2E8B57',
'seashell':             '#FFF5EE',
'sienna':               '#A0522D',
'silver':               '#C0C0C0',
'skyblue':              '#87CEEB',
'slateblue':            '#6A5ACD',
'slategray':            '#708090',
'snow':                 '#FFFAFA',
'springgreen':          '#00FF7F',
'steelblue':            '#4682B4',
'tan':                  '#D2B48C',
'teal':                 '#008080',
'thistle':              '#D8BFD8',
'tomato':               '#FF6347',
'turquoise':            '#40E0D0',
'violet':               '#EE82EE',
'wheat':                '#F5DEB3',
'white':                '#FFFFFF',
'whitesmoke':           '#F5F5F5',
'yellow':               '#FFFF00',
'yellowgreen':          '#9ACD32'}

In [40]:
# Row-wise maximum of doc2topic, i.e. the value that np.argmax(axis=1) selects for each row.
maxv = np.max(doc2topic, axis=1) 

In [44]:
# Average of the row-wise maxima.
np.mean(maxv)

0.11695003898608769

In [42]:
# Display the row-wise maxima.
maxv


array([ 0.18917275,  0.19892231,  0.0880075 , ...,  0.20373732,
        0.13164817,  0.06728853])