## Visualizing the clusters by student

Visualizations of the 100-word and 50-word segment deviation vector clusters  
Using the hierarchical agglomerative clustering method  

S435 Final Project: Student Science identity  
Data: Interviews with 15 7th grade students from EcoXPT evaluation  

Analysis notebooks are here:  
/Volumes/GoogleDrive/My Drive/Spring 2019/S435/GitHub/final-project-EileenMcGivney/Analysis-50words-McGivney.ipynb  
/Volumes/GoogleDrive/My Drive/Spring 2019/S435/GitHub/final-project-EileenMcGivney/Analysis-100words-McGivney.ipynb  



In [512]:
## Setup
import os
import re
import glob
import math
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
#nltk.download('punkt')
from collections import Counter

from collections import defaultdict
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import NMF

%cd '/Volumes/GoogleDrive/My Drive/Spring 2019/S435/GitHub/final-project-EileenMcGivney'

/Volumes/GoogleDrive/My Drive/Spring 2019/S435/GitHub/final-project-EileenMcGivney


In [513]:
## Load data

# Deviation vectors:
dfh = pd.read_csv('./dataframes/segments_hundred_deviance.csv', index_col=0)
dff = pd.read_csv('./dataframes/segments_fifty_deviance.csv', index_col=0)
dfa = pd.read_csv('./dataframes/transcripts_all.csv', index_col=0)


# cleaned segments:
# 100 words
import pickle
with open('./cleandocs/segs_hundred.txt', 'rb') as fp:
    segs_hundred = pickle.load(fp)
    

# 50 words:
with open('./cleandocs/segs_fifty.txt', 'rb') as fp:
    segs_fifty = pickle.load(fp)
    
### cleaned transcripts:

docs = []
for i in range (15):
    with open('./cleandocs/doc'+str(i)+'.txt', 'rb') as fp:
        docs.append(pickle.load(fp))

### Clustering the data

In [590]:
## Hierarchical Agglomerative Clustering: 
# 50-word segments:
ward_f = AgglomerativeClustering(n_clusters=7, linkage='ward').fit(dff.values)
label_f = ward_f.labels_

print("Number of points: %i" % label_f.size)

clf_f = NearestCentroid()
clf_f.fit(dff.values, label_f)

print(clf_f.centroids_.shape)

Number of points: 194
(7, 992)


In [515]:
## Hierarchical Agglomerative Clustering: 
# 100-word segments:
ward_h = AgglomerativeClustering(n_clusters=7, linkage='ward').fit(dfh.values)
label_h = ward_h.labels_

print("Number of points: %i" % label_h.size)

clf_h = NearestCentroid()
clf_h.fit(dfh.values, label_h)

print(clf_h.centroids_.shape)

Number of points: 128
(7, 992)


In [516]:
## Hierarchical Agglomerative Clustering: 
# Full transcripts:
ward_a = AgglomerativeClustering(n_clusters=4, linkage='ward').fit(dfa.values)
label_a = ward_a.labels_

print("Number of points: %i" % label_a.size)

clf_a = NearestCentroid()
clf_a.fit(dfa.values, label_a)

print(clf_a.centroids_.shape)

Number of points: 15
(4, 992)


In [517]:
#print the top 10 words for each cluster centroid
# make a function to print the words:
def visualize_clusters(df, n_clusters, centroids, n_words=10, printed=True):   
    # try to get the most informative words of each cluster
    words = {}
    vocabulary = df.columns
    for n in range(n_clusters):
        words[n] = []
        if printed: print('CLUSTER ' + str(n+1) + ': ', end='')
        arr = centroids[n]
        indices = arr.argsort()[-n_words:]
        for i in indices:
            if printed: print(vocabulary[i], end=', '),
            words[n].append(vocabulary[i])
        print('')
    return words

In [589]:
# Top Ten words for each cluster: 50-word segments and 100-word segments:
top_words_f = visualize_clusters(dff, clf_f.centroids_.shape[0], clf_f.centroids_)

CLUSTER 1: lab, could, fun, saw, would, make, go, put, they, see, 
CLUSTER 2: things, someone, certain, way, part, much, my, didn, science, know, 
CLUSTER 3: happens, instead, use, able, look, interesting, living, things, different, think, 
CLUSTER 4: lead, people, bees, went, figure, time, always, problem, go, different, 
CLUSTER 5: general, finding, around, things, dying, cause, look, died, guess, fish, 


In [519]:
top_words_h = visualize_clusters(dfh, clf_h.centroids_.shape[0], clf_h.centroids_)

CLUSTER 1: solving, good, tanks, fertilizer, get, work, something, take, figure, know, 
CLUSTER 2: fun, way, learned, pond, affected, parts, inside, know, science, things, 
CLUSTER 3: test, sort, information, everything, look, different, able, things, think, living, 
CLUSTER 4: us, map, topic, around, would, idea, year, teacher, see, put, 
CLUSTER 5: beyond, cause, general, right, finding, dying, look, guess, died, fish, 
CLUSTER 6: go, people, went, little, thought, always, interested, time, bees, definitely, 
CLUSTER 7: experiments, see, go, could, solve, happening, problems, outside, world, they, 


In [520]:
#all transcripts
top_words_a = visualize_clusters(dfa, clf_a.centroids_.shape[0], clf_a.centroids_)

CLUSTER 1: something, see, know, science, would, could, different, things, they, think, 
CLUSTER 2: something, look, never, different, fish, world, around, think, experiments, they, 
CLUSTER 3: things, something, people, instead, collecting, experiments, figure, could, data, know, 
CLUSTER 4: think, thought, didn, good, know, guess, my, try, science, they, 


### Visualizing the clusters by each transcript

**100-word segments**

In [568]:
### Colors from Bokeh:
from bokeh.palettes import Category20
colors_h= Category20[ward_h.n_clusters]
print(colors_h)

['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', '#d62728']


In [569]:
# print the top ten words of each cluster
# in a different color using the HTML function 
from IPython.core.display import HTML

html_text= " "
for i in range(ward_h.n_clusters):
    words = ' '.join(top_words_h[i])
    text = "<p>Cluster "+str(i)+": <font color="+colors_h[i]+">"+words+"</font></p>"
    html_text += text
HTML(html_text)

In [523]:
# indices: 128 segments
indices_h = [i for i in range(128)]
#print(indices)

In [524]:
# List of the segments
list_of_segs_h = []
for doc in segs_hundred:
    for seg in doc:
        list_of_segs_h.append(seg)
len(list_of_segs_h)
#print(list_of_segs[0])

128

In [525]:
# 7) retrieve the list of labels assigned to each chunk
labels_h = label_h
set(labels_h)

{0, 1, 2, 3, 4, 5, 6}

In [526]:
# assign the segments to their documents
doc_id_h = []
current_doc = 0

for i,doc in enumerate(segs_hundred):
    for seg in doc:
        doc_id_h.append(i)
print(len(doc_id_h))


128


In [596]:
# add a name/scientist_h for each student
name_h = []
scientist_h= []
for i in doc_id_h:
    if i == 0:
        name_h.append('amina')
        scientist_h.append('no')
    elif i == 1:
        name_h.append('ben')
        scientist_h.append('no')
    elif i == 2:
        name_h.append('daniel')
        scientist_h.append('yes')
    elif i == 3:
        name_h.append('emma')
        scientist_h.append('maybe')
    elif i == 4:
        name_h.append('james')
        scientist_h.append('maybe')
    elif i == 5:
        name_h.append('jenny')
        scientist_h.append('no')
    elif i == 6:
        name_h.append('kyle')
        scientist_h.append('yes')
    elif i == 7:
        name_h.append('leila')
        scientist_h.append('no')
    elif i == 8:
        name_h.append('maya')
        scientist_h.append('maybe')        
    elif i == 9:
        name_h.append('michelle')
        scientist_h.append('no')
    elif i == 10:
        name_h.append('mike')
        scientist_h.append('no')
    elif i == 11:
        name_h.append('nicole')
        scientist_h.append('yes')
    elif i == 12:
        name_h.append('samuel')
        scientist_h.append('maybe')
    elif i == 13:
        name_h.append('sasha')
        scientist_h.append('yes')
    elif i == 14:
        name_h.append('simon')
        scientist_h.append('no')

# make numeric list of scientist so can make colors for it
scicolor_h = []      
for i in scientist_h:
    if i == 'yes':
        scicolor_h.append(1)
    elif i == 'no':
        scicolor_h.append(3)
    elif i == 'maybe':
        scicolor_h.append(2)
        

In [597]:
# assign the corresponding color for scientist/ not scientist

colors_h= Category10[4]

print(set(colors_h))
palette_h_sci = []
for sci in scicolor_h:
    color = colors_h[sci]
    palette_h_sci.append(color)
print(set(palette_h))
print(scientist_h[0:11])
print(palette_h[0:11])
print(len(palette_h))

{'#2ca02c', '#1f77b4', '#ff7f0e', '#d62728'}
{'#2ca02c', '#d62728', '#ff7f0e'}
['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no']
['#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c']
128


In [570]:
# assign corresponding color for each cluster
colors_h= Category20[ward_h.n_clusters]
print(colors_h)

palette_h_clu = []
for label in labels_h:
    color = colors_h[label]
    palette_h_clu.append(color)
print(set(palette_h_clu))
print(labels_h[0:11])
print(palette_h_clu[0:11])
print(len(palette_h_clu))

['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', '#d62728']
{'#aec7e8', '#d62728', '#1f77b4', '#98df8a', '#ff7f0e', '#ffbb78', '#2ca02c'}
[1 1 1 1 1 1 1 1 1 1 1]
['#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8']
128


In [572]:
# assign corresponding color for each individual student


colors_h= Category20[15]

print(set(colors_f))
palette_h_stu = []
for doc_id in doc_id_h:
    color = colors_h[doc_id]
    palette_h_stu.append(color)
print(set(palette_h_stu))
print(doc_id_h[0:11])
print(palette_h_stu[0:11])
print(len(palette_h_stu))

{'#2ca02c', '#1f77b4', '#ff7f0e', '#d62728'}
{'#aec7e8', '#ff9896', '#c5b0d5', '#d62728', '#8c564b', '#f7b6d2', '#1f77b4', '#e377c2', '#9467bd', '#ff7f0e', '#ffbb78', '#2ca02c', '#c49c94', '#7f7f7f'}
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4']
128


In [529]:
# Double check all the lists are right
print(len(indices_h))
print(len(list_of_segs_h))
print(len(labels_h))
print(len(doc_id_h))
print(len(palette_h))
print(len(name_h))
print(len(scientist_h))

128
128
128
128
128
128
128


In [598]:
#create a dictionary using the lists from above
master_h = {'indices': indices_h,
          'seg': list_of_segs_h, 
          'cluster': labels_h,
          'document': doc_id_h, 
          'palette_sci': palette_h_sci, 
            'palette_stu': palette_h_stu,
            'palette_clu': palette_h_clu,
           'name' : name_h,
           'scientist': scientist_h}

In [599]:

# convert the dictionary into a pandas dataframe

master_dfh = pd.DataFrame(master_h)


In [577]:
# #create the plot using circles 
# from bokeh.plotting import ColumnDataSource, figure, show
# from bokeh.io import output_notebook, curdoc, output_file
# from bokeh.models import HoverTool, Select, Slider
# from bokeh.layouts import row, column

# #create a figure:
# p = figure(plot_width=400, plot_height=400)

# # Create a ColumnDataSource from df: source
# source = ColumnDataSource(master_dfh)

# # Add circle glyphs to the figure p
# p.circle('indices', 'cluster', size=8, source=source)

# #show the plot
# output_file('clusters_h.html')
# show(p)


In [576]:
#add colors for each cluster

#create a figure:
p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')

# Create a ColumnDataSource from df: source
source = ColumnDataSource(master_dfh)

# Add circle glyphs to the figure p
p.circle('indices', 'cluster', size=8, color = 'palette_clu', source=source)

# Specify the name of the output file and show the result
output_file('colorclusters_h.html')
show(p)

In [579]:
#add colors for each students

#create a figure:
p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')

# Create a ColumnDataSource from df: source
source = ColumnDataSource(master_dfh)

# Add circle glyphs to the figure p
p.circle('indices', 'cluster', size=8, color = 'palette_stu', source=source)

# Specify the name of the output file and show the result
output_file('colorsci_h.html')
show(p)


In [600]:
# add a hover tool that displays the actual segment of text

#create a figure:
p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')
source = ColumnDataSource(master_dfh)
# Add circle glyphs to figure p
p.circle('indices', 'cluster', size=8, color = 'palette_sci', source=source)

# Create a HoverTool: hover
tooltips = [('text', '@name')]
hover = HoverTool(tooltips=tooltips)

# Add the hover tool to the figure p
p.add_tools(hover)

# Specify the name of the output file and show the result
output_file('hoverclusters_h.html')
show(p)

**50-word segments**

In [535]:
### Colors from Bokeh:
from bokeh.palettes import Category20
colors_f= Category20[ward_f.n_clusters]
print(colors_f)

['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', '#d62728']


In [536]:
# print the top ten words of each cluster
# in a different color using the HTML function 
from IPython.core.display import HTML

html_text= " "
for i in range(ward_f.n_clusters):
    words = ' '.join(top_words_f[i])
    text = "<p>Cluster "+str(i)+": <font color="+colors_f[i]+">"+words+"</font></p>"
    html_text += text
HTML(html_text)

In [537]:
# indices: 194 segments
indices_f = [i for i in range(194)]
#print(indices)

In [538]:
# List of the segments
list_of_segs_f = []
for doc in segs_fifty:
    for seg in doc:
        list_of_segs_f.append(seg)
len(list_of_segs_f)
#print(list_of_segs[0])

194

In [539]:
# 7) retrieve the list of labels assigned to each chunk
labels_f = label_f
set(labels_f)

{0, 1, 2, 3, 4, 5, 6}

In [540]:
# assign the segments to their documents
doc_id_f = []
current_doc = 0

for i,doc in enumerate(segs_fifty):
    for seg in doc:
        doc_id_f.append(i)
print(len(doc_id_f))


194


In [541]:
# add a name/scientist_f for each student
name_f = []
scientist_f= []
for i in doc_id_f:
    if i == 0:
        name_f.append('amina')
        scientist_f.append('no')
    elif i == 1:
        name_f.append('ben')
        scientist_f.append('no')
    elif i == 2:
        name_f.append('daniel')
        scientist_f.append('yes')
    elif i == 3:
        name_f.append('emma')
        scientist_f.append('maybe')
    elif i == 4:
        name_f.append('james')
        scientist_f.append('maybe')
    elif i == 5:
        name_f.append('jenny')
        scientist_f.append('no')
    elif i == 6:
        name_f.append('kyle')
        scientist_f.append('yes')
    elif i == 7:
        name_f.append('leila')
        scientist_f.append('no')
    elif i == 8:
        name_f.append('maya')
        scientist_f.append('maybe')        
    elif i == 9:
        name_f.append('michelle')
        scientist_f.append('no')
    elif i == 10:
        name_f.append('mike')
        scientist_f.append('no')
    elif i == 11:
        name_f.append('nicole')
        scientist_f.append('yes')
    elif i == 12:
        name_f.append('samuel')
        scientist_f.append('maybe')
    elif i == 13:
        name_f.append('sasha')
        scientist_f.append('yes')
    elif i == 14:
        name_f.append('simon')
        scientist_f.append('no')
        
# make numeric list of scientist so can make colors for it
scicolor_f = []      
for i in scientist_f:
    if i == 'yes':
        scicolor_f.append(1)
    elif i == 'no':
        scicolor_f.append(2)
    elif i == 'maybe':
        scicolor_f.append(3)

In [580]:
# assign colors for scientist/not scientist

colors_f= Category10[4]

print(set(colors_f))
palette_f_sci = []
for sci in scicolor_f:
    color = colors_f[sci]
    palette_f_sci.append(color)
print(set(palette_f_sci))
print(scientist_f[0:11])
print(palette_f_sci[0:11])
print(len(palette_f_sci))



{'#2ca02c', '#1f77b4', '#ff7f0e', '#d62728'}
{'#2ca02c', '#d62728', '#ff7f0e'}
['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no']
['#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c']
194


In [581]:
# assign the corresponding cluster color to each student

colors_f= Category20[15]

print(set(colors_f))
palette_f_stu = []
for doc_id in doc_id_f:
    color = colors_f[doc_id]
    palette_f_stu.append(color)
print(set(palette_f_stu))
print(doc_id_f[0:11])
print(palette_f_stu[0:11])
print(len(palette_f_stu))

{'#aec7e8', '#ff9896', '#c5b0d5', '#d62728', '#8c564b', '#f7b6d2', '#1f77b4', '#98df8a', '#e377c2', '#9467bd', '#ff7f0e', '#ffbb78', '#2ca02c', '#c49c94', '#7f7f7f'}
{'#aec7e8', '#ff9896', '#c5b0d5', '#d62728', '#8c564b', '#f7b6d2', '#1f77b4', '#98df8a', '#e377c2', '#9467bd', '#ff7f0e', '#ffbb78', '#2ca02c', '#c49c94', '#7f7f7f'}
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4']
194


In [582]:
# assign corresponding color for each cluster
colors_f= Category20[ward_f.n_clusters]

palette_f_clu = []
for label in labels_f:
    color = colors_f[label]
    palette_f_clu.append(color)
print(set(palette_f_clu))
print(labels_f[0:11])
print(palette_f_clu[0:11])
print(len(palette_f_clu))

{'#aec7e8', '#d62728', '#1f77b4', '#98df8a', '#ff7f0e', '#ffbb78', '#2ca02c'}
[0 0 0 3 3 3 3 3 3 3 3]
['#1f77b4', '#1f77b4', '#1f77b4', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78']
194


In [543]:
# Double check all the lists are right
print(len(indices_f))
print(len(list_of_segs_f))
print(len(labels_f))
print(len(doc_id_f))
print(len(palette_f))
print(len(name_f))
print(len(scientist_f))

194
194
194
194
194
194
194


In [583]:
#create a dictionary using the lists from above
master_f = {'indices': indices_f,
          'seg': list_of_segs_f, 
          'cluster': labels_f,
          'document': doc_id_f, 
            'palette_sci': palette_f_sci, 
            'palette_stu': palette_f_stu,
            'palette_clu': palette_f_clu,
           'name' : name_f,
           'scientist' : scientist_f}

In [584]:

# convert the dictionary into a pandas dataframe

master_dff = pd.DataFrame(master_f)


In [546]:
# #create the plot using circles 
# from bokeh.plotting import ColumnDataSource, figure, show
# from bokeh.io import output_notebook, curdoc, output_file
# from bokeh.models import HoverTool, Select, Slider
# from bokeh.layouts import row, column

# #create a figure:
# p = figure(plot_width=400, plot_height=400)

# # Create a ColumnDataSource from df: source
# source = ColumnDataSource(master_dff)

# # Add circle glyphs to the figure p
# p.circle('indices', 'cluster', size=8, source=source)

# #show the plot
# output_file('clusters_f.html')
# show(p)


In [585]:
#add colors for each cluster

#create a figure:
p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')

# Create a ColumnDataSource from df: source
source = ColumnDataSource(master_dff)

# Add circle glyphs to the figure p
p.circle('indices', 'cluster', size=8, color = 'palette_clu', source=source)

# Specify the name of the output file and show the result
output_file('colorclusters_f.html')
show(p)


In [587]:
#add colors for each student

#create a figure:
p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')

# Create a ColumnDataSource from df: source
source = ColumnDataSource(master_dff)

# Add circle glyphs to the figure p
p.circle('indices', 'cluster', size=8, color = 'palette_stu', source=source)

# Specify the name of the output file and show the result
output_file('colorclusters_f.html')
show(p)

In [601]:
# add a hover tool that displays the actual segment of text

#create a figure:
p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')
source = ColumnDataSource(master_dff)
# Add circle glyphs to figure p
p.circle('indices', 'cluster', size=8, color = 'palette_sci', source=source)

# Create a HoverTool: hover
tooltips = [('text', '@name')]
hover = HoverTool(tooltips=tooltips)

# Add the hover tool to the figure p
p.add_tools(hover)

# Specify the name of the output file and show the result
output_file('hoverclusters_f.html')
show(p)

**all transcripts**

In [549]:
### Colors from Bokeh:
from bokeh.palettes import Category20
colors_a= Category20[ward_a.n_clusters]
print(colors_a)

['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78']


In [550]:
# 4) use the example below to print the top ten words of each cluster
# in a different color using the HTML function 
from IPython.core.display import HTML

html_text= " "
for i in range(ward_a.n_clusters):
    words = ' '.join(top_words_a[i])
    text = "<p>Cluster "+str(i)+": <font color="+colors_a[i]+">"+words+"</font></p>"
    html_text += text
HTML(html_text)

In [551]:
# indices: 15 documents
indices_a = [i for i in range(15)]
#print(indices)

In [552]:
# List of the segments
list_of_docs = []
for doc in docs:
    list_of_docs.append(doc)
len(list_of_docs)
#print(list_of_segs[0])

15

In [553]:
# 7) retrieve the list of labels assigned to each chunk
labels_a = label_a
set(labels_a)

{0, 1, 2, 3}

In [554]:
# assign the segments to their documents
doc_id_a = []

for i,doc in enumerate(docs):
        doc_id_a.append(i)
print(len(doc_id_a))


15


In [555]:
# add a name/scientist_a for each student
name_a = []
scientist_a= []
for i in doc_id_a:
    if i == 0:
        name_a.append('amina')
        scientist_a.append('no')
    elif i == 1:
        name_a.append('ben')
        scientist_a.append('no')
    elif i == 2:
        name_a.append('daniel')
        scientist_a.append('yes')
    elif i == 3:
        name_a.append('emma')
        scientist_a.append('maybe')
    elif i == 4:
        name_a.append('james')
        scientist_a.append('maybe')
    elif i == 5:
        name_a.append('jenny')
        scientist_a.append('no')
    elif i == 6:
        name_a.append('kyle')
        scientist_a.append('yes')
    elif i == 7:
        name_a.append('leila')
        scientist_a.append('no')
    elif i == 8:
        name_a.append('maya')
        scientist_a.append('maybe')        
    elif i == 9:
        name_a.append('michelle')
        scientist_a.append('no')
    elif i == 10:
        name_a.append('mike')
        scientist_a.append('no')
    elif i == 11:
        name_a.append('nicole')
        scientist_a.append('yes')
    elif i == 12:
        name_a.append('samuel')
        scientist_a.append('maybe')
    elif i == 13:
        name_a.append('sasha')
        scientist_a.append('yes')
    elif i == 14:
        name_a.append('simon')
        scientist_a.append('no')
        
# make numeric list of scientist so can make colors for it
scicolor_a = []      
for i in scientist_a:
    if i == 'yes':
        scicolor_a.append(1)
    elif i == 'no':
        scicolor_a.append(2)
    elif i == 'maybe':
        scicolor_a.append(3)

In [556]:
# assign colors for scientist/not scientist

colors_a= Category10[4]

print(set(colors_a))
palette_a = []
for sci in scicolor_a:
    color = colors_a[sci]
    palette_a.append(color)
print(set(palette_a))
print(scientist_a[0:11])
print(palette_a[0:11])
print(len(palette_a))

# assign the corresponding cluster color to each student

# colors_a= Category20[15]
# print(colors_a)

# print(set(colors_a))
# palette_a = []
# for doc_id in doc_id_a:
#     color = colors_a[doc_id]
#     palette_a.append(color)
# print(set(palette_a))
# print(doc_id_a[0:11])
# print(palette_a[0:11])
# print(len(palette_a))

{'#2ca02c', '#1f77b4', '#ff7f0e', '#d62728'}
{'#2ca02c', '#d62728', '#ff7f0e'}
['no', 'no', 'yes', 'maybe', 'maybe', 'no', 'yes', 'no', 'maybe', 'no', 'no']
['#2ca02c', '#2ca02c', '#ff7f0e', '#d62728', '#d62728', '#2ca02c', '#ff7f0e', '#2ca02c', '#d62728', '#2ca02c', '#2ca02c']
15


In [557]:
# Double check all the lists are right
print(len(indices_a))
print(len(list_of_docs))
print(len(labels_a))
print(len(doc_id_a))
print(len(palette_a))
print(len(name_a))
print(len(scientist_a))

15
15
15
15
15
15
15


In [558]:
#create a dictionary using the lists from above
master_a = {'indices': indices_a,
          'seg': list_of_docs, 
          'cluster': labels_a,
          'document': doc_id_a, 
          'palette': palette_a, 
           'name' : name_a, 
            'scientist': scientist_a}

In [559]:

# convert the dictionary into a pandas dataframe

master_dfa = pd.DataFrame(master_a)


In [560]:
#create the plot using circles 
from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook, curdoc, output_file
from bokeh.models import HoverTool, Select, Slider
from bokeh.layouts import row, column

#create a figure:
p = figure(plot_width=400, plot_height=400)

# Create a ColumnDataSource from df: source
source = ColumnDataSource(master_dfa)

# Add circle glyphs to the figure p
p.circle('indices', 'cluster', size=8, source=source)

#show the plot
output_file('clusters_a.html')
show(p)


In [561]:
#add colors for each cluster

#create a figure:
p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')

# Create a ColumnDataSource from df: source
source = ColumnDataSource(master_dfa)

# Add circle glyphs to the figure p
p.circle('indices', 'cluster', size=8, color = 'palette', source=source)

# Specify the name of the output file and show the result
output_file('colorclusters_a.html')
show(p)


In [562]:
# add a hover tool that displays the actual segment of text

#create a figure:
p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')
source = ColumnDataSource(master_dfa)
# Add circle glyphs to figure p
p.circle('indices', 'cluster', size=8, color = 'palette', source=source)

# Create a HoverTool: hover
tooltips = [('text', '@seg')]
hover = HoverTool(tooltips=tooltips)

# Add the hover tool to the figure p
p.add_tools(hover)

# Specify the name of the output file and show the result
output_file('hoverclusters_a.html')
show(p)

### SLIDER??

In [563]:
# # 16) create a slider that allows to slide through documents
# # https://github.com/bokeh/bokeh/blob/master/examples/howto/server_embed/notebook_embed.ipynb

# #tried this for a minute but didn't get it...

# # Perform the necessary imports
# from bokeh.io import curdoc
# from bokeh.layouts import widgetbox
# from bokeh.models import Slider

# # create a figure:
# p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')
# source = ColumnDataSource(master_dfh)
# p.circle('indices', 'cluster', size=8, color = 'palette', source=source)
# # Create a HoverTool: hover
# tooltips = [('text', '@seg')]
# hover = HoverTool(tooltips=tooltips)
# p.add_tools(hover)

# # # Create a slider: slider
# slider = Slider(title='Student', start=0, end=14, step=1, value=0)

# # Create a column layout: layout
# layout = column(widgetbox(slider), p)

# # Add the layout to the current document
# curdoc().add_root(layout)




# # # Add the layout to the current document
# # # Add the hover tool to the figure p
# # p.add_tools(hover)
# # p.add_root(layout)

# # # Specify the name of the output file and show the result
# # output_file('sliderclusters.html')
# # show(p)

In [564]:

# import yaml

# from bokeh.layouts import column
# from bokeh.models import ColumnDataSource, Slider
# from bokeh.plotting import figure
# from bokeh.themes import Theme
# from bokeh.io import show, output_notebook

# output_notebook()

In [565]:

# source = ColumnDataSource(data=master_dfh)
# p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')
# p.circle('indices', 'cluster', size=8, color = 'palette', source=source)

# def callback(attr, old, new):
#     scale = slider.value
#     new_data= source.loc[source[:,'document']==slider.value]
#     source.data = dict( new_data

# slider = Slider(title='Student', start=0, end=14, step=1, value=0)
#     # Attach the callback to the 'value' property of slider
# slider.on_change('value',callback)

# layout = column(widgetbox(slider), p)
# curdoc().add_root(layout)
    
# show(p)

In [566]:

# def modify_doc(doc):
#     source = ColumnDataSource(data=master_dfh)
#     p = figure(x_axis_label = 'segment', y_axis_label = "cluster", title = 'Student Science Itentity Transcript Clusters')
#     p.circle('indices', 'cluster', size=8, color = 'palette', source=source)

#     def callback(attr, old, new):
#         scale = slider.value
#         new_data= source.loc[source[:,'document']==slider.value]
#         source.data = new_data

#     slider = Slider(title='Student', start=0, end=14, step=1, value=0)
#     # Attach the callback to the 'value' property of slider
#     slider.on_change('value',callback)

#     doc.add_root(column(slider, plot))
# show(modify_doc,notebook_url="http://localhost:8888")
    

# #     doc.theme = Theme(json=yaml.load("""
# #         attrs:
# #             Figure:
# #                 background_fill_color: "#DDDDDD"
# #                 outline_line_color: white
# #                 toolbar_location: above
# #                 height: 500
# #                 width: 800
# #             Grid:
# #                 grid_line_dash: [6, 4]
# #                 grid_line_color: white
#     #"""))

In [567]:
# 17) create your own visualization! 