## Visualizing 50-word segments by students April 26 draft

Visualizations of the 50-word segment deviation vector clusters  
Using the hierarchical agglomerative clustering method  

S435 Final Project: Student Science identity  
Data: Interviews with 15 7th grade students from EcoXPT evaluation  

Analysis notebook here:  
/Volumes/GoogleDrive/My Drive/Spring 2019/S435/GitHub/final-project-EileenMcGivney/April26-Analysis-McGivney.ipynb   

In [17]:
## Setup
import os
import re
import glob
import math
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

import nltk
#nltk.download('punkt')
from collections import Counter

from collections import defaultdict
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import NMF


from bokeh.palettes import Category20, Category10

%cd '/Volumes/GoogleDrive/My Drive/Spring 2019/S435/GitHub/final-project-EileenMcGivney'

/Volumes/GoogleDrive/My Drive/Spring 2019/S435/GitHub/final-project-EileenMcGivney


In [18]:
## Load data
# Deviation vectors for the 50-word segments (first CSV column is used as the index):
dff = pd.read_csv('./dataframes/segments_fifty_deviance.csv', index_col=0)

# Pickled 50-word segments:
with open('./cleandocs/segs_fifty.txt', 'rb') as fp:
    segs_fifty = pickle.load(fp)

### Cleaned transcripts, stored as doc0.txt ... doc14.txt (one pickle each):
docs = []
for doc_num in range(15):
    doc_path = './cleandocs/doc' + str(doc_num) + '.txt'
    with open(doc_path, 'rb') as fp:
        docs.append(pickle.load(fp))

### Clustering the data

In [19]:
## Hierarchical agglomerative clustering (Ward linkage) of the 50-word segments
segment_vectors = dff.values
ward_f = AgglomerativeClustering(n_clusters=7, linkage='ward')
ward_f.fit(segment_vectors)
label_f = ward_f.labels_

print("Number of points: %i" % label_f.size)

# Fit a nearest-centroid model on the Ward labels to get one centroid per cluster
clf_f = NearestCentroid()
clf_f.fit(segment_vectors, label_f)

print(clf_f.centroids_.shape)

Number of points: 194
(7, 992)


In [20]:
# Collect (and optionally print) the top words for each cluster centroid
def visualize_clusters(df, n_clusters, centroids, n_words=10, printed=True):
    """Return the highest-weighted vocabulary words for each cluster centroid.

    Parameters
    ----------
    df : pandas.DataFrame
        Only ``df.columns`` is read; it is used as the vocabulary, so column
        position ``i`` must correspond to position ``i`` of every centroid.
    n_clusters : int
        Number of centroids to process (``centroids[0]`` .. ``centroids[n_clusters-1]``).
    centroids : sequence of 1-D arrays
        Each entry must support ``.argsort()`` (e.g. a row of a numpy array).
    n_words : int, default 10
        How many top words to keep per cluster. Values <= 0 yield empty lists.
    printed : bool, default True
        When True, print one ``CLUSTER k: w1, w2, ...`` line per cluster
        (``k`` is 1-based). When False, nothing is printed.

    Returns
    -------
    dict
        Maps the 0-based cluster index to its list of words, ordered by
        ascending centroid weight (the strongest word is last).
    """
    vocabulary = df.columns
    words = {}
    for n in range(n_clusters):
        # A slice of [-0:] would select the whole vocabulary, so handle
        # non-positive n_words explicitly.
        if n_words > 0:
            top_indices = centroids[n].argsort()[-n_words:]
        else:
            top_indices = []
        words[n] = [vocabulary[i] for i in top_indices]

        # All printing, including the line break, is gated on `printed`.
        if printed:
            print('CLUSTER ' + str(n + 1) + ': ', end='')
            for word in words[n]:
                print(word, end=', ')
            print('')
    return words

In [21]:
# Top ten words for each cluster of 50-word segments
# (the number of clusters is read from the rows of the fitted centroids)
top_words_f = visualize_clusters(
    df=dff,
    n_clusters=clf_f.centroids_.shape[0],
    centroids=clf_f.centroids_,
)

CLUSTER 1: lead, people, bees, went, figure, time, always, problem, go, different, 
CLUSTER 2: scientist, going, make, could, thought, outside, go, try, fun, they, 
CLUSTER 3: happens, instead, use, able, look, interesting, living, things, different, think, 
CLUSTER 4: figure, job, part, law, someone, something, my, didn, science, know, 
CLUSTER 5: general, finding, around, things, dying, cause, look, died, guess, fish, 
CLUSTER 6: around, hard, saw, us, idea, teacher, year, would, put, see, 
CLUSTER 7: life, ecoxpt, real, anything, fun, water, think, inside, would, things, 


### Visualizing the clusters by each transcript

In [22]:
### Colors from Bokeh: one Category20 color per Ward cluster
from bokeh.palettes import Category20
n_cluster_colors = ward_f.n_clusters
colors_f = Category20[n_cluster_colors]
print(colors_f)

['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', '#d62728']


In [23]:
# Show each cluster's top ten words in that cluster's color,
# rendered through IPython's HTML display object
from IPython.core.display import HTML

cluster_paragraphs = [
    "<p>Cluster "+str(cluster)+": <font color="+colors_f[cluster]+">"
    + ' '.join(top_words_f[cluster]) + "</font></p>"
    for cluster in range(ward_f.n_clusters)
]
# The leading single space matches the original seed string
html_text = " " + "".join(cluster_paragraphs)
HTML(html_text)

In [24]:
# One x-axis index per segment. Derived from the cluster labels so the count
# always matches the data instead of being hard-coded (194 in this run).
indices_f = list(range(len(label_f)))

In [25]:
# Flatten the per-document segment lists into a single list, keeping document order
list_of_segs_f = [seg for doc in segs_fifty for seg in doc]
len(list_of_segs_f)

194

In [26]:
# Retrieve the cluster label assigned to each segment
# (labels_f is another name for label_f, the labels from the Ward fit)
labels_f = label_f
# Display the distinct cluster ids that were assigned
set(labels_f)

{0, 1, 2, 3, 4, 5, 6}

In [27]:
# Assign each segment the index of the document it came from:
# one entry per segment, in the same nested order as segs_fifty
# (documents in order, then segments within each document).
doc_id_f = [doc_index for doc_index, doc in enumerate(segs_fifty) for _seg in doc]
print(len(doc_id_f))


194


In [28]:
# add a name/scientist_f for each student
# Lookup table: document id -> (student name, scientist answer 'yes'/'no'/'maybe')
student_info_f = {
    0: ('amina', 'no'),
    1: ('ben', 'no'),
    2: ('daniel', 'yes'),
    3: ('emma', 'maybe'),
    4: ('james', 'maybe'),
    5: ('jenny', 'no'),
    6: ('kyle', 'yes'),
    7: ('leila', 'no'),
    8: ('maya', 'maybe'),
    9: ('michelle', 'no'),
    10: ('mike', 'no'),
    11: ('nicole', 'yes'),
    12: ('samuel', 'maybe'),
    13: ('sasha', 'yes'),
    14: ('simon', 'no'),
}

# Numeric code for each scientist answer, used later to index into a color palette
scientist_code_f = {'yes': 1, 'no': 2, 'maybe': 3}

# One entry per segment. A document id missing from the table raises KeyError
# here instead of silently leaving these lists shorter than doc_id_f.
name_f = [student_info_f[doc_id][0] for doc_id in doc_id_f]
scientist_f = [student_info_f[doc_id][1] for doc_id in doc_id_f]

# make numeric list of scientist so can make colors for it
scicolor_f = [scientist_code_f[answer] for answer in scientist_f]

*(I realize now this would have been better done with a dictionary, but I didn't think about that at the time!)*

In [29]:
# Color each segment by its student's scientist answer.
# scicolor_f holds 1 (yes), 2 (no) or 3 (maybe), so entry 0 of the palette is never used.
colors_f = Category10[4]
print(set(colors_f))
palette_f_sci = [colors_f[code] for code in scicolor_f]
print(set(palette_f_sci))
print(scientist_f[0:11])
print(palette_f_sci[0:11])
print(len(palette_f_sci))

### TODO: change this so the colors make more sense, e.g. green for yes, yellow for maybe, red for no

{'#1f77b4', '#2ca02c', '#d62728', '#ff7f0e'}
{'#2ca02c', '#d62728', '#ff7f0e'}
['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no']
['#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c', '#2ca02c']
194


In [30]:
# Give every student (document id 0-14) their own color, then
# look that color up for each segment
colors_f = Category20[15]
print(set(colors_f))
palette_f_stu = [colors_f[student_id] for student_id in doc_id_f]
print(set(palette_f_stu))
print(doc_id_f[0:11])
print(palette_f_stu[0:11])
print(len(palette_f_stu))

{'#aec7e8', '#c49c94', '#98df8a', '#7f7f7f', '#1f77b4', '#f7b6d2', '#e377c2', '#c5b0d5', '#9467bd', '#ffbb78', '#2ca02c', '#d62728', '#8c564b', '#ff7f0e', '#ff9896'}
{'#aec7e8', '#c49c94', '#98df8a', '#7f7f7f', '#1f77b4', '#f7b6d2', '#e377c2', '#c5b0d5', '#9467bd', '#ffbb78', '#2ca02c', '#d62728', '#8c564b', '#ff7f0e', '#ff9896'}
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4']
194


In [31]:
# Look up the color of each segment's cluster label
# (one palette entry per Ward cluster)
colors_f = Category20[ward_f.n_clusters]
palette_f_clu = [colors_f[cluster_label] for cluster_label in labels_f]
print(set(palette_f_clu))
print(labels_f[0:11])
print(palette_f_clu[0:11])
print(len(palette_f_clu))

{'#aec7e8', '#98df8a', '#1f77b4', '#ffbb78', '#2ca02c', '#d62728', '#ff7f0e'}
[0 0 0 3 3 3 3 3 3 3 3]
['#1f77b4', '#1f77b4', '#1f77b4', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78']
194


In [32]:
# Sanity check: every list that goes into the dataframe should report the same length
columns_to_check = (
    indices_f,
    list_of_segs_f,
    labels_f,
    doc_id_f,
    palette_f_sci,
    palette_f_clu,
    palette_f_stu,
    name_f,
    scientist_f,
)
for column in columns_to_check:
    print(len(column))

194
194
194
194
194
194
194
194
194


In [33]:
# Gather the per-segment lists built above into one dictionary,
# keyed by the column name each list will get in the dataframe
master_f = {
    'indices': indices_f,
    'seg': list_of_segs_f,
    'cluster': labels_f,
    'document': doc_id_f,
    'palette_sci': palette_f_sci,
    'palette_stu': palette_f_stu,
    'palette_clu': palette_f_clu,
    'name': name_f,
    'scientist': scientist_f,
}

In [34]:
# Build the master dataframe: one row per segment, one column per dictionary key
master_dff = pd.DataFrame(data=master_f)

## Making plots: color code by cluster, student, scientist

In [40]:
# Baseline plot: one circle per segment, segment index on x and cluster label on y
# (no color coding yet)
from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook, curdoc, output_file
from bokeh.models import HoverTool, Select, Slider
from bokeh.layouts import row, column

# Create a 400x400 figure:
p = figure(plot_width=400, plot_height=400)

# Create a ColumnDataSource from the master dataframe: source
source = ColumnDataSource(master_dff)

# Add circle glyphs to the figure p, reading x/y from the 'indices' and 'cluster' columns
p.circle('indices', 'cluster', size=8, source=source)

# Set the output HTML file name, then show the plot
output_file('clusters_f.html')
show(p)

In [41]:
# Same scatter as the baseline plot, with each point colored by its cluster
# Create a figure with axis labels and a title:
p = figure(x_axis_label='segment', y_axis_label="cluster",
           title='Student Science Identity Transcript Clusters')

# Create a ColumnDataSource from the master dataframe: source
source = ColumnDataSource(master_dff)

# Add circle glyphs to the figure p; each point's color is read from the 'palette_clu' column
p.circle('indices', 'cluster', size=8, color='palette_clu', source=source)

# Specify the name of the output file and show the result
output_file('colorclusters_f.html')
show(p)

In [42]:
# Same scatter, with each point colored by the student the segment came from
# Create a figure with axis labels and a title:
p = figure(x_axis_label='segment', y_axis_label="cluster",
           title='Student Science Identity Transcript Clusters')

# Create a ColumnDataSource from the master dataframe: source
source = ColumnDataSource(master_dff)

# Add circle glyphs to the figure p; each point's color is read from the 'palette_stu' column
p.circle('indices', 'cluster', size=8, color='palette_stu', source=source)

# Specify the name of the output file and show the result
output_file('colorstudent_f.html')
show(p)

In [43]:
# Scatter colored by the yes/no/maybe scientist answer, with a hover tool
# that displays the name of the student
# Create a figure with axis labels and a title:
p = figure(x_axis_label='segment', y_axis_label="cluster",
           title='Student Science Identity Transcript Clusters')
source = ColumnDataSource(master_dff)

# Add circle glyphs to figure p; each point's color is read from the 'palette_sci' column
p.circle('indices', 'cluster', size=8, color='palette_sci', source=source)

# Create a HoverTool: the tooltip shows the 'name' column, labelled accordingly
tooltips = [('student', '@name')]
hover = HoverTool(tooltips=tooltips)

# Add the hover tool to the figure p
p.add_tools(hover)

# Specify the name of the output file and show the result
output_file('hoverstudent_f.html')
show(p)

In [44]:
# Scatter colored by student, with a hover tool that displays the text of the segment
# Create a figure with axis labels and a title:
p = figure(x_axis_label='segment', y_axis_label="cluster",
           title='Student Science Identity Transcript Clusters')
source = ColumnDataSource(master_dff)

# Add circle glyphs to figure p; each point's color is read from the 'palette_stu' column
p.circle('indices', 'cluster', size=8, color='palette_stu', source=source)

# Create a HoverTool: the tooltip shows the 'seg' column (the segment itself)
tooltips = [('text', '@seg')]
hover = HoverTool(tooltips=tooltips)

# Add the hover tool to the figure p
p.add_tools(hover)

# Specify the name of the output file and show the result
output_file('hoversegment_f.html')
show(p)