## Data Cleaning

In [1]:
import csv
import os
import shutil

In [2]:
# creates a folder for all text files

path = '/Users/erincarvalho/Desktop/dev/final-project-Erin-c'
if os.path.isdir(path + '/txt_files'):
    shutil.rmtree(path + '/txt_files', ignore_errors=False, onerror=None)
os.mkdir(path + '/txt_files')

In [3]:
# creates a separate text file for each topic with all posts and replies from csv
# ScratchEd_all_data.csv

ids = []

with open('ScratchEd_all_data.csv', "r", encoding='utf-8', errors='ignore') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    count = 0

    for idx, row in enumerate(csv_reader):   
        if str(row[0]) in ids:
            filename = path + '/txt_files/topic_' + str(row[0]) + '.txt'
            file = open(filename,'a+')
            contents = str(row[3]) + '\r\n' + '\r\n'
            file.write(contents)
        else:
            filename = path + '/txt_files/topic_' + str(row[0]) + '.txt'
            file = open(filename,'a+')
            contents = str(row[3]) + '\r\n' + '\r\n'
            file.write(contents)
            ids.append(str(row[0]))
            count += 1
    print(count)
    print(len(ids))

1444
1444


In [4]:
import glob

# save all the text files in a list

threads = glob.glob('./txt_files/*.txt')
print(len(threads))

1444


In [5]:
documents = []

# load actual text into a list

for thread in threads: 
    with open (thread, "r", encoding='utf-8', errors='ignore') as t:
        documents.append(t.read())
        
# convert text to all lowercase

for i, t in enumerate(threads):
    documents[i] = documents[i].lower()

In [6]:
punctuation = ['.', '...', '!', '#', '"', '%', '$', "'", '&', ')', 
               '(', '+', '*', '-', ',', '/', '.', ';', ':', '=', 
               '<', '?', '>', '@', '",', '".', '[', ']', '\\', ',',
               '_', '^', '`', '{', '}', '|', '~', '−', '”', '“', '’']

stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 
              'ourselves', 'you', 'your', 'yours', 'yourself', 
              'yourselves', 'he', 'him', 'his', 'himself', 'she', 
              'her', 'hers', 'herself', 'it', 'its', 'itself', 
              'they', 'them', 'their', 'theirs', 'themselves', 
              'what', 'which', 'who', 'whom', 'this', 'that', 
              'these', 'those', 'am', 'is', 'are', 'was', 'were', 
              'be', 'been', 'being', 'have', 'has', 'had', 'having', 
              'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 
              'but', 'if', 'or', 'because', 'as', 'until', 'while', 
              'of', 'at', 'by', 'for', 'with', 'about', 'against', 
              'between', 'into', 'through', 'during', 'before', 
              'after', 'above', 'below', 'to', 'from', 'up', 'down', 
              'in', 'out', 'on', 'off', 'over', 'under', 'again', 
              'further', 'then', 'once', 'here', 'there', 'when', 
              'where', 'why', 'how', 'all', 'any', 'both', 'each', 
              'few', 'more', 'most', 'other', 'some', 'such', 'no', 
              'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
              'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 
              'now', 'http', 'https', 'edu', 'www', 'com', 'scratch', 'mit', 'org', 
              'would', 'should', 'could', 'might', 'really', 'very', 'good', 
              'great', 'best', 'karen', '྾explore', '྾interact', '྾network']

In [7]:
def clean_list_of_documents(documents):
    '''cleans a list of documents'''
    
    cleaned_docs = []
    
    for i,doc in enumerate(documents):
        # removes new lines and carriage returns
        doc = doc.replace('\n', ' ')
        doc = doc.replace('\r', ' ')
        # remove ponctuation
        for punc in punctuation: 
            doc = doc.replace(punc, ' ')
        # remove numbers
        for i in range(10):
            doc = doc.replace(str(i), ' ')
        # remove stop words
        for stop_word in stop_words:
            doc = doc.replace(' ' + stop_word + ' ', ' ')
        # remove single characters and stem the words 
        doc = [x for x in doc.split() if len(x) > 2]
        doc = " ".join(doc)
        # save the result to our list of documents
        cleaned_docs.append(doc)
        
    return cleaned_docs

In [8]:
# Print the first bit of the document for sanity

clean_docs = clean_list_of_documents(documents)

print(clean_docs[0][:100])

may cambridge educator meetup attendees andrea blake family steven connelly janet dee jing ding ingr


## Vocabulary

In [9]:
# !pip3 install nltk
# !nltk.download("wordnet", "./")

import math
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import Counter, defaultdict

In [10]:
def get_vocabulary(documents):
    '''builds a vocabulary'''

    lemmatized_vocabulary = []
    lemmatizer = WordNetLemmatizer()

    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    for document in clean_docs:
        tokens = word_tokenize(document)
        for word, tag in pos_tag(tokens):
            word = lemmatizer.lemmatize(word, tag_map[tag[0]])
            if wn.synsets(word):
                if word not in lemmatized_vocabulary: 
                    lemmatized_vocabulary.append(word)

    lemmatized_vocabulary = list(set(lemmatized_vocabulary))
    lemmatized_vocabulary.sort()

    return lemmatized_vocabulary

In [11]:
# Print the length of lemmatized vocabulary

vocabulary = get_vocabulary(clean_docs)
print(len(vocabulary))

7724


In [12]:
def flatten_and_overlap(documents, window_size=100, overlap=25):
    
    # create the list of overlapping documents
    new_list_of_documents = []
    
    # flatten everything into one string
    flat = ""
    for document in documents:
        flat += document
    
    # split into words
    flat = flat.split()

    # create chunks of 100 words
    high = window_size
    while high < len(flat):
        low = high - window_size
        new_list_of_documents.append(flat[low:high])
        high += overlap
    return new_list_of_documents

In [13]:
chunks = flatten_and_overlap(clean_docs)

In [14]:
import pandas as pd
df = pd.DataFrame(0, index=np.arange(len(chunks)), columns=vocabulary)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8190 entries, 0 to 8189
Columns: 7724 entries, aaron to zoom
dtypes: int64(7724)
memory usage: 482.7 MB


In [15]:
def docs_by_words_df(chunks, vocabulary):
    df = pd.DataFrame(0, index=np.arange(len(chunks)), columns=vocabulary)
    
    # fill out the matrix with counts
    for i,chunk in enumerate(chunks):
        for word in chunk:
            if word in df.columns: 
                df.loc[i,word] += 1
            
    return df

In [16]:
print(chunks[0])
print(len(chunks))

['may', 'cambridge', 'educator', 'meetup', 'attendees', 'andrea', 'blake', 'family', 'steven', 'connelly', 'janet', 'dee', 'jing', 'ding', 'ingrid', 'gustafson', 'kelly', 'fischer', 'brittany', 'haehlen', 'michael', 'mclaughlin', 'colin', 'meltzer', 'ivan', 'rudnicki', 'alina', 'spaulding', 'sandra', 'thaxter', 'gayla', 'webb', 'recap', 'part', 'day', 'kickoff', 'kick', 'may', 'meetup', 'kennedy', 'longfellow', 'school', 'cambridge', 'streamed', 'opening', 'ceremonies', 'day', 'media', 'lab', 'gave', 'chance', 'everyone', 'little', 'breakfast', 'start', 'networking', 'instantly', 'people', 'sharing', 'ideas', 'exploring', 'many', 'resources', 'room', 'many', 'new', 'faces', 'paused', 'conversations', 'introduced', 'sharing', 'brought', 'meetup', 'part', 'play', 'time', 'group', 'decided', 'continue', 'conversations', 'explorations', 'people', 'already', 'started', 'janet', 'mike', 'worked', 'strategies', 'learning', 'order', 'teach', 'students', 'steve', 'tried', 'block', 'challenge', 

In [17]:
df = docs_by_words_df(chunks, vocabulary)
df.loc[0,'school']

1

In [18]:
def one_plus_log(cell):
    if cell != 0: 
        return 1 + math.log(cell)
    else:
        return 0

In [19]:
df_log = df.applymap(one_plus_log)

In [20]:
def one_plus_log_mat(df):
    df = df.applymap(one_plus_log)
    return df.values

In [21]:
print("before one + log: ", df.loc[0,'school'])
print("after one + log: ", 1 + math.log(df.loc[0,'school']))
print("Value in the dataframe: ", df_log.loc[0,'school'])

before one + log:  1
after one + log:  1.0
Value in the dataframe:  1.0


In [22]:
from sklearn.preprocessing import Normalizer

scaler = Normalizer()
df_log[df_log.columns] = scaler.fit_transform(df_log[df_log.columns])
df_log[df_log.columns[100:600]]

Unnamed: 0,administer,administration,administrator,admire,admission,admit,ado,adobe,adolescent,adopt,...,band,bandwidth,bang,bangalore,bangkok,bank,banner,bar,barb,barber
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

def normalize_df(df, method='Normalizer'):
    
    # choose the normalization strategy
    scaler = None
    if method == 'Normalizer': scaler = Normalizer()
    if method == 'MinMaxScaler': scaler = MinMaxScaler()
    if method == 'StandardScaler': scaler = StandardScaler()
        
    # apply the normalization
    if scaler != None:
        df[df.columns] = scaler.fit_transform(df[df.columns])

    # return the resulting dataframe
    return df

In [24]:
v_sum = np.sum(df_log.values, axis=0)

In [25]:
def vector_length(u):
    return np.sqrt(np.dot(u, u))

def length_norm(u):
    return u / vector_length(u)

v_avg = length_norm(v_sum)

In [26]:
matrix = df_log.values

for row in range(df_log.shape[0]):

    # this is one vector (row
    v_i = matrix[row,:]

    # we subtract its component along v_average
    scalar = np.dot(v_i,v_avg)
    sub = v_avg * scalar

    # we replace the row by the deviation vector
    matrix[row,:] = length_norm(v_i - sub)

In [27]:
def vector_length(u):
    return np.sqrt(np.dot(u, u))

def length_norm(u):
    return u / vector_length(u)

def transform_deviation_vectors(df):
    
    # get the numpy matrix from the df
    matrix = df.values
    
    # compute the sum of the vectors
    v_sum = np.sum(matrix, axis=0)
    
    # normalize this vector (find its average)
    v_avg = length_norm(v_sum)
    
    # we iterate through each vector
    for row in range(df_log.shape[0]):
        
        # this is one vector (row
        v_i = matrix[row,:]
        
        # we subtract its component along v_average
        scalar = np.dot(v_i,v_avg)
        sub = v_avg * scalar
        
        # we replace the row by the deviation vector
        matrix[row,:] = length_norm(v_i - sub)
    
    return df

In [28]:
df = transform_deviation_vectors(df_log)

In [29]:
# from sklearn.cluster import KMeans

# ks = list(range(1, 10))
# inertias = []

# for k in ks:
    
#     # Create a KMeans instance with k clusters: model
#     kmeans = KMeans(n_clusters=k, max_iter=1000)
    
#     # Fit model to samples
#     kmeans.fit(df.values)
    
#     # Append the inertia to the list of inertias
#     inertias.append(kmeans.inertia_)

In [30]:
# import matplotlib.pyplot as plt

# %matplotlib inline

# # Plot ks vs inertias
# plt.plot(ks, inertias, '-o')
# plt.xlabel('number of clusters, k')
# plt.ylabel('inertia')
# plt.xticks(ks)
# plt.show()

In [31]:
# from sklearn.manifold import TSNE

# # Create a TSNE instance: model
# model = TSNE(learning_rate=200)

# # Apply fit_transform to samples: tsne_features
# tsne_features = model.fit_transform(df.values)

# # Select the 0th feature: xs
# xs = tsne_features[:,0]

# # Select the 1st feature: ys
# ys = tsne_features[:,1]

# # Scatter plot
# plt.scatter(xs,ys)
# plt.show()

In [32]:
import collections
from sklearn.cluster import KMeans
kmeans_obj = KMeans(n_clusters=10, max_iter=1000).fit(df.values)

n_words = 10
top_words = collections.defaultdict(lambda: [])

# iterate through each cluster
for n in range(kmeans_obj.n_clusters):

    print('CLUSTER ' + str(n+1) + ': ', end='')

    # get the cluster centers
    arr = kmeans_obj.cluster_centers_[n]

    # sorts the array and keep the last n words
    indices = arr.argsort()[-n_words:]

    # add the words to the list of words
    for i in indices:
        print(vocabulary[i], end=', ')
        top_words[n].append(vocabulary[i])
        
    print('')

CLUSTER 1: strategies, core, hands, underlying, experience, together, previous, practice, ideas, educators, 
CLUSTER 2: variable, right, use, make, one, script, sprites, blocks, block, sprite, 
CLUSTER 3: share, like, photo, status, found, twitter, news, please, post, week, 
CLUSTER 4: rosemary, attendees, led, networking, participants, recap, shared, group, session, breakout, 
CLUSTER 5: address, password, classroom, studio, email, create, student, class, teacher, account, 
CLUSTER 6: try, thank, work, kids, version, let, get, like, know, thanks, 
CLUSTER 7: started, another, dance, sprites, see, different, make, ideas, one, project, 
CLUSTER 8: creative, computing, may, online, workshop, join, scratched, event, day, conference, 
CLUSTER 9: news, week, well, major, stories, throughout, place, related, roundup, weekly, 
CLUSTER 10: teachers, learn, teaching, curriculum, teach, learning, science, computer, programming, school, 


In [33]:
from sklearn.cluster import AgglomerativeClustering

ward = AgglomerativeClustering(n_clusters=10, linkage='ward').fit(df.values)
label = ward.labels_

print("Number of points: %i" % label.size)

Number of points: 8190


In [34]:
from sklearn.neighbors.nearest_centroid import NearestCentroid
import numpy as np

clf = NearestCentroid()
clf.fit(df.values, label)

print(clf.centroids_.shape)

(10, 7724)


In [35]:
def visualize_clusters(df, n_clusters, centroids, n_words=10, printed=True):   
    # try to get the most informative words of each cluster
    words = {}
    vocabulary = df.columns
    for n in range(n_clusters):
        words[n] = []
        if printed: print('CLUSTER ' + str(n+1) + ': ', end='')
        arr = centroids[n]
        indices = arr.argsort()[-n_words:]
        for i in indices:
            if printed: print(vocabulary[i], end=', '),
            words[n].append(vocabulary[i])
        print('')
    return words

top_words = visualize_clusters(df, clf.centroids_.shape[0], clf.centroids_)

CLUSTER 1: may, guide, computational, creative, join, computing, scratched, event, day, conference, 
CLUSTER 2: get, also, let, much, school, kids, use, like, know, thanks, 
CLUSTER 3: blog, place, status, photo, stories, weekly, twitter, roundup, news, week, 
CLUSTER 4: scripts, save, right, click, script, blocks, project, sprites, block, sprite, 
CLUSTER 5: want, hands, experience, lab, workshop, together, practice, previous, ideas, educators, 
CLUSTER 6: rosemary, leave, networking, name, connect, attendees, people, recap, session, breakout, 
CLUSTER 7: programming, technology, research, design, education, school, game, learning, science, computer, 
CLUSTER 8: thank, use, hope, classroom, email, create, student, class, teacher, account, 
CLUSTER 9: docs, google, les, ver, post, est, oscar, con, las, para, 
CLUSTER 10: also, school, art, think, technology, new, use, learn, goal, ideas, 


In [36]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df[df.columns] = scaler.fit_transform(df[df.columns])

# Import NMF
from sklearn.decomposition import NMF

# Create an NMF instance: model
model = NMF(n_components=6)

# Fit the model to articles
model.fit(df.values)

# Transform the articles: nmf_features
nmf_features = model.transform(df.values)

# Print the NMF features
print(nmf_features)

[[0.02382249 0.07686806 0.16183103 0.08191129 0.03672876 0.01606364]
 [0.03047881 0.08174334 0.15751444 0.07872419 0.02760871 0.0264112 ]
 [0.04079994 0.06215671 0.14460519 0.10949552 0.01812767 0.01708183]
 ...
 [0.01025815 0.0424608  0.07942485 0.06377861 0.08672434 0.07975598]
 [0.00581827 0.04228121 0.11288064 0.11281618 0.04481131 0.06715077]
 [0.00736164 0.06108385 0.11396305 0.13986215 0.02360953 0.06205359]]


In [37]:
import pandas as pd

# Create a DataFrame: components_df
components_df = pd.DataFrame(model.components_, columns=df.columns)

for i in range(6):

    # Select row 3: component
    component = components_df.iloc[i,:]

    # Print result of nlargest
    print(component.nlargest(n=10), '\n')

weekly        2.873122
roundup       2.857011
major         2.296703
throughout    2.275808
well          2.215372
place         2.080812
related       2.047795
stories       1.741672
team          1.621238
scratched     1.568141
Name: 0, dtype: float64 

educators       1.730656
practice        1.593330
underlying      1.592862
ideas           1.582568
previous        1.539312
want            1.348962
share           1.287056
together        1.281004
particularly    1.275531
one             1.266395
Name: 1, dtype: float64 

scratched     1.453851
event         1.254336
people        1.203541
conference    1.139476
saturday      1.117457
educator      1.064019
day           1.057672
place         1.046112
join          1.014014
weekly        0.923077
Name: 2, dtype: float64 

school         2.028813
programming    1.614602
computer       1.568414
weekly         1.198388
teaching       1.188241
week           1.181601
roundup        1.176377
curriculum     1.148266
learning       1.136

In [38]:
from sklearn.cluster import KMeans

In [39]:
import collections

def get_top_words(kmeans, centers, n_words=10):
    
    top_words = collections.defaultdict(lambda: [])

    # iterate through each cluster
    for n in range(kmeans.n_clusters):

        # get the cluster centers
        arr = centers[n]

        # sorts the array and keep the last n words
        indices = arr.argsort()[-n_words:]

        # add the words to the list of words
        for i in indices:
            top_words[n].append(vocabulary[i])
    
    return top_words

In [40]:
top_10_clusters = get_top_words(kmeans_obj, clf.centroids_)
print(top_10_clusters[0])

['may', 'guide', 'computational', 'creative', 'join', 'computing', 'scratched', 'event', 'day', 'conference']


In [41]:
from bokeh.palettes import Category10

colors = Category10[10]

In [42]:
from IPython.core.display import HTML

html_text = ""


for i in range(0,kmeans_obj.n_clusters):
    words=', '.join(top_words[i])
    color = colors[i]
    text = "<p>Cluster X: <font color='"+color+"'>"+words+"</font></p>"
    html_text += text
    
HTML(html_text)

In [43]:
indices =  list(range(0, len(chunks)))

In [44]:
list_of_chunks = [' '.join(chunks[i]) for i in indices]

In [45]:
labels = [kmeans_obj.labels_[i] for i in indices]

In [46]:
palette = [colors[labels[i]] for i in indices]

In [47]:
doc_id = []
current_doc = 0
next_doc = 1

# we go through all the chunks 
for chunk in list_of_chunks:
    next_doc = current_doc + 1
    if next_doc == len(clean_docs):
        doc_id.append(current_doc)
    else:
        if chunk in clean_docs[next_doc]:
            current_doc += 1
        doc_id.append(current_doc)

In [48]:
print(len(indices))
print(len(list_of_chunks))
print(len(labels))
print(len(doc_id))
print(len(palette))

8190
8190
8190
8190
8190


In [49]:
master = {'indices': indices,
          'chunk': list_of_chunks, 
          'cluster': labels,
          'document': doc_id, 
          'palette': palette }

In [50]:
master_df = pd.DataFrame.from_dict(master)

master_df.head(10)

Unnamed: 0,indices,chunk,cluster,document,palette
0,0,may cambridge educator meetup attendees andrea...,7,0,#7f7f7f
1,1,rudnicki alina spaulding sandra thaxter gayla ...,3,0,#d62728
2,2,chance everyone little breakfast start network...,3,0,#d62728
3,3,time group decided continue conversations expl...,9,0,#17becf
4,4,michael ivan discussion around ways keep sprit...,9,0,#17becf
5,5,colin sandra try extension little bits discuss...,9,0,#17becf
6,6,raspberry used variety settings help learn com...,9,0,#17becf
7,7,sons demonstrated elaborate little bits chain ...,9,0,#17becf
8,8,suggested using strategy make snake game andre...,9,0,#17becf
9,9,projects next meetup scheduled saturday june l...,9,0,#17becf


In [51]:
from bokeh.plotting import ColumnDataSource, figure, show, output_file
from bokeh.io import output_notebook, curdoc
from bokeh.models import HoverTool, Select, Slider
from bokeh.layouts import row, column

source = ColumnDataSource(master_df)

# Create a figure with the "box_select" tool: p
p = figure(tools='box_select',x_axis_label='indices',y_axis_label='document')

# Add circle glyphs to the figure p
p.circle('indices','document', source=source, color='green', size=8)

# Specify the name of the output file and show the result
output_file('output.html')
show(p)


In [52]:
source = ColumnDataSource(master_df)
p = figure(tools='box_select',x_axis_label='indices',y_axis_label='document')
p.circle('indices','cluster', source=source, color='palette', size=8)
output_file('output.html')
show(p)


In [53]:
# Create a HoverTool: hover
hover = HoverTool(tooltips=[('chunk', '@chunk')], mode='vline')

# Add hover tool to p
p.add_tools(hover)

# Show the new output with the hover tool
output_file('output.html')
show(p)

In [54]:
def visualize_clusters(results_clustering, top_words, vocabulary):
    text = ""

    for cluster, words in top_words.items(): 
        words = " ".join(words)
        color = colors[cluster]
        text += "<p>Cluster "+str(cluster)+": <font color='"+color+"'>"+words+"</font></p>"

    return text

In [55]:
def ExtractTopicsVSM(documents, numTopics):
    ''' this functions takes in a list of documents (strings), 
        runs topic modeling (as implemented by Sherin, 2013)
        and returns the clustering results, the matrix used 
        for clustering a visualization '''
    
    # step 2: clean up the documents
    documents = clean_list_of_documents(documents)
    
    # step 3: let's build the vocabulary of these docs
    vocabulary = get_vocabulary(documents)
    
    # step 4: we build our list of 100-words overlapping fragments
    documents = flatten_and_overlap(documents)
    
    # step 5: we convert the chunks into a matrix
    df = docs_by_words_df(documents, vocabulary)
    
    # step 6: we weight the frequency of words (count = 1 + log(count))
    df.values = one_plus_log_mat(df)
    
    # step 7: we normalize the matrix
    df.values = normalize_df(df, method='Normalizer')
    
    # step 8: we compute deviatio vectors
    df = transform_deviation_vectors(df)
    
    # step 9: we apply a clustering algorithm to find topics
    results_clustering = KMeans(n_clusters=numTopics, max_iter=1000).fit(df.values)
    
    # step 10: we get the top words for each cluster
    top_words = get_top_words(results_clustering, results_clustering.cluster_centers_)
    
    # step 11: we create a visualization for the topics
    visualization = visualize_clusters(results_clustering, top_words, vocabulary)
    
    # finally, we return the clustering results, the matrix, and a visualization
    return results_clustering, df, top_words, visualization

In [85]:
posts = {}
keys = []
values = []

for thread in threads:
    keys.append(thread[18:-4])

for document in documents:
    values.append(document)

for i in range(len(keys)):
    posts[keys[i]] = values[i]

print(posts)



In [128]:
def contains_word(string, word):
    return (' ' + word + ' ') in (' ' + string + ' ')

query = input('What are you searching for? ')

results = []

counter = 0

for key,val in posts.items():
    if contains_word(posts[key], query):
        counter += 1
        #print(val)
        results.append(val)
        # print(key, val)
print('counter is ' + str(counter))

clean_results = clean_list_of_documents(results)
result_vocabulary = get_vocabulary(clean_results)

print(clean_results, result_vocabulary)
print(len(clean_results))

What are you searching for? sprite
counter is 99
99


In [148]:
# !pip3 install gensim

from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
import itertools

tokenized_docs = [word_tokenize(doc) for doc in clean_results]

dictionary = Dictionary(tokenized_docs)

query_id = dictionary.token2id.get(query)

print(query_id)

corpus = [dictionary.doc2bow(tokenized_doc) for tokenized_doc in tokenized_docs]

# Create a sorted list from the defaultdict: sorted_word_count
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 

# Print the top 5 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)

# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

for c in range(len(corpus)):
    doc = corpus[c]

    # Calculate the tfidf weights of doc: tfidf_weights
    tfidf_weights = tfidf[doc]

    # Sort the weights from highest to lowest: sorted_tfidf_weights
    sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

    # Print the top 5 weighted words
    for term_id, weight in sorted_tfidf_weights[:5]:
        print(dictionary.get(term_id), weight)

166
sprite 345
project 255
students 207
one 201
use 200
term 0.30363562131139155
children 0.21989041548468272
lyrics 0.21131387600677293
cards 0.17655307025663014
ben 0.16590114257146624
kent 0.7048432654054398
browser 0.43000612594000676
navigate 0.2349477551351466
function 0.2246874940412254
activate 0.19950724882364199
default 0.5481848189647693
export 0.3885309975512936
appendix 0.22463887985326347
customizing 0.19075340759297563
replace 0.19075340759297563
can 0.3705215565569278
area 0.32811800505018446
folders 0.3153201892949
johanna 0.3153201892949
grouped 0.28255436916549337
grader 0.23677754981134158
cartesian 0.22604765573969443
matt 0.22604765573969443
obstacles 0.22604765573969443
useful 0.16916493729914725
count 0.5149398188942689
counter 0.30152089338938287
counting 0.27671266575855535
sprites 0.22484906118826756
all 0.1981297935278541
clicked 0.40146866937095854
confusion 0.21590838052908284
snap 0.17825418621241262
consistent 0.1545931507612529
hatblock 0.15459315076125

translate 0.172889061104473
informed 0.1468097488857261
program 0.1407403524318122
java 0.2367341439081099
fish 0.23123916292799535
broadcast 0.16482816463396968
fly 0.15793124490522595
number 0.14268615203775575
webinar 0.2042756903835007
eduteka 0.19501866191532893
week 0.1808754193001679
join 0.16014953913875432
scratched 0.15958406467233327
ability 0.2567428016455887
abilitiy 0.23448227467236812
are 0.23448227467236812
differentiate 0.23448227467236812
mixed 0.23448227467236812
launch 0.3813987205106717
message 0.23414973705895442
block 0.2262055832443629
script 0.19114784207104815
clone 0.18108306620851006
flip 0.3294680061098998
match 0.3294680061098998
card 0.2878366424932541
classic 0.23590328923196383
matching 0.23590328923196383
videos 0.5251454711314975
kunal 0.3825736181223556
vimeo 0.3339444936402568
playlist 0.1912868090611778
community 0.18293323869545208
black 0.25980850450301285
known 0.2136560714565276
reopen 0.20397395836633206
tamika 0.20397395836633206
state 0.1763

In [164]:
from bokeh.layouts import row
from bokeh.plotting import figure, show, output_file

freq_words = [dictionary.get(word_id) for word_id, word_count in sorted_word_count[:5]]
freq_count =  [word_count for word_id, word_count in sorted_word_count[:5]]

# print(freq_words, freq_count)

dot = figure(title="Most Frequent Words", tools="", toolbar_location=None,
            y_range=freq_words, x_range=[0,max(freq_count) + 10])

dot.segment(0, freq_words, freq_count, freq_words, line_width=2, line_color="green", )
dot.circle(freq_count, freq_words, size=15, fill_color="orange", line_color="green", line_width=3, )

show(dot)  # open a browser

['sprite', 'project', 'students', 'one', 'use'] [345, 255, 207, 201, 200]


In [166]:
print(len(results))

99


In [188]:
import operator

posts_relevancy = {}
relevancy = []
count = 0

for key,val in posts.items():
    relevancy_score = 0
    if contains_word(posts[key], query):
        if contains_word(posts[key], freq_words[0]):
            relevancy_score += 5
        if contains_word(posts[key], freq_words[1]):
            relevancy_score += 4
        if contains_word(posts[key], freq_words[2]):
            relevancy_score += 3
        if contains_word(posts[key], freq_words[3]):
            relevancy_score += 2
        if contains_word(posts[key], freq_words[4]):
            relevancy_score += 1
    relevancy.append(relevancy_score)
#     print('Thread ' + str(key) + ' has a relevancy score of ' + str(relevancy_score))
    
for i in range(len(keys)):
    posts_relevancy[keys[i]] = relevancy[i]
    
print(sorted(relevancy, reverse=True))

most_relevant_posts = dict(sorted(posts_relevancy.items(), key=operator.itemgetter(1), reverse=True)[:5])

print(most_relevant_posts)

[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [197]:
result_keys = list(most_relevant_posts.keys())

for i in result_keys:
    print(posts[i][:500])
    print('---')

at this morning's session, we developed a list of questions that we could ask computational creators about their project: what was your inspiration for this project? how did you do that? (about a particular aspect of the project) what did you get stuck on? what are you most proud of? what will you do next with your project? think about a project you've worked on since you arrived -- and respond to one (or several) of these questions in the comments. sincerely, karen, on behalf of the creative co
---
thanks to everyone who volunteered to test the scratch 2.0 alpha. i've sent an email to you* with instructions about how to access the site. if you have any questions/observations/suggestions, please share them here. (if you didn't get in on this round of testing, not to fear, information about the next round will be available in early december.) thanks -- and have fun with the prototype! k * the following scratch usernames have access to the alpha: aaronleemorris, acoll, agarci17, amurphy9