In [23]:
import numpy as np
import pandas as pd
import nltk
import spacy
import string 
import re
import os
import math
import codecs
from sklearn import feature_extraction
import glob

# Data Import


In [3]:
files = glob.glob('./studios/*.csv')
print(files)

['./studios/studio19.csv', './studios/studio18.csv', './studios/studio20.csv', './studios/studio21.csv', './studios/studio8.csv', './studios/studio9.csv', './studios/studio7.csv', './studios/studio6.csv', './studios/studio4.csv', './studios/studio5.csv', './studios/studio1.csv', './studios/studio2.csv', './studios/studio3.csv', './studios/studio10.csv', './studios/studio11.csv', './studios/studio13.csv', './studios/studio12.csv', './studios/studio16.csv', './studios/studio17.csv', './studios/studio15.csv', './studios/studio14.csv']


In [5]:
# All CSV in a dataframe

dfs = []

# iterate over dataframe; create list of dataframes; add studio column
for i,filename in enumerate(files):
    filenum = re.findall(r'\d+',str(filename))
    filenum = int(filenum[0])-1
    df = pd.read_csv(filename, index_col=None, header=0)
    df.insert(0,'studio',filenum)
    dfs.append(df)
    
# concatenate list of dataframes
frame = pd.concat(dfs, axis=0, ignore_index=True, sort=False)

In [6]:
frame.head(10)

Unnamed: 0,studio,id,title,description,instructions,author/id,image,history/created,history/modified,history/shared,...,stats/remixes,remix/parent,remix/root,Unnamed: 16,description (full),instruction (full),description (blank),instruction (blank),description.1,instructions.1
0,18,237042971,Steven and the Stevens [MV],Animation is hard! And time consuming! But aft...,Press the green flag to watch! (Inspired by St...,2745846,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-22T00:59:37.000Z,2018-07-23T09:27:01.000Z,2018-07-23T09:25:28.000Z,...,0,,,,109.0,118.0,18.0,9.0,,
1,18,237100206,Grumpy Bubbles,"Big thanks to @jsh for his Day 1 project, Rive...","First, check out @jsh's River Waltz: https://s...",39526,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-23T00:25:41.000Z,2018-07-23T10:07:59.000Z,2018-07-23T10:07:59.000Z,...,0,235484400.0,235484400.0,,,,,,,
2,18,237093421,Raindrops,I've found that I don't always initially consi...,Press the green flag.,25705937,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-22T22:22:17.000Z,2018-07-23T13:33:50.000Z,2018-07-23T10:37:53.000Z,...,0,,,,,,,,,
3,18,237098671,Night Rain,After simulating rain drops for today's challe...,"Press the green flag, allow for the microphone...",25705937,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-23T00:06:28.000Z,2018-07-23T11:03:43.000Z,2018-07-23T10:38:58.000Z,...,0,,,,,,,,,
4,18,237111696,Lights,This project was created for Day 19 of Getting...,Enjoy the procession of the clones!,56239,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-23T03:16:00.000Z,2018-07-23T17:44:58.000Z,2018-07-23T10:59:22.000Z,...,0,,,,,,,,,
5,18,62591162,Slim Cantore - Penquin Weather Channel Event,\n\n,This project is based on a Weather Story by We...,4615776,https://cdn2.scratch.mit.edu/get_image/project...,2015-05-16T05:27:51.000Z,2018-07-23T11:12:00.000Z,2015-05-21T00:13:38.000Z,...,0,24001065.0,24001065.0,,,,,,,
6,18,237136651,Unstuck Day 19 Flowers,Three flowers on one stem is a variation from ...,Thanks for today's challenge! I have used clon...,14632339,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-23T10:50:19.000Z,2018-07-23T19:44:57.000Z,2018-07-23T11:04:11.000Z,...,0,,,,,,,,,
7,18,237134693,Unstuck Day 19: Using Clones,I did this project for Getting Unstuck challen...,"Play with ripples! \nJust added sound, too.\nS...",214174,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-23T10:11:35.000Z,2018-07-23T12:17:09.000Z,2018-07-23T11:32:06.000Z,...,0,,,,,,,,,
8,18,237138514,Day19,,Click on the green flag to start,34225248,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-23T11:27:54.000Z,2018-07-23T11:38:35.000Z,2018-07-23T11:38:22.000Z,...,0,,,,,,,,,
9,18,237138676,#019 Getting Unstuck Clones,I didn't have a lot of time to do this today.....,Move mouse around for clones...,20126730,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-23T11:31:36.000Z,2018-07-23T11:44:12.000Z,2018-07-23T11:41:27.000Z,...,0,,,,,,,,,


# Data Cleaning


In [7]:
#drop the unnecessary columns
badColumns = ["stats/comments","stats/remixes","Unnamed: 16",
              "description (full)","instruction (full)","description (blank)",
              "instruction (blank)","description.1","instructions.1"]

for badColumn in badColumns:
    frame = frame.drop(badColumn,axis='columns')

In [8]:
# import all_studios CSV to get the author ID

dfAuthor = pd.read_csv('all_studios.csv', index_col=None, header=0)

In [10]:
# Look for the author username; add column to dataframe. 

dfAuthor = dfAuthor.rename(index=str, columns={"project_author_id": "author/id"})

dfAuthor.head()

frame = frame.rename(index=str, columns={"id": "project_id"})
frame.head()

merged_df = pd.merge(frame, dfAuthor, how = 'inner', on = ['project_id','author/id'])

In [11]:
# Merge the two columns of stuff people wrote

merged_df["writing"] = merged_df["description"].map(str) + merged_df["instructions"]

In [12]:
# moving the columns I want to the front.
cols = merged_df.columns.tolist()
print(cols)

cols = ['studio', 'project_id', 'title', 'description', 'instructions', 'writing', 'author/id', 
        'project_author_username','image', 
        'history/created', 'history/modified', 'history/shared', 'stats/views', 'stats/loves', 
        'stats/favorites', 'remix/parent', 'remix/root', 'studio_numberscript_count', 
        'variable_count', 'list_count', 'comment_count', 'costume_count', 'sprite_count', 'block_count', 
        'block_unique_count', 'random_block_count', 'Unnamed: 12', ]

merged_df = merged_df[cols]

merged_df.tail()

['studio', 'project_id', 'title', 'description', 'instructions', 'author/id', 'image', 'history/created', 'history/modified', 'history/shared', 'stats/views', 'stats/loves', 'stats/favorites', 'remix/parent', 'remix/root', 'project_author_username', 'studio_numberscript_count', 'variable_count', 'list_count', 'comment_count', 'costume_count', 'sprite_count', 'block_count', 'block_unique_count', 'random_block_count', 'Unnamed: 12', 'writing']


Unnamed: 0,studio,project_id,title,description,instructions,writing,author/id,project_author_username,image,history/created,...,studio_numberscript_count,variable_count,list_count,comment_count,costume_count,sprite_count,block_count,block_unique_count,random_block_count,Unnamed: 12
3758,13,237860907,Io trovo le lettere,Getting Unstuck 14\nCreare \nCreare un progett...,rispondi,Getting Unstuck 14\nCreare \nCreare un progett...,21462276,ratogi,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-31T13:37:38.000Z,...,14,2,0,0,0,4,1,19,12,1
3759,13,237999490,Getting Unstuck - Day 14,The challenge for day 14 is to create a projec...,Click the green arrow and enter your name when...,The challenge for day 14 is to create a projec...,24696341,GCarganilla,https://cdn2.scratch.mit.edu/get_image/project...,2018-08-01T22:09:01.000Z,...,14,2,0,1,0,6,1,21,13,0
3760,13,237792845,GETTING UNSTUCK14: FIGURES,,"PRESS THE GREEN FLAG\nPLEASE, WRITE IN BLOCK L...","nanPRESS THE GREEN FLAG\nPLEASE, WRITE IN BLOC...",16573108,paolaernesta,https://cdn2.scratch.mit.edu/get_image/project...,2018-07-30T20:32:00.000Z,...,14,23,4,0,0,11,6,77,22,1
3761,13,238433917,day 14,Create a project that uses the string blocks i...,,,24618980,danaespiliadi,https://cdn2.scratch.mit.edu/get_image/project...,2018-08-07T09:55:31.000Z,...,14,3,5,0,0,4,1,52,20,4
3762,13,238271691,Getting Unstuck - Day 14,"When I was younger, I would always create stag...",Press the green button and wait to hear instru...,"When I was younger, I would always create stag...",25334812,heyjessi,https://cdn2.scratch.mit.edu/get_image/project...,2018-08-05T12:55:06.000Z,...,14,1,4,0,0,3,1,42,11,0


In [14]:
# language detection
from langdetect import DetectorFactory
DetectorFactory.seed = 0 

# add a column of the language (highest probability of the language)
from langdetect import detect
merged_df["language"] = merged_df["writing"].apply(lambda x: detect(str(x)))

# Stemming

In [89]:
# English dataframe
# also ignores projects with no reflections
dfEng = merged_df[merged_df['language'] == 'en']
dfEng.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3015 entries, 0 to 3762
Data columns (total 28 columns):
studio                       3015 non-null int64
project_id                   3015 non-null int64
title                        3015 non-null object
description                  2550 non-null object
instructions                 3015 non-null object
writing                      3015 non-null object
author/id                    3015 non-null int64
project_author_username      3015 non-null object
image                        3015 non-null object
history/created              3015 non-null object
history/modified             3015 non-null object
history/shared               3015 non-null object
stats/views                  3015 non-null int64
stats/loves                  3015 non-null int64
stats/favorites              3015 non-null int64
remix/parent                 271 non-null float64
remix/root                   271 non-null float64
studio_numberscript_count    3015 non-null int64


In [90]:
# tokenizing & stemming
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [92]:
texts = dfEng["writing"].dropna().tolist()
len(texts)

3015

In [274]:
# copied from here: https://github.com/brandomr/document_cluster/blob/master/cluster_analysis_web.ipynb
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

# things to filter out
stopwords = nltk.corpus.stopwords.words('english') + list(string.punctuation)
RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
username_re = re.compile(r'@([A-Za-z0-9_]+)')
scratch_words = ['green','click','start','flag', 'created','getting','scratch','projects','days','one',
                'sprite','blocks','use','\'s','https', 'project','clicks']

def text_cleaning(doc): 
    # replace return carriage with white space
    doc = doc.replace('\n', ' ')
    # remove usernames
    doc = username_re.sub(r'', doc)
    # remove numbers
    for i in range(10):
        doc = doc.replace(str(i), ' ')
    # remove stop words
    for stop_word in stopwords:
        doc = doc.replace(' ' + stop_word + ' ', ' ')
    # remove scratchwords
    for scratch_word in scratch_words:
        doc = doc.replace(' ' + scratch_word + ' ', ' ')
    # lowercase
    words = doc.split()
    doc = ' '.join([word.lower() for word in words])
    # remove single characters
    doc = [x for x in doc.split() if len(x) > 2]
    doc = " ".join(doc)
    doc = RE_EMOJI.sub(r'', doc)
    return doc

def tokenize_and_stem(text):
    text = text_cleaning(text)
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    text = text_cleaning(text)
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [275]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in texts:
    allwords_stemmed = tokenize_and_stem(i) #for each item, tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [276]:
print(allwords_stemmed[:100])

['when', 'younger', 'would', 'alway', 'creat', 'stage', 'name', 'onlin', 'form', 'want', 'recreat', 'process', 'use', 'scratch', 'found', 'relat', 'simpl', 'know', 'expand', 'option', 'includ', 'input', 'immut', 'fact', 'birthday', 'hair', 'color.press', 'button', 'wait', 'hear', 'instruct', 'roy']


In [287]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

# vocab_frame.head()

there are 113047 items in vocab_frame


Unnamed: 0,words
anim,animation
hard,hard
and,and
time,time
consum,consuming


# TF-IDF

In [278]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000,
                                 min_df=0.1, norm = 'l2', stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(texts) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

  'stop_words.' % sorted(inconsistent))


CPU times: user 8.67 s, sys: 1.25 s, total: 9.92 s
Wall time: 12.8 s
(3015, 40)


In [279]:
terms = tfidf_vectorizer.get_feature_names()
# print(terms)

In [280]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

# K-means

In [281]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 7.14 s, sys: 59.4 ms, total: 7.2 s
Wall time: 7.55 s


In [282]:
from sklearn.externals import joblib

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [283]:
dfClusterEng= dfEng[['studio', 'project_id', 'title', 'writing', 'author/id', 
        'project_author_username']]
dfClusterEng = dfClusterEng.assign(clusters = clusters)
dfClusterEng.head()


Unnamed: 0,studio,project_id,title,writing,author/id,project_author_username,clusters
0,18,237042971,Steven and the Stevens [MV],Animation is hard! And time consuming! But aft...,2745846,paulinah,0
1,18,237100206,Grumpy Bubbles,"Big thanks to @jsh for his Day 1 project, Rive...",39526,karenb,6
2,18,237093421,Raindrops,I've found that I don't always initially consi...,25705937,alirb,7
3,18,237098671,Night Rain,After simulating rain drops for today's challe...,25705937,alirb,8
4,18,237111696,Lights,This project was created for Day 19 of Getting...,56239,jsh,9


In [284]:
# len(clusters)

# dfClusterEng.info()

In [285]:
dfClusterEng['clusters'].value_counts()

6    703
9    425
0    392
3    280
4    249
2    234
7    220
1    203
5    181
8    128
Name: clusters, dtype: int64

In [286]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print() #add whitespace
    print() #add whitespace
    
#     print("Cluster %d titles:" % i, end='')
#     for title in dfEng.loc[i]['title'].values.tolist():
#         print(' %s,' % title, end='')
#     print() #add whitespace
#     print() #add whitespace
    
print()

Top terms per cluster:

Cluster 0 words: time, would, like, make, used, getting, create, wanted, different, try,

Cluster 1 words: random, used, block, create, days, change, unstuck, time, try, press,

Cluster 2 words: play, start, block, make, getting, wanted, challenge, fun, create, project,

Cluster 3 words: arrow, move, key, getting, change, make, used, space, game, time,

Cluster 4 words: press, space, key, change, create, make, start, getting, used, see,

Cluster 5 words: game, make, used, time, create, arrow, move, getting, play, space,

Cluster 6 words: used, https, change, see, 's, click, background, project, add, create,

Cluster 7 words: work, time, make, try, used, code, getting, create, wanted, 's,

Cluster 8 words: flag, press, https, make, click, 's, try, getting, change, days,

Cluster 9 words: unstuck, days, unstuck, getting, getting, create, used, challenge, flag, sprites,


