## Data Cleaning

In [None]:
import csv
import os
import shutil

In [None]:
# Start from an empty txt_files folder: remove any previous copy, then recreate it.

path = '/Users/erincarvalho/Desktop/dev/final-project-Erin-c'
txt_dir = path + '/txt_files'
if os.path.isdir(txt_dir):
    shutil.rmtree(txt_dir)
os.mkdir(txt_dir)

In [None]:
# Creates a separate text file for each topic with all posts and replies from
# ScratchEd_all_data.csv. Column 0 of a row names the topic file and column 3
# is the text appended to it.

ids = []          # topic ids in first-seen order
seen_ids = set()  # same ids, for constant-time membership tests

# newline='' lets the csv module handle line breaks inside quoted fields itself
with open('ScratchEd_all_data.csv', "r", encoding='utf-8', errors='ignore', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    count = 0

    for row in csv_reader:
        # blank or truncated rows have no text column to export
        if len(row) < 4:
            continue

        topic_id = str(row[0])
        filename = path + '/txt_files/topic_' + topic_id + '.txt'

        # append mode keeps earlier posts of the same topic; the with-block closes
        # the handle after every row, and utf-8 matches how the files are read back
        with open(filename, 'a+', encoding='utf-8') as file:
            file.write(str(row[3]) + '\r\n' + '\r\n')

        if topic_id not in seen_ids:
            seen_ids.add(topic_id)
            ids.append(topic_id)
            count += 1

    print(count)
    print(len(ids))

In [None]:
import glob

# Save the paths of all the text files in a list.
# glob returns matches in arbitrary order, so sort them to keep the document
# order (and everything indexed by it) reproducible between runs.
# NOTE: the pattern is relative, so the working directory must be the folder
# that contains txt_files.

threads = sorted(glob.glob('./txt_files/*.txt'))
print(len(threads))

In [None]:
# Read every thread file and keep its text, lower-cased, in `documents`
# (same order as `threads`).

documents = []

for thread in threads:
    with open(thread, "r", encoding='utf-8', errors='ignore') as t:
        documents.append(t.read().lower())

In [None]:
# Substrings that clean_list_of_documents replaces with a space.
# Besides ASCII punctuation the list holds a few typographic characters
# (curly quotes, a Unicode minus sign). Some entries repeat ('.' and ',').
punctuation = ['.', '...', '!', '#', '"', '%', '$', "'", '&', ')', 
               '(', '+', '*', '-', ',', '/', '.', ';', ':', '=', 
               '<', '?', '>', '@', '",', '".', '[', ']', '\\', ',',
               '_', '^', '`', '{', '}', '|', '~', '−', '”', '“', '’']

# Words that clean_list_of_documents removes from the documents.
# The list mixes common English function words with URL fragments
# ('http', 'www', 'com', ...), month names and a number of other tokens;
# a few entries appear twice ('should', 'very').
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 
              'ourselves', 'you', 'your', 'yours', 'yourself', 
              'yourselves', 'he', 'him', 'his', 'himself', 'she', 
              'her', 'hers', 'herself', 'it', 'its', 'itself', 
              'they', 'them', 'their', 'theirs', 'themselves', 
              'what', 'which', 'who', 'whom', 'this', 'that', 
              'these', 'those', 'am', 'is', 'are', 'was', 'were', 
              'be', 'been', 'being', 'have', 'has', 'had', 'having', 
              'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 
              'but', 'if', 'or', 'because', 'as', 'until', 'while', 
              'of', 'at', 'by', 'for', 'with', 'about', 'against', 
              'between', 'into', 'through', 'during', 'before', 
              'after', 'above', 'below', 'to', 'from', 'up', 'down', 
              'in', 'out', 'on', 'off', 'over', 'under', 'again', 
              'further', 'then', 'once', 'here', 'there', 'when', 
              'where', 'why', 'how', 'all', 'any', 'both', 'each', 
              'few', 'more', 'most', 'other', 'some', 'such', 'no', 
              'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
              'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 
              'now', 'http', 'https', 'edu', 'www', 'com', 'scratch', 
              'mit', 'org', 'would', 'should', 'could', 'might', 'really', 
              'very', 'good', 'great', 'best', 'karen', '྾explore',
              '྾interact', '྾network', 'get', 'also', 'let', 'much', 'use', 
              'les', 'ver', 'post', 'est', 'oscar', 'con', 'las', 'para',
              'student', 'projects', 'january', 'february', 'march', 'april',
              'may', 'june', 'july', 'august', 'september', 'october', 
              'november','december']

In [None]:
def clean_list_of_documents(documents, punct_chars=None, stopword_list=None):
    '''Clean a list of raw documents and return the cleaned strings.

    For each document, line breaks, punctuation and digits are replaced by
    spaces; the text is split on whitespace; stop words and tokens of one or
    two characters are dropped; the remaining tokens are joined with single
    spaces. No stemming happens here.

    documents: iterable of strings (lower-casing is left to the caller).
    punct_chars: substrings to blank out; defaults to the notebook-level
        `punctuation` list.
    stopword_list: words to drop; defaults to the notebook-level
        `stop_words` list.

    Returns a list with one cleaned string per input document.
    '''
    if punct_chars is None:
        punct_chars = punctuation
    if stopword_list is None:
        stopword_list = stop_words
    ignored = set(stopword_list)

    cleaned_docs = []

    for doc in documents:
        # remove new lines and carriage returns
        doc = doc.replace('\n', ' ').replace('\r', ' ')
        # remove punctuation
        for punc in punct_chars:
            doc = doc.replace(punc, ' ')
        # remove numbers
        for digit in '0123456789':
            doc = doc.replace(digit, ' ')
        # Filter token by token: a ' word ' substring replace misses a stop
        # word that is repeated back to back or sits at the very start or end
        # of the document.
        tokens = [tok for tok in doc.split()
                  if len(tok) > 2 and tok not in ignored]
        cleaned_docs.append(" ".join(tokens))

    return cleaned_docs

In [None]:
# Clean every document, then show the first 100 characters of the first one
# as a sanity check.

clean_docs = clean_list_of_documents(documents)

first_clean_doc = clean_docs[0]
print(first_clean_doc[:100])

## Vocabulary

In [None]:
# !pip3 install nltk
# !nltk.download("wordnet", "./")

import math
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import Counter, defaultdict

In [None]:
def get_vocabulary(documents):
    '''Build a sorted vocabulary of lemmas from a list of cleaned documents.

    Each document is tokenized and POS-tagged; every token is lemmatized with
    the WordNet part of speech matching the first letter of its tag (noun when
    there is no match). A lemma is kept only if WordNet has at least one
    synset for it.

    documents: iterable of strings.
    Returns a sorted list of unique lemmas.
    '''
    lemmatizer = WordNetLemmatizer()

    # first letter of the tag -> WordNet part of speech; anything else is a noun
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    lemmas = set()

    # iterate over the argument (not a notebook-level list) so the function
    # also works for other document collections, e.g. search results
    for document in documents:
        tokens = word_tokenize(document)
        for word, tag in pos_tag(tokens):
            lemma = lemmatizer.lemmatize(word, tag_map[tag[0]])
            # skip the WordNet lookup for lemmas that are already known
            if lemma not in lemmas and wn.synsets(lemma):
                lemmas.add(lemma)

    return sorted(lemmas)

In [None]:
# Build the vocabulary of the cleaned corpus and report how many words it has.
vocabulary = get_vocabulary(clean_docs)
print(len(vocabulary))

In [None]:
def flatten_and_overlap(documents, window_size=100, overlap=25):
    '''Cut the concatenated documents into overlapping word windows.

    All documents are joined into one word sequence, and a window of
    `window_size` words is slid over it.

    documents: iterable of strings.
    window_size: number of words per chunk.
    overlap: number of words the window advances between chunks (despite its
        name this is the step, so consecutive chunks share
        window_size - overlap words).

    Returns a list of chunks, each a list of `window_size` words. Trailing
    words that do not fill a whole window are not returned.
    Raises ValueError if window_size or overlap is not positive.
    '''
    if window_size <= 0 or overlap <= 0:
        # a zero step would never advance the window
        raise ValueError("window_size and overlap must be positive")

    # join with a space so the last word of one document does not fuse with
    # the first word of the next one
    flat = " ".join(documents).split()

    new_list_of_documents = []
    high = window_size
    # <= so that a window ending exactly on the last word is kept
    while high <= len(flat):
        new_list_of_documents.append(flat[high - window_size:high])
        high += overlap
    return new_list_of_documents

In [None]:
# Cut the cleaned corpus into word windows using the default window size and step.
chunks = flatten_and_overlap(clean_docs)

In [None]:
import pandas as pd
# Preview: an all-zero frame with one row per chunk and one column per
# vocabulary word; info() reports its size and dtypes.
df = pd.DataFrame(0, index=np.arange(len(chunks)), columns=vocabulary)
df.info()

In [None]:
def docs_by_words_df(chunks, vocabulary):
    '''Build the chunk-by-word count matrix as a DataFrame.

    chunks: sequence of word lists.
    vocabulary: sequence of words; these become the columns.

    Returns a DataFrame of integer counts with one row per chunk (index
    0..len(chunks)-1) and one column per vocabulary word. Words that are not
    in the vocabulary are ignored.
    '''
    column_of = {word: j for j, word in enumerate(vocabulary)}
    counts = np.zeros((len(chunks), len(vocabulary)), dtype=np.int64)

    # Count each chunk once and write into a plain array; incrementing
    # DataFrame cells one word at a time through .loc is far slower.
    for i, chunk in enumerate(chunks):
        for word, occurrences in Counter(chunk).items():
            j = column_of.get(word)
            if j is not None:
                counts[i, j] = occurrences

    return pd.DataFrame(counts, index=np.arange(len(chunks)), columns=vocabulary)

In [None]:
# Show the first chunk (a list of words) and the total number of chunks.
print(chunks[0])
print(len(chunks))

In [None]:
# Build the count matrix, then look up how often 'school' occurs in chunk 0
# (raises KeyError if 'school' is not a vocabulary word).
df = docs_by_words_df(chunks, vocabulary)
df.loc[0,'school']

In [None]:
def one_plus_log(cell):
    '''Return 1 + ln(cell) for a non-zero count, and 0 for a zero count.'''
    return 1 + math.log(cell) if cell != 0 else 0

In [None]:
# Weight every count with one_plus_log.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and is
# deprecated, so use the new name when it exists and fall back otherwise.
_elementwise = df.map if hasattr(df, 'map') else df.applymap
df_log = _elementwise(one_plus_log)

In [None]:
def one_plus_log_mat(df):
    '''Return the 1 + log weighted values of a count DataFrame as a float array.

    Non-zero cells become 1 + ln(cell); zero cells stay 0. The input frame is
    not modified.

    df: DataFrame of non-negative counts.
    Returns a NumPy float array with the same shape as df.
    '''
    # Vectorized instead of one Python call per cell; this also avoids the
    # deprecated DataFrame.applymap.
    counts = df.to_numpy(dtype=float)
    weighted = np.zeros_like(counts)
    nonzero = counts != 0
    weighted[nonzero] = 1 + np.log(counts[nonzero])
    return weighted

In [None]:
# Compare the raw count, the weight computed by hand and the stored weight.
# one_plus_log is used for the hand computation because math.log raises a
# ValueError when the count is 0.
print("before one + log: ", df.loc[0,'school'])
print("after one + log: ", one_plus_log(df.loc[0,'school']))
print("Value in the dataframe: ", df_log.loc[0,'school'])

In [None]:
from sklearn.preprocessing import Normalizer

# Normalize the weighted frame in place with sklearn's Normalizer, then
# display a slice of the columns.
scaler = Normalizer()
all_columns = df_log.columns
df_log[all_columns] = scaler.fit_transform(df_log[all_columns])
df_log[all_columns[100:600]]

In [None]:
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

def normalize_df(df, method='Normalizer'):
    '''Normalize all columns of df in place with the chosen sklearn scaler.

    method: 'Normalizer', 'MinMaxScaler' or 'StandardScaler'; any other value
        leaves df untouched.
    Returns the same DataFrame object that was passed in.
    '''
    # pick the normalization strategy; unknown names mean "do nothing"
    if method == 'Normalizer':
        scaler = Normalizer()
    elif method == 'MinMaxScaler':
        scaler = MinMaxScaler()
    elif method == 'StandardScaler':
        scaler = StandardScaler()
    else:
        return df

    # overwrite every column with its normalized values
    df[df.columns] = scaler.fit_transform(df[df.columns])
    return df

In [None]:
# Sum the rows of the weighted matrix column by column (one total per word).
v_sum = np.sum(df_log.values, axis=0)

In [None]:
def vector_length(u):
    '''Return the Euclidean length of the vector u.'''
    squared = np.dot(u, u)
    return np.sqrt(squared)

def length_norm(u):
    '''Return u divided by its Euclidean length.'''
    norm = vector_length(u)
    return u / norm

# Scale the summed vector to unit length; it serves as the average direction.
v_avg = length_norm(v_sum)

In [None]:
# Step-by-step version of transform_deviation_vectors, kept for exploration.
# Work on a copy: df_log.values can be a view of the frame, and writing into
# it here would change df_log before it is passed to
# transform_deviation_vectors, applying the transformation twice.
matrix = df_log.values.copy()

for row in range(matrix.shape[0]):

    # this is one vector (row)
    v_i = matrix[row,:]

    # we subtract its component along v_avg
    scalar = np.dot(v_i,v_avg)
    sub = v_avg * scalar

    # we replace the row by the deviation vector
    matrix[row,:] = length_norm(v_i - sub)

In [None]:
def vector_length(u):
    '''Return the Euclidean length of the vector u.'''
    squared = np.dot(u, u)
    return np.sqrt(squared)

def length_norm(u):
    '''Return u divided by its Euclidean length.'''
    norm = vector_length(u)
    return u / norm

def transform_deviation_vectors(df):
    '''Replace every row of df by its unit-length deviation vector.

    The rows are summed and the sum is scaled to unit length (the average
    direction). From each row its component along that direction is
    subtracted, and the remainder is scaled to unit length. Rows whose
    remainder has zero length are left as zeros instead of becoming NaN.

    df: numeric DataFrame with one vector per row; it is overwritten in place.
    Returns the same DataFrame object.
    '''
    # explicit float copy: we never depend on .values being a writable view
    matrix = df.to_numpy(dtype=float, copy=True)
    if matrix.size == 0:
        return df

    # compute the sum of the vectors and normalize it (average direction)
    v_sum = matrix.sum(axis=0)
    sum_length = np.sqrt(np.dot(v_sum, v_sum))
    if sum_length == 0:
        # all rows are zero: there is no direction to remove
        return df
    v_avg = v_sum / sum_length

    # subtract from every row its component along v_avg
    projections = matrix @ v_avg
    deviations = matrix - np.outer(projections, v_avg)

    # scale each deviation vector to unit length, skipping zero-length rows
    lengths = np.sqrt((deviations ** 2).sum(axis=1))
    nonzero = lengths > 0
    deviations[nonzero] /= lengths[nonzero, np.newaxis]
    deviations[~nonzero] = 0.0

    # write the result back into the frame the caller handed in
    df[df.columns] = deviations
    return df

In [None]:
# Convert the normalized, weighted rows into deviation vectors; `df` is the
# frame returned by transform_deviation_vectors.
df = transform_deviation_vectors(df_log)

In [None]:
import collections
from sklearn.cluster import KMeans
# Cluster the deviation vectors into 10 groups with k-means.
kmeans_obj = KMeans(n_clusters=10, max_iter=1000).fit(df.values)

n_words = 10
top_words = collections.defaultdict(list)

# For every cluster centre, print and store the n_words vocabulary entries
# with the largest coordinates (ascending, so the largest comes last).
for n, center in enumerate(kmeans_obj.cluster_centers_):

    print(f'CLUSTER {n + 1}: ', end='')

    for i in center.argsort()[-n_words:]:
        print(vocabulary[i], end=', ')
        top_words[n].append(vocabulary[i])

    print('')

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Hierarchical (Ward linkage) clustering of the same vectors into 10 groups;
# `label` holds the cluster assigned to each row.
ward = AgglomerativeClustering(n_clusters=10, linkage='ward')
ward.fit(df.values)
label = ward.labels_

print(f"Number of points: {label.size}")

In [None]:
# NearestCentroid is exposed by the public sklearn.neighbors namespace; the
# private sklearn.neighbors.nearest_centroid module path is no longer
# importable in current scikit-learn releases.
from sklearn.neighbors import NearestCentroid
import numpy as np

# Fit on the Ward labels so that clf.centroids_ holds the centroid of each
# Ward cluster.
clf = NearestCentroid()
clf.fit(df.values, label)

print(clf.centroids_.shape)

In [None]:
def visualize_clusters(df, n_clusters, centroids, n_words=10, printed=True):
    '''Collect, and optionally print, the top words of each cluster.

    df: DataFrame whose columns are the vocabulary words.
    n_clusters: number of clusters to report.
    centroids: indexable by cluster number; each entry is an array with one
        coordinate per column of df.
    n_words: how many words to keep per cluster.
    printed: when True, print one "CLUSTER k: ..." line per cluster; when
        False, print nothing.

    Returns a dict mapping cluster number to its n_words highest-weighted
    words, in ascending order of weight.
    '''
    words = {}
    vocabulary = df.columns
    for n in range(n_clusters):
        # indices of the largest coordinates of this centroid
        indices = centroids[n].argsort()[-n_words:]
        words[n] = [vocabulary[i] for i in indices]
        if printed:
            print('CLUSTER ' + str(n+1) + ': ', end='')
            for word in words[n]:
                print(word, end=', ')
            # end the line only when something was printed for this cluster
            print('')
    return words

# Top words per cluster, taken from the NearestCentroid centroids
# (this rebinds top_words).
top_words = visualize_clusters(df, clf.centroids_.shape[0], clf.centroids_)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of df in place with MinMaxScaler
# (presumably because NMF expects non-negative input — TODO confirm).
scaler = MinMaxScaler()

df[df.columns] = scaler.fit_transform(df[df.columns])

# Import NMF
from sklearn.decomposition import NMF

# Create an NMF model with 6 components
model = NMF(n_components=6)

# Fit the model to the chunk-by-word matrix
model.fit(df.values)

# Project the same matrix onto the fitted components: nmf_features
nmf_features = model.transform(df.values)

# Print the NMF features
print(nmf_features)

In [None]:
import pandas as pd

# One row per NMF component, one column per vocabulary word.
components_df = pd.DataFrame(model.components_, columns=df.columns)

# For every component, print its 10 highest-weighted words.
for i in range(components_df.shape[0]):
    component = components_df.iloc[i,:]
    print(component.nlargest(n=10), '\n')

In [None]:
from sklearn.cluster import KMeans

In [None]:
import collections

def get_top_words(kmeans, centers, n_words=10, words=None):
    '''Return the highest-weighted words of every cluster.

    kmeans: object with an `n_clusters` attribute.
    centers: indexable by cluster number; each entry is an array with one
        coordinate per vocabulary word.
    n_words: how many words to keep per cluster.
    words: vocabulary matching the coordinates of `centers`; defaults to the
        notebook-level `vocabulary`, which is only right when the centres
        were computed from that same vocabulary.

    Returns a defaultdict mapping cluster number to a list of n_words words,
    in ascending order of weight.
    '''
    vocab = vocabulary if words is None else words
    top_words = collections.defaultdict(list)

    # iterate through each cluster
    for n in range(kmeans.n_clusters):
        # argsort is ascending, so the last n_words indices are the largest
        for i in centers[n].argsort()[-n_words:]:
            top_words[n].append(vocab[i])

    return top_words

In [None]:
# NOTE(review): the cluster count is read from kmeans_obj while the centres
# come from clf (fitted on the Ward labels); both use 10 clusters here —
# confirm that mixing the two is intended.
top_10_clusters = get_top_words(kmeans_obj, clf.centroids_)
print(top_10_clusters[0])

In [None]:
from bokeh.palettes import Category10

# The 10-entry variant of bokeh's Category10 palette, indexed by cluster number below.
colors = Category10[10]

In [None]:
from IPython.core.display import HTML

# Build an HTML legend: one coloured line of top words per cluster.
html_parts = []

for i in range(kmeans_obj.n_clusters):
    words = ', '.join(top_words[i])
    # wrap around so a cluster count above the palette size cannot raise
    color = colors[i % len(colors)]
    # show the cluster number (the same 0-based number as kmeans_obj.labels_)
    # instead of a literal "X"
    html_parts.append("<p>Cluster " + str(i) + ": <font color='" + color + "'>" + words + "</font></p>")

html_text = "".join(html_parts)

HTML(html_text)

In [None]:
# Position of every chunk: 0, 1, ..., len(chunks) - 1.
indices = list(range(len(chunks)))

In [None]:
# Each chunk as a single space-separated string, in chunk order.
list_of_chunks = [' '.join(chunks[position]) for position in indices]

In [None]:
# k-means cluster label of every chunk, in chunk order.
cluster_of = kmeans_obj.labels_
labels = [cluster_of[position] for position in indices]

In [None]:
# Colour of every chunk, looked up from its cluster label.
palette = [colors[labels[position]] for position in indices]

In [None]:
# Assign to every chunk the index of a document in clean_docs.
# The pointer current_doc starts at 0 and moves forward by one whenever the
# chunk text is found inside the NEXT document; once the last document is
# reached it stays there.
# NOTE(review): a chunk only matches when its whole text is a substring of the
# next document, so documents shorter than a chunk can never be matched —
# confirm that this heuristic gives the intended document ids.
doc_id = []
current_doc = 0
next_doc = 1

# we go through all the chunks 
for chunk in list_of_chunks:
    next_doc = current_doc + 1
    if next_doc == len(clean_docs):
        # already at the last document: nothing further to advance to
        doc_id.append(current_doc)
    else:
        if chunk in clean_docs[next_doc]:
            current_doc += 1
        doc_id.append(current_doc)

In [None]:
# All five columns must have the same length before they go into one table.
for column in (indices, list_of_chunks, labels, doc_id, palette):
    print(len(column))

In [None]:
# Column name -> values for the per-chunk summary table.
master = dict(
    indices=indices,
    chunk=list_of_chunks,
    cluster=labels,
    document=doc_id,
    palette=palette,
)

In [None]:
# One row per chunk; show the first ten rows.
master_df = pd.DataFrame(master)

master_df.head(10)

In [None]:
from bokeh.plotting import ColumnDataSource, figure, show, output_file
from bokeh.io import output_notebook, curdoc
from bokeh.models import HoverTool, Select, Slider
from bokeh.layouts import row, column

# Wrap the summary table so bokeh glyphs can refer to its columns by name
source = ColumnDataSource(master_df)

# Create a figure with the "box_select" tool: p
p = figure(tools='box_select',x_axis_label='indices',y_axis_label='document')

# Plot every chunk: x is its position, y is the document id assigned to it
p.circle('indices','document', source=source, color='green', size=8)

# Write the plot to output.html and show it
output_file('output.html')
show(p)


In [None]:
# Plot every chunk by position (x) and k-means cluster (y), coloured with the
# per-chunk 'palette' column.
source = ColumnDataSource(master_df)
# the y values come from the 'cluster' column, so label the axis accordingly
p = figure(tools='box_select',x_axis_label='indices',y_axis_label='cluster')
p.circle('indices','cluster', source=source, color='palette', size=8)
output_file('output.html')
show(p)


In [None]:
# Create a HoverTool that shows the text of the chunk under the cursor
# (the '@chunk' field refers to the 'chunk' column of the data source)
hover = HoverTool(tooltips=[('chunk', '@chunk')], mode='vline')

# Add hover tool to the most recently created figure p
p.add_tools(hover)

# Show the new output with the hover tool
output_file('output.html')
show(p)

In [None]:
def visualize_clusters(results_clustering, top_words, vocabulary, palette=None):
    '''Return an HTML snippet with one coloured paragraph of words per cluster.

    results_clustering: unused; kept for the existing call signature.
    top_words: mapping from integer cluster number to a list of words.
    vocabulary: unused; kept for the existing call signature.
    palette: sequence of colour strings; defaults to the notebook-level
        `colors`. Clusters beyond the palette length reuse colours from the
        start instead of raising IndexError.

    NOTE: an earlier cell defines a function with the same name and a
    different signature; this definition replaces it once the cell has run.
    '''
    if palette is None:
        palette = colors

    paragraphs = []
    for cluster, words in top_words.items():
        color = palette[cluster % len(palette)]
        paragraphs.append("<p>Cluster " + str(cluster) + ": <font color='" + color + "'>" + " ".join(words) + "</font></p>")

    return "".join(paragraphs)

In [None]:
def ExtractTopicsVSM(documents, numTopics):
    ''' Run the whole topic-modeling pipeline on a list of documents.

        This function takes in a list of documents (strings), runs topic
        modeling (as implemented by Sherin, 2013) and returns a tuple
        (clustering results, matrix used for clustering, top words per
        cluster, HTML visualization).

        documents: list of raw document strings.
        numTopics: number of clusters to look for.
    '''

    # step 2: clean up the documents
    documents = clean_list_of_documents(documents)

    # step 3: let's build the vocabulary of these docs
    vocabulary = get_vocabulary(documents)

    # step 4: we build our list of 100-words overlapping fragments
    chunks = flatten_and_overlap(documents)

    # step 5: we convert the chunks into a matrix
    df = docs_by_words_df(chunks, vocabulary)

    # step 6: we weight the frequency of words (count = 1 + log(count))
    # DataFrame.values has no setter, so build a new frame from the weighted
    # array instead of assigning to df.values
    df = pd.DataFrame(one_plus_log_mat(df), index=df.index, columns=df.columns)

    # step 7: we normalize the matrix (normalize_df returns the frame itself)
    df = normalize_df(df, method='Normalizer')

    # step 8: we compute deviation vectors
    df = transform_deviation_vectors(df)

    # step 9: we apply a clustering algorithm to find topics
    results_clustering = KMeans(n_clusters=numTopics, max_iter=1000).fit(df.values)

    # step 10: we get the 10 top words for each cluster, looked up in the
    # vocabulary built above so the words always match this run's columns
    top_words = collections.defaultdict(list)
    for n, center in enumerate(results_clustering.cluster_centers_):
        for i in center.argsort()[-10:]:
            top_words[n].append(vocabulary[i])

    # step 11: we create a visualization for the topics
    visualization = visualize_clusters(results_clustering, top_words, vocabulary)

    # finally, we return the clustering results, the matrix, the top words
    # and a visualization
    return results_clustering, df, top_words, visualization

In [None]:
# Map every topic id to the (lower-cased) text of its thread.

# The topic id is the file name without its 'topic_' prefix and '.txt'
# extension; deriving it from the base name avoids a hard-coded character
# offset that only fits one exact directory prefix.
keys = []
for thread in threads:
    stem = os.path.splitext(os.path.basename(thread))[0]
    keys.append(stem[len('topic_'):])

# documents were loaded in the same order as threads
values = list(documents)

posts = dict(zip(keys, values))

print(posts)

In [None]:
def contains_word(string, word):
    '''Return True if `word` occurs in `string` as a whole word or phrase.

    A match must not be directly preceded or followed by a letter, digit or
    underscore, so punctuation, line breaks and the ends of the string all
    count as word boundaries (not just spaces). An empty `word` never matches.
    '''
    import re  # local import: the notebook has no file-level import of re

    if not word:
        return False
    pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
    return re.search(pattern, string) is not None

# Ask for a search term and collect every post that contains it.
# The posts were lower-cased when they were loaded, so the query is
# lower-cased (and stripped of surrounding blanks) to be able to match.
query = input('What are you searching for? ').strip().lower()

results = [val for val in posts.values() if contains_word(val, query)]

counter = len(results)
print('counter is ' + str(counter))

# clean the matching posts and build their vocabulary
clean_results = clean_list_of_documents(results)
result_vocabulary = get_vocabulary(clean_results)

print(clean_results, result_vocabulary)
print(len(clean_results))

In [None]:
# !pip3 install gensim
from collections import defaultdict
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
import itertools


# Tokenize every cleaned search result and map the tokens to integer ids
tokenized_docs = [word_tokenize(doc) for doc in clean_results]
dictionary = Dictionary(tokenized_docs)

# id of the search term in the dictionary (None when it is not a token)
query_id = dictionary.token2id.get(query)
print(query_id)

# bag-of-words form of every document: (word id, count) pairs
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

# add up the counts of every word over all documents
total_word_count = defaultdict(int)
for bag in corpus:
    for word_id, word_count in bag:
        total_word_count[word_id] += word_count

# (word id, total count) pairs, most frequent first: sorted_word_count
sorted_word_count = sorted(total_word_count.items(), key=lambda pair: pair[1], reverse=True)

# print the top 5 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)

# tf-idf model fitted on the corpus: tfidf
tfidf = TfidfModel(corpus)

# for every document, print its 5 highest-weighted words
for doc in corpus:
    tfidf_weights = tfidf[doc]
    sorted_tfidf_weights = sorted(tfidf_weights, key=lambda pair: pair[1], reverse=True)
    for term_id, weight in sorted_tfidf_weights[:5]:
        print(dictionary.get(term_id), weight)

In [None]:
from bokeh.layouts import row
from bokeh.plotting import figure, show, output_file

# The five most frequent words of the search results and their counts
top_five = sorted_word_count[:5]
freq_words = [dictionary.get(word_id) for word_id, _ in top_five]
freq_count = [word_count for _, word_count in top_five]

# Horizontal dot plot: one row per word, x is its count
dot = figure(
    title="Most Frequent Words",
    tools="",
    toolbar_location=None,
    y_range=freq_words,
    x_range=[0, max(freq_count) + 10],
)

dot.segment(0, freq_words, freq_count, freq_words, line_width=2, line_color="green")
dot.circle(freq_count, freq_words, size=15, fill_color="orange", line_color="green", line_width=3)

output_file('frequency.html')
show(dot)  # open a browser

In [None]:
# Number of posts that matched the search term.
print(len(results))

In [None]:
import operator

# Score every post: a post that contains the query earns 5 points for the
# most frequent word it contains, 4 for the second, ... down to 1 for the
# fifth. Posts without the query score 0.
weights = (5, 4, 3, 2, 1)

posts_relevancy = {}

for key, val in posts.items():
    relevancy_score = 0
    if contains_word(val, query):
        # zip stops at the shorter sequence, so fewer than five frequent
        # words no longer raise an IndexError
        relevancy_score = sum(
            weight
            for word, weight in zip(freq_words, weights)
            if contains_word(val, word)
        )
    # store the score under its own key instead of pairing two lists by position
    posts_relevancy[key] = relevancy_score

relevancy = list(posts_relevancy.values())

print(sorted(relevancy, reverse=True))

# the five posts with the highest scores
most_relevant_posts = dict(sorted(posts_relevancy.items(), key=operator.itemgetter(1), reverse=True)[:5])

print(most_relevant_posts)

In [None]:
# Print the first 500 characters of each of the most relevant posts.
result_keys = list(most_relevant_posts)

for topic_key in result_keys:
    preview = posts[topic_key][:500]
    print(preview)
    print('---')