## Text Clustering (Exploration 5)

### Goals of this exploration
* Text clustering using NLTK and SciKit-Learn

In [87]:
# This is to prepare environment and import libraries
%matplotlib inline
import numpy as np
import pandas as pd
from pylab import *
import matplotlib
import matplotlib.pyplot as plt
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer
import random
import pickle
from wordcloud import WordCloud, STOPWORDS
from matplotlib.dates import date2num

In [88]:
import string
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

In [94]:
# Load saved tagged reviews, prepared in "Exploration 3" if available
try:
    tagged_review_file = open('taggedDigitSoftwareReviews', "rb")
    reviews = pickle.load(tagged_review_file)
    tagged_review_file.close()
except:
    reviews=pd.read_csv('data/raw/amazon_reviews_us_Digital_Software_v1_00.tsv',delimiter='\t', encoding='latin-1')

In [101]:
articles=[]
for r in reviews.review_body[:100]:  # Prototyping with small sample size of 100
    articles.append(r)

In [102]:
# Source: https://nlpforhackers.io/recipe-text-clustering/
#
# Since it's using km_model.fit instead of km_model.fit_predict , the context is for supervised learning 
# Can be applied to case such as https://www.youtube.com/watch?v=RZYjsw6P4nI
#
def process_text(text, stem=True):
    """ Tokenize text and stem words removing punctuation """
    text = text.translate(string.punctuation)
    tokens = word_tokenize(text)
 
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
 
    return tokens
 
    
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.5,
                                 min_df=0.1,
                                 lowercase=True)
 
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
    #km_model.fit_predict(tfidf_model)
    
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering
 
 


In [103]:
clusters = cluster_texts(articles, 7)

In [99]:
print(clusters)

defaultdict(<class 'list'>, {6: [0, 32, 41, 43, 56, 74, 82, 83], 3: [1, 6, 22, 29, 44, 45, 52, 76, 77, 97, 98], 2: [2, 8, 11, 16, 20, 25, 26, 33, 35, 38, 40, 46, 47, 49, 53, 54, 57, 60, 62, 64, 67, 72, 78, 79, 80, 84, 86, 87, 88, 92, 94, 96, 99], 5: [3, 5, 10, 15, 17, 21, 50, 55, 63, 89], 0: [4, 7, 12, 18, 27, 30, 31, 34, 36, 37, 39, 42, 48, 58, 65, 68, 70, 75, 85, 91, 93], 4: [9, 14, 19, 24, 28, 51, 61, 71, 73, 90], 1: [13, 23, 59, 66, 69, 81, 95]})
