# Slack Data Topics

http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/

## Loading Google Word2Vec Model

In [14]:
import gensim

In [2]:
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin', binary=True)  

In [15]:
print(model)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x10f6c2e10>


## Data Creation and Preprocessing

In [4]:
import json
import os
import re

import numpy as np
import pandas as pd

In [60]:
json_li = []

# directories = ["general", "dev", "ai", "marketresearch", "random", "sales-gamification"]
directory = ["general"]
for directory in directory:
	for filename in os.listdir(directory):
		if filename.endswith(".json"):
			fp = open(os.path.join(directory, filename))
			json_li.append(json.load(fp))

In [61]:
data_fp = open('general_text.txt', 'w+')

In [62]:
for json_obj in json_li:
	for message in json_obj:
		message['text'] = re.sub(r'<(.)*>', '', message['text'], flags=re.MULTILINE)
		data_fp.write(message['text'])
		data_fp.write("\n")
data_fp.close()

## Preprocessing

In [8]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk

In [9]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/JoeSkimmons/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/JoeSkimmons/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
stemmer = SnowballStemmer("english")


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

In [63]:
from nltk import word_tokenize

processed_msgs = []

# for l in open('general_text.txt', 'r'):
#     l = l.strip()
#     p = preprocess(l)
#     if p:
#         processed_msgs.append(p)

test_sentence = "This is a test sentence."

for l in open('general_text.txt', 'r'):
    l = l.strip()
    l = l.lower()
    p = word_tokenize(l)
    p = [word for word in p if len(word) > 2]
    if p:
        processed_msgs.append(p)

## Experimentation

In [27]:
from nltk.cluster import KMeansClusterer

In [64]:
vec_list = []

for li in processed_msgs:
    for word in li:
        try:
            vec_list.append(model[word])
        except KeyError as e:
            continue

print(len(vec_list))

11039


In [66]:
# X = model[model.wv.vocab]
X = vec_list

NUM_CLUSTERS=5
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=1)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

In [67]:
from collections import defaultdict

In [68]:
# words = list(model.wv.vocab)
cluster_dict = defaultdict(lambda: [])
for i, word in enumerate(words):
    cluster_dict[assigned_clusters[i]].append(word)

  """Entry point for launching an IPython kernel.


IndexError: list index out of range

In [None]:
for key, word_li in cluster_dict.items():
    print(key)
    print(word_li[:10])

In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot

In [None]:
# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)

for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()