## Doc2Vec Clustering

In [None]:
# Persist the reviews before the free-text columns are normalised below.
all_reviews.to_csv('../data/all_reviews.csv')

# Raw string: the previous non-raw literal contained the invalid escape `\.`,
# which raises a DeprecationWarning/SyntaxWarning on current Python versions.
# The pattern is compiled once and shared by all three columns.
_NON_TEXT_CHARS = re.compile(r'[^a-zA-Z0-9 \n\.]')

# Lower-case each free-text column and drop every character that is not an
# ASCII letter, digit, space, newline or period.
for _column in ('title', 'pros', 'cons'):
    all_reviews[_column] = all_reviews[_column].map(
        lambda value: _NON_TEXT_CHARS.sub('', value.lower())
    )

In [None]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# NOTE(review): `stemmed_cons` and `join_word_list` come from outside this
# section, and `stemmed_cons_list` is not read anywhere in the cells shown here.
stemmed_cons_list = stemmed_cons.apply(join_word_list)
X = all_reviews['pros'].values.tolist()
X[:3]

# Doc2Vec expects `words` to be a list of tokens. Passing the raw string (as the
# previous version did) makes gensim iterate it character by character, so the
# model learned single-letter "words". The tokenisation below mirrors what
# `preprocess_document` does at inference time: every character that is neither
# alphanumeric nor whitespace becomes a space, then the text is split.
documents = [
    TaggedDocument(
        words=''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in doc).split(),
        tags=[i],
    )
    for i, doc in enumerate(X)
]
documents[:4]

In [None]:
# Train a DBOW Doc2Vec model (dm=0) that also trains word vectors (dbow_words=1).
#
# The previous version called train() ten times with epochs=1, lowering
# model.alpha by 0.002 by hand each time and passing a hard-coded
# total_examples=1000 regardless of the real corpus size. gensim manages the
# learning-rate decay itself, so a single train() call is used instead:
#   * epochs=10 replaces the outer loop;
#   * alpha=0.025 -> min_alpha=0.007 spans the same range the manual schedule
#     covered (0.025 - 9 * 0.002 = 0.007);
#   * total_examples comes from the vocabulary scan, so progress/decay are
#     computed against the actual number of documents.
model = Doc2Vec(
    vector_size=200,
    dbow_words=1,
    dm=0,
    epochs=10,
    window=5,
    seed=1337,
    min_count=30,
    workers=4,
    alpha=0.025,
    min_alpha=0.007,
)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

# Save once, after training has finished (previously rewritten every epoch).
model.save('cyber-trend-index-dataset.model')

In [None]:
def preprocess(str):
    """Remove http/https hyperlinks from a piece of text.

    Each link is removed together with at most one whitespace character that
    directly follows it. The previous pattern required a trailing space, so a
    link at the very end of the text (or one followed by a newline) was left
    in place.

    Args:
        str: The text to clean. The name shadows the builtin but is kept so
            existing keyword callers keep working.

    Returns:
        The text with all hyperlinks removed.
    """
    # remove hyper links
    return re.sub(r'https?://\S*\s?', "", str)

# Clean a string and split it into a list of whitespace-separated tokens.
def preprocess_document(text):
    """Return the tokens of `text` after link removal and punctuation stripping.

    The text is first passed through `preprocess`; every remaining character
    that is neither alphanumeric nor whitespace is replaced by a single space
    before the result is split on whitespace.
    """
    without_links = preprocess(text)
    kept_chars = []
    for ch in without_links:
        if ch.isalnum() or ch.isspace():
            kept_chars.append(ch)
        else:
            kept_chars.append(" ")
    return ''.join(kept_chars).split()

# Quick manual check of the tokenizer on a sample string containing short
# words, digits and a lone punctuation character.
preprocess_document('hello a asoi athis 920 8 -')

In [None]:
from nltk.cluster import KMeansClusterer
from gensim import models

# Apply `tokenize_sentences` (defined outside this section) to every entry of X
# and preview the first three results.
dataset = list(map(tokenize_sentences, X))
dataset[:3]

# import gensim.downloader as api
# from gensim.models import TfidfModel
# from gensim.corpora import Dictionary

# dct = Dictionary(dataset)  # fit dictionary
# corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format

# tfidf_model = TfidfModel(corpus)  # fit model
# vectors = tfidf_model[corpus]  # apply model to the first corpus document

# import numpy as np
# np.shape(vectors[:1])

In [None]:
# Infer one Doc2Vec vector per entry of X, tokenised with preprocess_document.
vectors = [model.infer_vector(preprocess_document(doc)) for doc in X]

# Alternative kept for reference (10 means, 25 repeats):
# kclusterer = KMeansClusterer(num_means =10, distance=nltk.cluster.util.cosine_distance, repeats=25)
# Cluster the vectors into 15 groups using cosine distance.
kclusterer = KMeansClusterer(
    num_means=15,
    distance=nltk.cluster.util.cosine_distance,
)
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

In [None]:
from collections import Counter

def get_titles_by_cluster(id):
    """Return the entries of the global `X` whose assigned cluster equals `id`.

    Position `n` of `assigned_clusters` is taken to describe `X[n]`.
    """
    return [
        X[position]
        for position, cluster_label in enumerate(assigned_clusters)
        if cluster_label == id
    ]

def get_topics(titles):
    """Print the 12 most frequent non-stop-word tokens across `titles`.

    Args:
        titles: Iterable of strings; each one is tokenised with
            `preprocess_document`.

    Returns:
        None. The (token, count) pairs are printed as a list.
    """
    # Build the exclusion set once. The previous version re-created the
    # stop-word list and scanned it linearly for every single token.
    excluded = set(stopwords.words('english')) | {'work', 'company'}

    count = Counter(
        word
        for title in titles
        for word in preprocess_document(title)
        if word not in excluded
    )
    print(count.most_common(12))


def cluster_to_topics(id):
    """Print the most frequent tokens of the documents assigned to cluster `id`."""
    cluster_documents = get_titles_by_cluster(id)
    get_topics(cluster_documents)

# cluster_to_topics prints its own report and returns None, so wrapping the
# call in print() only added a stray "None" line after every cluster.
for i in range(0, 15):
    cluster_to_topics(i)

## Clustering using TF-IDF vectors

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# DataFrame.info() writes its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line.
all_reviews.info()
# Alias, not a copy: column assignments made through `data` also change `all_reviews`.
data = all_reviews

# TF-IDF over the `cons` column: terms must appear in at least 5 documents and
# in at most 95% of them; the vocabulary is capped at 8000 features.
tfidf = TfidfVectorizer(
    min_df = 5,
    max_df = 0.95,
    max_features = 8000,
#     stop_words = (['work', 'company', 'great','good', 'benefits'] + list(stopwords.words('english')))
    stop_words = (['work', 'company'] + list(stopwords.words('english')))
#     stop_words = 'english'
)
# Learn the vocabulary and vectorise in one pass instead of fit() + transform().
text = tfidf.fit_transform(data.cons)

In [None]:
def find_optimal_clusters(data, max_k):
    """Plot MiniBatchKMeans inertia (SSE) for k = 2, 4, ..., up to `max_k`.

    One model is fitted on `data` per candidate k, a progress line is printed
    after each fit, and the inertias are drawn as a line chart. Nothing is
    returned.
    """
    candidate_ks = range(2, max_k + 1, 2)

    inertias = []
    for n_clusters in candidate_ks:
        kmeans = MiniBatchKMeans(
            n_clusters=n_clusters,
            init_size=1024,
            batch_size=2048,
            random_state=20,
        )
        kmeans.fit(data)
        inertias.append(kmeans.inertia_)
        print('Fit {} clusters'.format(n_clusters))

    _, axis = plt.subplots(1, 1)
    axis.plot(candidate_ks, inertias, marker='o')
    axis.set_xlabel('Cluster Centers')
    axis.set_xticks(candidate_ks)
    axis.set_xticklabels(candidate_ks)
    axis.set_ylabel('SSE')
    axis.set_title('SSE by Cluster Center Plot')

In [None]:
# Plot the SSE curve for even cluster counts from 2 to 30.
find_optimal_clusters(text, 30)

# Final clustering of the TF-IDF matrix into 20 clusters; `clusters` holds the
# label predicted for each row of `text`.
# NOTE(review): 20 is presumably read off the SSE plot above — confirm.
clusters = MiniBatchKMeans(n_clusters=20, init_size=1024, batch_size=2048, random_state=20).fit_predict(text)

In [None]:
def plot_tsne_pca(data, labels, sample_size=3000, plot_size=1000):
    """Draw side-by-side PCA and t-SNE scatter plots of a random row sample.

    Args:
        data: Sparse matrix (anything supporting row indexing and `.toarray()`),
            one row per document.
        labels: Cluster label per row of `data`.
        sample_size: Maximum number of rows projected (default 3000).
        plot_size: Maximum number of projected rows actually drawn (default 1000).

    Returns:
        None. A matplotlib figure with two axes is created.
    """
    labels = np.asarray(labels)
    n_rows = data.shape[0]

    # Cap the sample sizes: choice(..., replace=False) raises when asked for
    # more items than exist, which made the fixed 3000/1000 crash on small data.
    sample_size = min(sample_size, n_rows)
    plot_size = min(plot_size, sample_size)
    sampled_rows = np.random.choice(n_rows, size=sample_size, replace=False)

    # toarray() returns a plain ndarray; todense() returns np.matrix, which
    # recent scikit-learn estimators refuse. Convert once and reuse.
    dense = data[sampled_rows, :].toarray()

    pca = PCA(n_components=2).fit_transform(dense)
    # Reduce to (at most) 50 dimensions before t-SNE; PCA cannot produce more
    # components than min(n_samples, n_features).
    n_reduced = min(50, dense.shape[0], dense.shape[1])
    tsne = TSNE().fit_transform(PCA(n_components=n_reduced).fit_transform(dense))

    idx = np.random.choice(sample_size, size=plot_size, replace=False)
    # Guard against division by zero when the largest label is 0.
    max_label = max(labels.max(), 1)
    label_subset = [cm.hsv(i / max_label) for i in labels[sampled_rows][idx]]

    f, ax = plt.subplots(1, 2, figsize=(14, 6))

    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
    ax[0].set_title('PCA Cluster Plot')

    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
    ax[1].set_title('TSNE Cluster Plot')

In [None]:
# Visualise the TF-IDF matrix `text`, coloured by the k-means labels in `clusters`.
plot_tsne_pca(text, clusters)

In [None]:
def get_top_keywords(data, clusters, labels, n_terms):
    """Print, for every cluster, the `n_terms` features with the highest mean value.

    `data` is densified, rows are grouped by `clusters` and averaged; for each
    group the feature names (looked up in `labels`) of the largest means are
    printed comma-separated, in ascending order of mean value.
    """
    cluster_means = pd.DataFrame(data.todense()).groupby(clusters).mean()

    for cluster_id, mean_row in cluster_means.iterrows():
        print('\nCluster {}'.format(cluster_id))
        top_positions = np.argsort(mean_row)[-n_terms:]
        top_terms = [labels[position] for position in top_positions]
        print(','.join(top_terms))
            

### Look at Pros

In [None]:
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old name otherwise.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
get_top_keywords(text, clusters, feature_names, 12)

# Number of documents per cluster.
np.unique(clusters, return_counts=True)

In [None]:
### Look at Cons

# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old name otherwise.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
get_top_keywords(text, clusters, feature_names, 10)
# Number of documents per cluster.
np.unique(clusters, return_counts=True)

In [None]:
### Calculate highest/lowest % companies per cluster

# `clusters` holds one cluster label per review, in the same order as the rows of the dataframe
print(len(clusters))
clusters[:20] # preview (not echoed: a cell only displays its last expression)
# Adds the labels as a new column. `data` was bound to `all_reviews` without
# copying, so this column also appears on `all_reviews`.
data['cluster'] = clusters # make a new column in our OG dataframe

data.head()

In [None]:
# Share of each company's reviews that falls into each cluster, best first.
def show_company_counts():
    """Return per-(cluster, company) review counts normalised by company totals.

    Reads the globals `data` and `clusters` and (re)writes `data['cluster']`.
    The returned frame is indexed by (cluster, company) with columns:
      * title_x    - reviews of the company inside the cluster
      * title_y    - all reviews of the company
      * normalized - title_x / title_y
    Rows are ordered by cluster ascending, then normalized descending.
    """
    # make sure the cluster labels are attached to the frame
    data['cluster'] = clusters

    # review counts per (cluster, company) and per company, as flat frames
    per_cluster_company = data.groupby(['cluster','company']).title.count().reset_index()
    per_company = data.groupby('company').title.count().reset_index()

    # join both counts on company; the two `title` columns get _x/_y suffixes
    combined = per_cluster_company.merge(per_company, on=['company'], how='inner')
    combined = combined.set_index(['cluster','company'])

    # fraction of the company's reviews that landed in this cluster
    combined['normalized'] = combined['title_x'] / combined['title_y']

    ordered = combined.sort_values(by=['cluster','normalized'], ascending = [True,False])
    return ordered

# Last expression of the cell, so the returned table is displayed.
show_company_counts()

In [None]:
def show_top_companies(cluster_array):
    # NOTE(review): appears unfinished — it builds an empty DataFrame with one
    # column per distinct value of `cluster_array`, then falls off the end, so
    # the frame is discarded and the function returns None.
    df = pd.DataFrame(columns = np.unique(cluster_array))

## NMF Modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

# Word-level count vectoriser with the vocabulary capped at 10,000 features.
vectorizer = CountVectorizer(analyzer = "word", max_features = 10_000)

# pros text -> tokens -> tokens without stop words
# (`tokenize_sentences` and `remove_stopwords` are defined outside this section).
pros_remove = all_reviews.pros.tolist()
pros_remove_tokenized = list(map(tokenize_sentences, pros_remove))
pros_remove_stopwords = list(map(remove_stopwords, pros_remove_tokenized))

# Re-join the surviving tokens into one space-separated string per review.
pros_joined = list(map(' '.join, pros_remove_stopwords))
word_counts = vectorizer.fit_transform(pros_joined)

# Re-weight the raw counts with TF-IDF (IDF smoothing disabled).
tfidf_transform = TfidfTransformer(smooth_idf = False)
words_tfidf = tfidf_transform.fit_transform(word_counts)

In [None]:
# final_words = normalize(words_tfidf, norm = 'l1')

# NMF with 15 components and 'nndsvd' initialisation.
# NOTE(review): this rebinds `model`, which the Doc2Vec cells earlier in the
# notebook use for the Doc2Vec model; re-running those cells after this one
# would pick up the NMF object instead.
model = NMF(n_components = 15, init = 'nndsvd')

# W is the output of fit_transform on the TF-IDF matrix.
W = model.fit_transform(words_tfidf)

# Not echoed: a cell only displays its last expression.
W.shape

# H is the fitted model's components_ attribute.
H = model.components_

# Not echoed either, for the same reason.
H.shape

# Start from an empty module-level `lst`.
lst = []

def print_top_words(model, feature_names, n_top_words):
    """Build one "Topic #k: w1 w2 ..." string per component of a fitted model.

    Despite its name the function prints nothing; it returns the strings.

    Args:
        model: Object exposing `components_`, an iterable of 1-D weight arrays.
        feature_names: Sequence mapping a feature index to its term.
        n_top_words: Number of highest-weighted terms to list per topic.

    Returns:
        A new list of strings, one per topic, terms ordered by descending
        weight. A fresh list is created on every call; the previous version
        appended to a module-level list, so calling it twice returned
        duplicated topics.
    """
    messages = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice takes the n largest, biggest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in top_indices])
        messages.append(message)
    return messages

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old name otherwise.
nmf_feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
topics_nmf = print_top_words(model, nmf_feature_names, 10)

# Split every "Topic #k: w1 w2 ..." string on whitespace: one row per topic.
nmfsplit = [item.split() for item in topics_nmf]

pd.DataFrame(nmfsplit)

In [None]:
### Look at Cons

# Word-level count vectoriser with the vocabulary capped at 10,000 features.
vectorizer = CountVectorizer(analyzer = "word", max_features = 10_000)

# cons text -> tokens -> tokens without stop words
# (`tokenize_sentences` and `remove_stopwords` are defined outside this section).
cons_remove = all_reviews.cons.tolist()
cons_remove_tokenized = list(map(tokenize_sentences, cons_remove))
cons_remove_stopwords = list(map(remove_stopwords, cons_remove_tokenized))

# Re-join the surviving tokens into one space-separated string per review.
cons_joined = list(map(' '.join, cons_remove_stopwords))
word_counts = vectorizer.fit_transform(cons_joined)

# Re-weight the raw counts with TF-IDF (IDF smoothing disabled).
tfidf_transform = TfidfTransformer(smooth_idf = False)
words_tfidf = tfidf_transform.fit_transform(word_counts)

In [None]:
# NMF with 15 components and 'nndsvd' initialisation, fitted on the current
# `words_tfidf` matrix. Rebinds `model`, `W` and `H`.
model = NMF(n_components = 15, init = 'nndsvd')

# W is the output of fit_transform on the TF-IDF matrix.
W = model.fit_transform(words_tfidf)

# Not echoed: a cell only displays its last expression.
W.shape

# H is the fitted model's components_ attribute.
H = model.components_

# Not echoed either, for the same reason.
H.shape

# Start from an empty module-level `lst`.
lst = []

def print_top_words(model, feature_names, n_top_words):
    """Build one "Topic #k: w1 w2 ..." string per component of a fitted model.

    Despite its name the function prints nothing; it returns the strings.

    Args:
        model: Object exposing `components_`, an iterable of 1-D weight arrays.
        feature_names: Sequence mapping a feature index to its term.
        n_top_words: Number of highest-weighted terms to list per topic.

    Returns:
        A new list of strings, one per topic, terms ordered by descending
        weight. A fresh list is created on every call; the previous version
        appended to a module-level list, so calling it twice returned
        duplicated topics.
    """
    messages = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice takes the n largest, biggest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in top_indices])
        messages.append(message)
    return messages

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old name otherwise.
nmf_feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
topics_nmf = print_top_words(model, nmf_feature_names, 10)

# Split every "Topic #k: w1 w2 ..." string on whitespace: one row per topic.
nmfsplit = [item.split() for item in topics_nmf]

# NOTE(review): this table is built but not echoed, because only the final
# expression of the cell (the groupby below) is displayed.
pd.DataFrame(nmfsplit)

# Non-null value counts per column for every company.
all_reviews.groupby('company').count()