In [18]:
from time import time
import pickle
from nltk import tokenize
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Topic-model settings shared by the cells below.
n_components = 7    # number of topics requested from LatentDirichletAllocation
n_features = 1000   # vocabulary cap handed to CountVectorizer(max_features=...)
n_top_words = 30    # terms printed per topic by print_top_words
n_samples = 2000    # not referenced by the visible cells (they report len(reviews))


def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    - model: fitted object exposing `components_` (one weight row per topic)
    - feature_names: sequence mapping a column index to its term
    - n_top_words: how many terms to show for each topic
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the largest weights first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = " ".join(feature_names[term_id] for term_id in ranked)
        print("Topic #%d: " % topic_idx + top_terms)

In [3]:
def split_to_sentences(paragraph):
    """Split a paragraph into sentences with NLTK's sentence tokenizer.

    - paragraph: text to split
    Returns :
    - whatever tokenize.sent_tokenize yields for the paragraph
    """
    sentences = tokenize.sent_tokenize(paragraph)
    return sentences

def clean_data(text):
    """Normalise a piece of text for comparison.

    Lower-cases the text, trims surrounding whitespace, then removes any
    leading/trailing periods. Whitespace exposed by removing the periods is
    left as is.
    - text: string to normalise
    Returns :
    - the normalised string
    """
    # TODO: remove double spaces
    # TODO: trim input?
    return text.lower().strip().strip(".")

def extract_text_blocks(review):
    """
    Split a review into its individual sentences.

    No text cleansing is applied here; the sentences are returned exactly as
    split_to_sentences produces them.
    - review: Review in the form of a string
    Returns :
    - sentences: the sentences of the review, or an empty list when the
      review is not a string (e.g. None or a NaN from a missing cell)
    """
    # Missing values in a pandas column arrive as None/NaN rather than str;
    # skip them instead of letting the tokenizer fail on a non-string.
    if not isinstance(review, str):
        return []
    return split_to_sentences(review)

In [4]:
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# this file when it comes from a trusted source.
# The context manager closes the file handle once the data has been read.
with open("gdr_assignment_pros_cons.pkl", "rb") as pickle_file:
    data_pros_cons = pickle.load(pickle_file)

# Presumably a DataFrame with "pros" in column 0 and "cons" in column 1 (going
# by the variable names) - verify against the pickle's producer.
# NOTE(review): iloc[:-1] drops the final row - confirm that is intentional.
pro_reviews = data_pros_cons.iloc[:-1, 0].tolist()
cons_reviews = data_pros_cons.iloc[:-1, 1].tolist()

# One list of sentences per review; only the "pros" text is used at the moment.
sentence_samples = []
for pro, cons in zip(pro_reviews, cons_reviews):
    sentence_samples.append(extract_text_blocks(pro))
    # Uncomment to also include the "cons" sentences in the corpus.
    #sentence_samples.append(extract_text_blocks(cons))

# Flatten to a single list of sentences: each sentence is one document for LDA.
reviews = [val for sublist in sentence_samples for val in sublist]

# LDA

In [5]:
# Build tf (raw term count) features for LDA from the flattened sentences.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
# Only the fit/transform step is timed, not the vectorizer construction.
t0 = time()
tf = tf_vectorizer.fit_transform(reviews)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)
print()

Extracting tf features for LDA...
done in 0.133s.



In [6]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (len(reviews), n_features))
# NOTE(review): per the scikit-learn docs, learning_offset only affects the
# 'online' learning method, so it should be inert with learning_method='batch'.
lda = LatentDirichletAllocation(n_components=n_components, max_iter=500,
                                learning_method='batch',
                                learning_offset=50.,
                                random_state=0,
                                doc_topic_prior=None,
                                topic_word_prior=None)
# The timing below covers both fitting the model and transforming the corpus.
t0 = time()
lda.fit(tf)

# Topic weights for every sentence: row i of X corresponds to reviews[i].
X = lda.transform(tf)

print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new API and fall back on old versions.
# list(...) keeps tf_feature_names a plain list under either API.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = list(tf_vectorizer.get_feature_names())
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=10708 and n_features=1000...
done in 401.874s.

Topics in LDA model:
Topic #0: work great people environment culture working fun place company smart friendly team atmosphere love amazing nice really hard coworkers awesome interesting workers best talented lots management supportive positive animals events
Topic #1: time lot work opportunity day best learn lots ve people days paid lunch able meet companies make year job long ll available friends use plenty different week vacation leave going
Topic #2: work free life food balance hours flexible great home employee schedule worked snacks perks gym high training quality manager tech store years discount discounts drinks shift office coffee area site
Topic #3: great office benefits product products company amazing perks colleagues nice technology leadership industry health culture apple games strong insurance vision super cool mission new employees stock support offices talent location
Topic #4

In [28]:
# Show the highest-weighted topic (and its weight) for the first few sentences.
# Bounded by len(reviews) so a small corpus does not raise IndexError.
n_preview = min(10, len(reviews))
for i in range(n_preview):
    topic_weights = X[i]
    print("Review: %s, Topic: %d, Confidence: %f"
          % (reviews[i], np.argmax(topic_weights), np.max(topic_weights)))

Review: Company and culture are great!, Topic: 0, Confidence: 0.785394
Review: Company culture, autonomy, global strategy, teamwork, general awesomeness., Topic: 4, Confidence: 0.290784
Review: Not micromanaged., Topic: 0, Confidence: 0.142857
Review: Co workers were good., Topic: 5, Confidence: 0.380955
Review: Pay was good., Topic: 5, Confidence: 0.714286
Review: Travel was well managed by company., Topic: 6, Confidence: 0.535534
Review: Some of their products work for the customer, Topic: 3, Confidence: 0.285715
Review: Great work environment, fast paced, hard working and perfect for learning and developing your skills., Topic: 0, Confidence: 0.489182
Review: I love working with dogs and people., Topic: 0, Confidence: 0.613528
Review: I enjoy interacting with people and their pets., Topic: 0, Confidence: 0.785427
