In [8]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim import corpora, models

In [2]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [3]:
n_features = 1000
n_topics = 100
n_top_words = 20

In [4]:
def review_to_words( raw_review, stops ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    #review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split() 
    
    #stem words
    lemmatizer=WordNetLemmatizer()
    stemmedWords=[lemmatizer.lemmatize(word) for word in words]
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    #stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in stemmedWords if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))

In [5]:
stops = set(stopwords.words("english"))

In [6]:
f=open("Data/comments.txt",'r')
train = f.readlines()
f.close()

In [9]:
clean_review = review_to_words( train[0],stops )
print clean_review

forced induction either standard option maybe nismo trim level


In [11]:
# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in train:
    # Call our function for each one, and add the result to the list of
    # clean reviews
    if len(i)>3:
        #print(i)
        clean_train_reviews.append( review_to_words( i,stops ) )

In [12]:
clean_train_reviews[1]

u'exterior look overall build quality vehicle handling breaking engine output'

In [13]:
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word", max_features = n_features) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an 
# array
#train_data_features = train_data_features.toarray()

In [15]:
#fit LDA model to data
lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', learning_offset=50., random_state=0)

lda.fit(train_data_features)

print("\nTopics in LDA model:")
tf_feature_names = vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
seat front passenger doe different low heated head rest adjustment belt move setting adjust arm height forward middle sit reach
Topic #1:
zero flat front friendly friend free fourth four found forward ford foot fold fog floor fixed favorite fix fit first
Topic #2:
zero flat front friendly friend free fourth four found forward ford foot fold fog floor fixed favorite fix fit first
Topic #3:
door people awesome etc car someone quite lock parking hold away garage ask park scratch sharp unlock noticed opener often
Topic #4:
altima reliable far sentra car infiniti driven rd toyota impressed traded affordable question economical son armada currently titan owned nissan
Topic #5:
rogue value ever drove absolutely test gotten smoothness totally rough bmw changing mercedes car ride love look competitor lane drive
Topic #6:
zero flat front friendly friend free fourth four found forward ford foot fold fog floor fixed favorite fix fit first
Topic #7:
reliability muran

In [None]:
print(transformed_data[0])

In [None]:
%pylab inline

In [None]:
import matplotlib.pyplot as plt
plt.scatter(transformed_data[:,0],transformed_data[:,1])

In [None]:
print(transformed_data[:10,0])

In [None]:
type(transformed_data)