In [8]:
import pandas as pd
import numpy as np

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS, stem
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from pprint import pprint

# Data set from https://www.kaggle.com/therohk/million-headlines/data
data = pd.read_csv('London_hotel_reviews.csv', encoding='latin-1')
data['Review Text']= data['Review Text'].map(str)
data_text = data[['Review Text']]
data_text['index'] = data_text.index
documents = data_text
# print(len(documents))
# print(documents[:5])

np.random.seed(2018)


# Lemmatizing
def lemmatize_stemming(text):
    return stem(WordNetLemmatizer().lemmatize(text, pos='v'))


# Stemming
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result


# Testing preprocessing
# doc_sample = documents[documents['index'] == 4310].values[0][0]
# print('original document: ')
# words = []
# for word in doc_sample.split(' '):
#     words.append(word)
# print(words)
# print('\n\n tokenized and lemmatized document: ')
# print(preprocess(doc_sample))

# Preprocessing data
processed_docs = documents['Review Text'].map(preprocess)
# print(processed_docs[:10])

# Bag of Words
dictionary = gensim.corpora.Dictionary(processed_docs)
# count = 0
# for k, v in dictionary.iteritems():
#     print(k, v)
#     count += 1
#     if count > 10:
#         break

# Fliter tokens that appear in more that 15 documents, not more than 50% of documents, and keep first 100000 tokens
# of them
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Dictionary for each document
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# print(bow_corpus[4310])

# Applying TF-IDF
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
# for doc in corpus_tfidf:
#     pprint(doc)
#     break

# Train corpus with LDA
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary,
                                       passes=2, workers=2)
# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

# LDA with TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary,
                                             passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

# Performance evaluation of LDA with Bag of Words
# for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
#     print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

# Performance Evaluation of LDA with TF-IDF
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

# Testing
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Topic: 0 Word: 0.005*"savoi" + 0.004*"servic" + 0.004*"check" + 0.004*"time" + 0.004*"arriv" + 0.004*"great" + 0.004*"love" + 0.004*"book" + 0.004*"night" + 0.003*"go"
Topic: 1 Word: 0.016*"trè" + 0.012*"chambr" + 0.012*"petit" + 0.011*"hôtel" + 0.010*"pour" + 0.010*"nou" + 0.009*"avec" + 0.008*"dan" + 0.007*"bien" + 0.007*"mai"
Topic: 2 Word: 0.007*"good" + 0.007*"walk" + 0.007*"locat" + 0.007*"clean" + 0.007*"breakfast" + 0.007*"nice" + 0.007*"great" + 0.006*"london" + 0.006*"tube" + 0.005*"station"
Topic: 3 Word: 0.016*"sehr" + 0.011*"zimmer" + 0.009*"nicht" + 0.008*"aber" + 0.007*"auch" + 0.006*"ein" + 0.006*"sind" + 0.005*"freundlich" + 0.005*"lage" + 0.004*"frühstück"
Topic: 4 Word: 0.010*"para" + 0.007*"muito" + 0.006*"quarto" + 0.006*"todo" + 0.006*"excelent" + 0.005*"londr" + 0.005*"pero" + 0.005*"café" + 0.004*"desayuno" + 0.004*"manhã"
Topic: 5 Word: 0.006*"servic" + 0.005*"london" + 0.005*"great" + 0.004*"excel" + 0.004*"good" + 0.004*"time" + 0.004*"amaz" + 0.004*"love" + 