In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(stopwords.words('spanish'))
stop_words.extend(stopwords.words('french'))
newStopWords=['area','flat_screen','comfortable','place','large','city','good','beautiful',
              'comfort','modern','great','family','holiday','new','offer','experience','people',
              'perfect','popular','mountain','luxury','small','spacious','ideal',
              'stay','high']
stop_words.extend(newStopWords)
print(len(stop_words))

675


In [3]:
df = pd.read_csv('acc_type_dataset.csv',sep=';',index_col=0)
items = df['basename'].tolist()
df.head()

Unnamed: 0,item_id,at,basename,web,hotel_chain
1,2147326,2,Coral Tree,www.goldfinchhotels.com,0
2,1828547,2,Amsterdam,http://amsterdam.bg/,0
3,5742186,2,Tiradentes,http://hosteltiradentes.wixsite.com/tiradentes,0
4,2021209,2,Garni G Hotel Bratislava,http://www.doprastavservices.sk/g_h_ba/g_h_ba....,0
5,1350676,2,Château De Pourtales,www.chateau-pourtales.eu,0


In [4]:
df = pd.read_csv('Final sample Gettech.csv',sep=';',index_col=0)
df_test = df.groupby(['accommodation_id','basename','at','description','value_type_id'])['amenities_id'].apply(list)
df_test = df_test.to_frame().reset_index()
df_test2 = df.groupby(['accommodation_id','basename','at','description','value_type_id'])['amenities_cont'].apply(list)
df_test2 = df_test2.to_frame().reset_index()
final_df = pd.merge(df_test, df_test2[["accommodation_id", "amenities_cont"]], on="accommodation_id", how="left")
final_df.to_csv('processed_csv_file.csv', sep='\t', encoding='utf-8')
final_df.head()

Unnamed: 0,accommodation_id,basename,at,description,value_type_id,amenities_id,amenities_cont
0,5088,Hotel Sonne,2,"Set in a prime location of Waldbronn, Hotel So...",1,[47],[1]
1,5092,Steuermann,2,Ideally located in the prime touristic area of...,1,"[63, 47]","[1, 1]"
2,5323,Hotel Montree,2,Hotel Montree is conveniently located in the p...,1,"[63, 48]","[1, 1]"
3,5434,Weidenhof,2,The Weidenhof is a well-situated Hotel in Rege...,1,[48],[1]
4,5668,East-Side,3,This guest house is located in Berlin’s trendy...,1,[255],[1]


In [5]:
item_names = final_df['basename'].tolist()
item_descriptions = final_df['description'].tolist()
item_at = final_df['at'].tolist()

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(item_descriptions))
print(data_words)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)






In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [8]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [9]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
nlp = spacy.load('en', disable=['parser', 'ner'])


data_words_bigrams = make_bigrams(data_words_nostops)
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ'])

print(data_lemmatized[:1])

[['prime_location', 'waldbronn', 'hotel', 'sonne', 'puts_everything']]


In [10]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])
#print(id2word[1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word,num_topics=2, random_state=100,update_every=1,chunksize=100,passes=10,alpha='auto',per_word_topics=True)

In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis