In [1]:
# Library imports
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy import displacy

import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this loads a Spanish spaCy pipeline, while the stop-word list
# below is English and the reviews loaded in this notebook read as English
# text - confirm which language model is intended.
nlp = spacy.load("es_core_news_lg")
# English stop-word list from NLTK's stopwords corpus; it is later passed to
# CountVectorizer as `stop_words`.
stopwords = nltk.corpus.stopwords.words('english')

In [2]:
# Location of the TripAdvisor reviews CSV. Set the TRIPADVISOR_CSV environment
# variable to run the notebook on another machine; the original local path is
# kept as the default so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "TRIPADVISOR_CSV",
    r"/Users/juan/Documents/Juan's MacBook Pro/CUNEF/Quinto/Informacion no Estructurada/Ejercicios Clase/topic_modelling/trip_advisor_en/tripadvisor_hotel_reviews.csv",
)
datos = pd.read_csv(DATA_PATH)
datos.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [3]:
# First 2000 review texts, regardless of rating.
sentences = datos['Review'].iloc[:2000]

### Análisis de Sentimientos

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

In [5]:
def anal_sent(sentences):
    """Return the mean sentiment scores over an iterable of texts.

    Each text is scored with ``SentimentIntensityAnalyzer.polarity_scores``;
    the per-text score dicts are stacked into a DataFrame (one row per text,
    one column per score key) and averaged column-wise.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score.

    Returns
    -------
    pandas.Series
        Mean of every score column, indexed by score name.
    """
    scorer = SentimentIntensityAnalyzer()
    scores = pd.DataFrame([scorer.polarity_scores(text) for text in sentences])
    return scores.mean()

In [6]:
# Keep only the 4-star reviews, then take the first 2000 review texts.
datos4 = datos.loc[datos['Rating'].eq(4)]
sentences4 = datos4['Review'].iloc[:2000]

In [8]:
# Keep only the 5-star reviews, then take the first 2000 review texts.
datos5 = datos.loc[datos['Rating'].eq(5)]
sentences5 = datos5['Review'].iloc[:2000]

In [11]:
# Mean sentiment scores across the sampled 4-star reviews.
anal_sent(sentences4)

neg         0.061382
neu         0.613595
pos         0.325020
compound    0.933231
dtype: float64

In [10]:
# Mean sentiment scores across the sampled 5-star reviews.
anal_sent(sentences5)

neg         0.048274
neu         0.589832
pos         0.361876
compound    0.953878
dtype: float64

### Modelado de tópicos

In [12]:
# Review text only (rating column removed), limited to the first 2000 rows.
tripadvisor = datos.drop(columns=['Rating']).iloc[:2000]

In [13]:
# Display the review-only frame used for topic modelling.
tripadvisor

Unnamed: 0,Review
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso..."
...,...
1995,"best location beach, husband traveled honolulu..."
1996,awesome place stay outriiger beach great place...
1997,"great time, just returned trip stayed 7 nights..."
1998,best location value properties waikiki head ho...


In [14]:
print("Stemms - NLTK")
# NLTK Snowball stemmer configured for English; used below to stem every token.
english_stemmer = SnowballStemmer('english')

Stemms - NLTK


In [15]:
trip_tok = []
for frase in tripadvisor['Review']:
    trip_tok.append(nltk.word_tokenize(frase.lower()))

stems = []    

for frase in trip_tok:
    for palabra in frase:
        result = english_stemmer.stem(palabra)
        stems.append(result)

In [16]:
frases = []
for frase in stems: 
    frases.append(' '.join(frase))

In [17]:
vectorizer = CountVectorizer(stop_words=stopwords, lowercase=True)

BOW=vectorizer.fit_transform(stems)

In [18]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab_BOW = list(vectorizer.get_feature_names_out())
else:
    vocab_BOW = vectorizer.get_feature_names()
# Dense view of the count matrix with one column per vocabulary term.
matrix_BOW = pd.DataFrame(BOW.toarray(), columns=vocab_BOW)
matrix_BOW

Unnamed: 0,00,000,00am,00for,00p,00pm,00usd,01,02,03,...,zero,zillion,zip,zishan,zoe,zombi,zone,zoo,äcor,ærom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
192256,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
192257,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
192258,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
from sklearn.decomposition import LatentDirichletAllocation
# Number of latent topics the LDA model is asked to extract.
number_of_topics = 3

# random_state is pinned to a fixed seed.
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
# Fit the topic model on the bag-of-words count matrix.
model.fit(BOW)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``; iterating it yields one weight
        vector per topic, indexable by feature position.
    feature_names : sequence
        Feature names aligned with the positions of each weight vector.
    no_top_words : int
        Number of top words to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, "Topic <i> words" and "Topic <i> weights"
        (weights formatted as strings with one decimal), ordered from the
        largest weight to the smallest.
    """
    columns = {}
    for topic_idx, weights in enumerate(model.components_):
        # Positions of the largest weights, in descending order of weight.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        columns["Topic %d words" % (topic_idx)] = [
            '{}'.format(feature_names[pos]) for pos in top_positions
        ]
        columns["Topic %d weights" % (topic_idx)] = [
            '{:.1f}'.format(weights[pos]) for pos in top_positions
        ]
    return pd.DataFrame(columns)

In [None]:
# Number of top-weighted words to list for each topic.
no_top_words = 10
# Table of the top words and their weights for every fitted topic.
display_topics(model, vocab_BOW, no_top_words)