In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')

import re
import numpy
import pandas as pd


import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt


import spacy

import matplotlib.pyplot as pt

# NLTK Stop words
from nltk.corpus import stopwords

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])



def format_topics_sentences(ldamodel, corpus, texts):
        # Init output
        sent_topics_df = pd.DataFrame()

        # Get main topic in each document
        for i, row in enumerate(ldamodel[corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:  # => dominant topic
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
                else:
                    break
        sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

        # Add original text to the end of the output
        contents = pd.Series(texts)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return(sent_topics_df)


def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts,stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts,bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for review in texts:
        doc = nlp(" ".join(review)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ragini\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  from collections import defaultdict, Sequence, Sized, Iterable, Callable
  from collections import defaultdict, Sequence, Sized, Iterable, Callable


In [2]:
def preprocess_text(data_text):

    #data_text['index'] = data_text.index
    stop_words = stopwords.words('english')
    #stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    data_words = list(sent_to_words(data_text))

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    #trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    #trigram_mod = gensim.models.phrases.Phraser(trigram)

    # See trigram example
    #print(bigram_mod[data_words[0]])

    #Remove Stop Words
    data_words_nostops = remove_stopwords(data_words,stop_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops,bigram_mod)

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    
    # filter words occurring in more than 80% of reviews or below 20 reviews
    id2word.filter_extremes(no_below=20, no_above=0.8)
    
    # Create Corpus
    texts = data_lemmatized
    # Term Document Frequency
    return [id2word.doc2bow(text) for text in texts],id2word
    

In [15]:
%%time



def getModel(corpus,id2word):

    # View
    #print(corpus[:10])

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.1,
                                           per_word_topics=True)
    return lda_model


    #print(lda_model.print_topics())

    #df_topic_sents_keywords = format_topics_sentences(lda_model, corpus=corpus, texts=data_text)

    # Format
    #df_dominant_topic = df_topic_sents_keywords.reset_index()
    #df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

    # Show
    #df_dominant_topic.head(10)

Wall time: 0 ns


In [4]:
import pickle

In [15]:
# with open('lda_model.pkl', 'wb') as fp:
#     pickle.dump(lda_model, fp)

In [25]:
with open('lda_model_10.pkl', 'rb') as fp:
    lda_model_10 = pickle.load(fp)

In [19]:
with open('lda_model_20_21.pkl', 'rb') as fp:
    lda_model = pickle.load(fp)
print(lda_model.print_topics())

[(0, '0.048*"coffee" + 0.024*"home" + 0.020*"work" + 0.020*"business" + 0.017*"away" + 0.017*"lamb" + 0.016*"location" + 0.016*"shop" + 0.015*"move" + 0.014*"guess"'), (1, '0.010*"mushroom" + 0.009*"pack" + 0.009*"less" + 0.009*"space" + 0.009*"today" + 0.008*"garlic" + 0.008*"care" + 0.008*"weekend" + 0.007*"easy" + 0.007*"downtown"'), (2, '0.103*"cheese" + 0.063*"wing" + 0.030*"mac" + 0.024*"bread" + 0.023*"night" + 0.023*"butter" + 0.022*"sauce" + 0.022*"buffalo" + 0.018*"plan" + 0.017*"duck"'), (3, '0.118*"bar" + 0.105*"beer" + 0.075*"drink" + 0.063*"selection" + 0.028*"cocktail" + 0.026*"night" + 0.023*"happy_hour" + 0.016*"great" + 0.015*"music" + 0.015*"list"'), (4, '0.157*"sandwich" + 0.055*"bacon" + 0.034*"sunday" + 0.030*"cream" + 0.027*"game" + 0.026*"watch" + 0.018*"fairly" + 0.017*"half" + 0.017*"world" + 0.011*"french_toast"'), (5, '0.138*"burger" + 0.091*"fry" + 0.039*"chip" + 0.029*"mexican" + 0.025*"bland" + 0.025*"potato" + 0.022*"burrito" + 0.021*"salsa" + 0.020*"waf

In [17]:
def getFeatureVector(review,lda_model):
    topics = lda_model.get_document_topics(review)
    values = map(lambda x:x[1],topics)
    return values

In [56]:
user_review_vector = list(map(lambda x:getFeatureVector(x,lda_model),corpus_user_reviews))

In [57]:
user_review_vector_df = pd.DataFrame(user_review_vector,columns=["topic_"+str(x) for x in range(20) ])

In [58]:
user_review_vector_df.head(10)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,0.503811,0.091667,0.165955,0.105207,,,,,,,,,,,,,,,,
1,0.116446,0.216631,0.333195,0.162287,0.099999,,,,,,,,,,,,,,,
2,0.207667,0.148213,0.117325,0.069857,0.408544,,,,,,,,,,,,,,,
3,0.023138,0.104023,0.050598,0.065595,0.053862,0.667778,,,,,,,,,,,,,,
4,0.160643,0.023968,0.038849,0.552838,0.064222,0.043658,0.09485,,,,,,,,,,,,,
5,0.110115,0.118748,0.087446,0.081881,0.328401,0.062149,0.044124,0.13713,,,,,,,,,,,,
6,0.288536,0.113894,0.076791,0.077628,0.256606,0.039694,0.041202,0.07136,,,,,,,,,,,,
7,0.032577,0.058556,0.887343,,,,,,,,,,,,,,,,,
8,0.037225,0.030321,0.053044,0.039477,0.084128,0.375705,0.033422,0.047848,0.022045,0.014038,0.210698,0.044776,,,,,,,,
9,0.047109,0.031893,0.035458,0.041712,0.09972,0.030184,0.275677,0.077027,0.079376,0.158896,0.106581,,,,,,,,,


In [6]:
import pandas as pd
data = pd.read_csv('D:\\Data Mining\\yelp_dataset\\PA\\Restaurants-new\\Restaurants\\train\\PA_train_yelp_academic_dataset_review.csv', error_bad_lines=False);
pd.set_option('display.max_colwidth', -1)
data['new_text']=data['text']+" "+(data['stars']).astype(str)+"stars"
data = data.drop(['funny','stars','date','useful','cool','text'],axis=1)
user_reviews = data.groupby(["user_id"])["new_text"].apply(lambda x:' '.join(x)).reset_index()
business_reviews = data.groupby(["business_id"])["new_text"].apply(lambda x:' '.join(x)).reset_index()

In [None]:
corpus_user_reviews_new,id2word_user = preprocess_text(user_reviews[["new_text"]].values.tolist())

In [21]:
corpus_business_reviews_new,id2word_business = preprocess_text(business_reviews[["new_text"]].values.tolist())

In [7]:
corpus_new,id2word_new = preprocess_text(data[["new_text"]].values.tolist())

In [21]:
%store corpus_new
%store id2word_new

Stored 'corpus_new' (list)
Stored 'id2word_new' (Dictionary)


In [41]:
lda_model_20_user = getModel(corpus_user_reviews_new,id2word_user)


In [42]:
lda_model_20_business = getModel(corpus_business_reviews_new,id2word_business)

In [44]:
lda_model_20_business

<gensim.models.ldamodel.LdaModel at 0x267c2adab00>

In [18]:
import pickle

In [45]:
pickle.dump(lda_model_20_user, open('lda_model_20_user.pkl', 'wb'))

In [46]:
pickle.dump(lda_model_20_business, open('lda_model_20_business.pkl', 'wb'))

In [36]:
def get_topics_weight(data,lda_model,corpus):
    review_vector = list(map(lambda x:getFeatureVector(x,lda_model),corpus))
    review_vector_df = pd.DataFrame(review_vector,columns=["topic_"+str(x) for x in range(20) ])
    reviews_topic= pd.concat([data,review_vector_df],axis=1).drop(['new_text'],axis=1)
    return reviews_topic

In [50]:
user_reviews_topic= get_topics_weight(user_reviews,lda_model_20_user,corpus_user_reviews_new)

In [51]:
user_reviews_topic.to_csv('user_reviews_topic.csv',index=False)

In [52]:
business_reviews_topic = get_topics_weight(business_reviews,lda_model_20_business,corpus_business_reviews_new)

In [53]:
business_reviews_topic.to_csv('business_reviews_topic.csv',index=False)

In [54]:
lda_model_20_21 = getModel(corpus_new,id2word_new)

In [58]:
pickle.dump(lda_model_20_21, open('lda_model_20_21.pkl', 'wb'))

In [59]:
reviews_topic = get_topics_weight(data,lda_model_20_21,corpus_new)

In [60]:
reviews_topic.to_csv('reviews_topic_new.csv',index=False)

In [14]:
corpus_new,id2word_new = preprocess_text(data[["new_text"]].values.tolist())
with open('lda_model_20_21.pkl', 'rb') as fp:
    lda_model_20_21 = pickle.load(fp)

data_vis = pyLDAvis.gensim.prepare(lda_model_20_21, corpus_new, id2word_new)

In [17]:
pyLDAvis.enable_notebook()

In [18]:
data_vis

In [12]:
%store -r corpus_new

no stored variable corpus_new


In [2]:
import pandas as pd
valid_data = pd.read_csv('D:\\Data Mining\\yelp_dataset\\PA\\Restaurants-new\\Restaurants\\valid\\PA_valid_yelp_academic_dataset_review.csv', error_bad_lines=False);
valid_data['new_text']=valid_data['text']+" "+(valid_data['stars']).astype(str)+"stars"
valid_data = valid_data.drop(['funny','stars','date','useful','cool','text'],axis=1)
corpus_new,_= preprocess_text(valid_data[["new_text"]].values.tolist())
valid_data_df = get_topics_weight(valid_data,lda_model_20_21,corpus_new)
valid_data_df.to_csv('valid/reviews_topic_valid.csv',index=False)

NameError: name 'preprocess_text' is not defined

In [64]:
test_data = pd.read_csv('D:\\Data Mining\\yelp_dataset\\PA\\Restaurants-new\\Restaurants\\test\\PA_test_yelp_academic_dataset_review.csv', error_bad_lines=False);
test_data['new_text']= test_data['text']+" "+(test_data['stars']).astype(str)+"stars"
test_data = test_data.drop(['funny','stars','date','useful','cool','text'],axis=1)
corpus_new,_= preprocess_text(test_data[["new_text"]].values.tolist())
test_data_df = get_topics_weight(test_data,lda_model_20_21,corpus_new)
test_data_df.to_csv('test/reviews_topic_test.csv',index=False)

In [37]:
with open('lda_model_20_21.pkl', 'rb') as fp:
    lda_model_20_21 = pickle.load(fp)

In [25]:
data_valid = pd.read_csv('D:\\Data Mining\\yelp_dataset\\PA\\Restaurants-new\\Restaurants\\valid\\PA_valid_yelp_academic_dataset_review.csv', error_bad_lines=False);
data_valid['new_text']=data_valid['text']+" "+(data_valid['stars']).astype(str)+"stars"
data_valid = data_valid.drop(['funny','stars','date','useful','cool','text'],axis=1)
user_reviews_valid = data_valid.groupby(["user_id"])["new_text"].apply(lambda x:' '.join(x)).reset_index()
business_reviews_valid = data_valid.groupby(["business_id"])["new_text"].apply(lambda x:' '.join(x)).reset_index()
corpus_valid,_= preprocess_text(business_reviews_valid[["new_text"]].values.tolist())


In [55]:
data_test = pd.read_csv('D:\\Data Mining\\yelp_dataset\\PA\\Restaurants-new\\Restaurants\\test\\PA_test_yelp_academic_dataset_review.csv', error_bad_lines=False);
data_test['new_text']=data_test['text'].astype(str)+" "+(data_test['stars']).astype(str)+"stars"
data_test = data_test.drop(['funny','stars','date','useful','cool','text'],axis=1)
user_reviews_test = data_test.groupby(["user_id"])["new_text"].apply(lambda x:' '.join(x)).reset_index()
business_reviews_test = data_test.groupby(["business_id"])["new_text"].apply(lambda x:' '.join(x)).reset_index()
corpus_test,_= preprocess_text(business_reviews_test["new_text"].tolist())


In [57]:
business_reviews_topic = get_topics_weight(business_reviews_valid,lda_model_20_21,corpus_valid)
business_reviews_topic.to_csv('valid/business_reviews_topic_valid.csv',index=False)



In [58]:
business_reviews_topic = get_topics_weight(business_reviews_test,lda_model_20_21,corpus_test)
business_reviews_topic.to_csv('test/business_reviews_topic_test.csv',index=False)