# 1. Topic Modeling

In [None]:
%run ./Text_Normalization_Function.ipynb

In [None]:
import json
import pandas

In [None]:
from sklearn import metrics 
import numpy as np 
import pandas as pd 
import glob, os  
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; it is iterated one row per
            topic and each row must support ``argsort()``.
        feature_names: Sequence mapping a column index to its word.
        no_top_words: Number of words to print for each topic.
    """
    for topic_number, word_weights in enumerate(model.components_):
        print("Topic %d:" % topic_number)
        # argsort() is ascending, so reverse it and keep the leading entries.
        ranked_columns = word_weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[column] for column in ranked_columns]
        print(" ".join(top_words))

## Import data

In [None]:
# Load the corpus; each line of the file becomes one entry of `text`.
with open("./yelp_dataset/final_10000.txt", mode="r") as corpus_file:
    text = corpus_file.read().splitlines()

## Set Features

In [None]:
# Upper bound on the vocabulary size kept by the bag-of-words vectoriser.
no_features = 1000

bow_vectorizer_news = CountVectorizer(max_features=no_features)
bow_news_corpus = bow_vectorizer_news.fit_transform(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on old releases;
# tolist() keeps the result a plain list of str, as before.
if hasattr(bow_vectorizer_news, "get_feature_names_out"):
    bow_feature_names_news = bow_vectorizer_news.get_feature_names_out().tolist()
else:
    bow_feature_names_news = bow_vectorizer_news.get_feature_names()

In [None]:
# LDA settings for this run: topic count plus the two (equal) priors.
no_topics_news = 8
# alpha (document-topic prior) and beta (topic-word prior); both must be > 0.
doc_topic_prior_toy = topic_word_prior_toy = 0.005

In [None]:
# Imported explicitly so this cell does not depend on the name having been
# leaked into the namespace by the notebook executed via %run.
from sklearn.decomposition import LatentDirichletAllocation

# Fit LDA on the bag-of-words matrix. The fixed random_state makes the learned
# topics reproducible across kernel restarts.
lda_news = LatentDirichletAllocation(
    n_components=no_topics_news,
    max_iter=100,
    doc_topic_prior=doc_topic_prior_toy,
    topic_word_prior=topic_word_prior_toy,
    random_state=42,
).fit(bow_news_corpus)

In [None]:
# Number of words listed per topic when printing the fitted model.
no_top_words_news = 8
display_topics(
    model=lda_news,
    feature_names=bow_feature_names_news,
    no_top_words=no_top_words_news,
)

## Result Visualization

In [None]:
# Import pyLDAvis explicitly so the cell works on a fresh kernel instead of
# relying on names leaked by the notebook executed via %run.
import pyLDAvis
try:
    # pyLDAvis >= 3.4 renamed the scikit-learn adapter module.
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

pyLDAvis.enable_notebook()
# Build the interactive panel from the fitted model, the document-term matrix
# and the vectoriser that produced it; the bare name below renders it.
visualization_panel = pyldavis_sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')
visualization_panel

# 2. Sentiment Analysis & Topic Modeling

In [None]:
data_folder = './yelp_dataset/'

# NOTE(review): the next two lines look like a leftover json.loads demo; none of
# the visible cells read them. Kept so the existing names stay defined.
jsonData = '{"name": "Frank", "age": 39}'
jsonToPython = json.loads(jsonData)

# Each non-empty line of the file is a separate JSON object. Parse while
# streaming rather than materialising every raw line with readlines() first,
# and skip blank lines, which json.loads would reject.
with open(data_folder + 'review.json', 'rb') as f:
    reviews = [json.loads(line) for line in f if line.strip()]

texts = [review['text'] for review in reviews]
stars = [review['stars'] for review in reviews]

# Preview one review as a sanity check; guarded so a short file cannot raise.
if len(texts) > 3:
    print(texts[3])


In [None]:
# Work on the first 10,000 reviews only.
test = texts[:10000]
# normalize_corpus is called once per review, each time with a one-element list.
test_corpus_filter = [[review_text] for review_text in test]
test_corpus_normalized = [
    normalize_corpus(single_review) for single_review in test_corpus_filter
]

In [None]:
import pandas as pd
import numpy as np
import sys
import nltk
import warnings
warnings.simplefilter(action='ignore')

In [None]:
# Download the 'vader_lexicon' NLTK resource before creating the analyzer.
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
# One analyzer instance, reused for every review scored below.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Concatenate the pieces returned by normalize_corpus for each review into one
# string. NOTE(review): the join uses no separator - presumably each result
# holds a single normalized string; confirm against normalize_corpus.
test_text = list(map("".join, test_corpus_normalized))

In [None]:
def pos_or_neg(score):
    """Map a numeric sentiment score to a label.

    Args:
        score: Numeric sentiment score.

    Returns:
        "positive" when ``score >= 0`` (a score of exactly 0 counts as
        positive), otherwise "negative".
    """
    return "positive" if score >= 0 else "negative"

In [None]:
# Label every review by the sign of its VADER compound score. The review text
# is the dict key, so reviews with identical text collapse into one entry.
cleaned = {
    doc: pos_or_neg(analyzer.polarity_scores(doc)["compound"])
    for doc in test_text
}

# Keep only the reviews labelled negative.
filtered = [doc for doc, label in cleaned.items() if label == "negative"]
len(filtered)

In [None]:
# Persist the negative reviews, one per line; each line ends with a comma.
with open('./filtered_neg.txt', 'w') as out_file:
    out_file.writelines("%s,\n" % negative_review for negative_review in filtered)

In [None]:
# Tabulate the labels: the dict keys (review texts) become the index.
df = pd.DataFrame.from_dict(cleaned,orient="index")
# reset_index() returns a new frame, shown as the cell output; df is not reassigned.
df.reset_index()

In [None]:
from sklearn import metrics 
import numpy as np 
import pandas as pd 
import glob, os  
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [None]:
# NOTE(review): this re-defines display_topics from section 1 with the same body.
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; it is iterated one row per
            topic and each row must support ``argsort()``.
        feature_names: Sequence mapping a column index to its word.
        no_top_words: Number of words to print for each topic.
    """
    for topic_number, word_weights in enumerate(model.components_):
        print("Topic %d:" % topic_number)
        # argsort() is ascending, so reverse it and keep the leading entries.
        ranked_columns = word_weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[column] for column in ranked_columns]
        print(" ".join(top_words))

In [None]:
# Reload the negative reviews written earlier; each line becomes one entry of `text`.
with open("./filtered_neg.txt", mode="r") as negative_file:
    text = negative_file.read().splitlines()

In [None]:
# Upper bound on the vocabulary size kept by the bag-of-words vectoriser.
no_features = 1000

bow_vectorizer_news = CountVectorizer(max_features=no_features)
bow_news_corpus = bow_vectorizer_news.fit_transform(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on old releases;
# tolist() keeps the result a plain list of str, as before.
if hasattr(bow_vectorizer_news, "get_feature_names_out"):
    bow_feature_names_news = bow_vectorizer_news.get_feature_names_out().tolist()
else:
    bow_feature_names_news = bow_vectorizer_news.get_feature_names()

In [None]:
# LDA settings for the negative-review subset: topic count plus the two (equal) priors.
no_topics_news = 3
# alpha (document-topic prior) and beta (topic-word prior); both must be > 0.
doc_topic_prior_toy = topic_word_prior_toy = 0.05

In [None]:
# Imported explicitly so this cell does not depend on the name having been
# leaked into the namespace by the notebook executed via %run.
from sklearn.decomposition import LatentDirichletAllocation

# Fit LDA on the bag-of-words matrix of the negative reviews. The fixed
# random_state makes the learned topics reproducible across kernel restarts.
lda_news = LatentDirichletAllocation(
    n_components=no_topics_news,
    max_iter=100,
    doc_topic_prior=doc_topic_prior_toy,
    topic_word_prior=topic_word_prior_toy,
    random_state=42,
).fit(bow_news_corpus)

In [None]:
# Number of words listed per topic when printing the fitted model.
no_top_words_news = 8
display_topics(
    model=lda_news,
    feature_names=bow_feature_names_news,
    no_top_words=no_top_words_news,
)

In [None]:
# Import pyLDAvis explicitly so the cell works on a fresh kernel instead of
# relying on names leaked by the notebook executed via %run.
import pyLDAvis
try:
    # pyLDAvis >= 3.4 renamed the scikit-learn adapter module.
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

pyLDAvis.enable_notebook()
# Build the interactive panel from the fitted model, the document-term matrix
# and the vectoriser that produced it; the bare name below renders it.
visualization_panel = pyldavis_sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')
visualization_panel