# Set-up
In this notebook I've provided the primary code I used to perform topic modeling and VADER sentiment analysis. More of my code and thoughts can be seen in the Functions.py file in this repo.

# Imports

In [None]:
# Standard library
import os
import pickle
import time
from collections import Counter

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
import spacy
from corextopic import corextopic as ct
from nltk import FreqDist, word_tokenize
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS,
    CountVectorizer,
    TfidfVectorizer,
)
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Local helpers (Functions.py in this repo)
import Functions as akf

# Make sure the small English spaCy model is installed. A bare
# `python -m spacy download ...` line is a shell command, not Python, so it
# cannot live in a code cell as-is; the guard skips the download when the
# model is already present so Restart & Run All stays cheap.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

# Unpickling

In [None]:
# Restore the object saved in alaska_hotels_df.pickle; later cells use it as
# the main DataFrame.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open("alaska_hotels_df.pickle", mode="rb") as pickled_file:
    main_df = pickle.load(pickled_file)

# Running Vader sentiment analysis

In [None]:
# Running Vader on reviews in the full df
# akf.vader_scores lives in Functions.py. Later cells read "Vader +" and
# "Vader -" from main_df, so it presumably adds those columns -- confirm there.
# NOTE(review): main_df is rebound to the helper's result, so re-running this
# cell feeds the already-scored frame back into the helper.
main_df = akf.vader_scores(main_df)

In [None]:
# Net sentiment per review: the "Vader +" score minus the "Vader -" score
positive_scores = main_df["Vader +"]
negative_scores = main_df["Vader -"]
main_df["Sentiment net"] = positive_scores.sub(negative_scores)

# Trying pyLDAvis
This didn't yield much but it did inform the parts of my process that followed.

In [None]:
# NOTE(review): nl_X is not defined in any cell visible in this notebook, so
# this cell fails on a fresh kernel. It is used here as a pandas object of raw
# review text (its .values are vectorized) -- define or load it above this cell.

# Bag-of-words counts: lowercase the text, drop English stop words, and ignore
# terms that appear in more than half of the documents.
vectorizer = CountVectorizer(stop_words="english", lowercase=True, max_df=0.5)
nl_df_vectorized = vectorizer.fit_transform(nl_X.values)

# 20-topic LDA with a fixed seed so the fit is repeatable.
nl_lda = LatentDirichletAllocation(n_components=20, random_state=0)
nl_lda.fit(nl_df_vectorized)

# Register pyLDAvis's notebook renderer; without it the prepared data is shown
# as a raw data structure instead of the interactive visualization.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(nl_lda, nl_df_vectorized, vectorizer)

# Topic modeling with NMF and LSA
My attempts with NMF and LSA, as well as LDA above, didn't yield clean topics, but I've included a mention (and an example call for each) to acknowledge this part of my process.

In [None]:
# Example of NMF topic modeling with example parameters
# akf.run_model is defined in Functions.py. The positional 8 is presumably the
# number of topics -- confirm against the helper's signature. ngram_range=(1,2)
# is passed through by keyword.
akf.run_model(main_df, "nmf", 8, ngram_range=(1,2))

In [None]:
# Example of LSA topic modeling with example parameters
# Same helper and arguments as the NMF example; only the model key ("lsa")
# differs. The positional 8 is presumably the number of topics -- confirm in
# Functions.py.
akf.run_model(main_df, "lsa", 8, ngram_range=(1,2))

# Running CorEx
After days of fitting and tuning, CorEx ended up as the foundation of my project. Below is a sample of my CorEx modeling. I saved nearly all of the resulting measures/attributes in a df for later analysis.

In [None]:
# Empty results table: one column per CorEx attribute or vectorizer setting
# recorded for each model run.
corex_result_columns = [
    "Topics",
    "Anchors",
    "Topic words",
    "TC",
    "TCs",
    "Labels",
    "Clusters",
    "Alpha",
    "Mis",
    "P(y)|x",
    "Words",
    "N-gram range",
    "min_df",
    "max_df",
]
corex_df = pd.DataFrame(columns=corex_result_columns)

In [None]:
# Vectorizing the corpus
corpus = main_df["Clean review no cities"]

# With binary=True, use_idf=False and norm=None this TfidfVectorizer produces a
# plain 0/1 presence matrix over unigrams and bigrams: terms must appear in at
# least 5 documents and in no more than half of them.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    max_features=None,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False,
)
tfidf = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2 in favor of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
words = list(np.asarray(feature_names))

In [None]:
# Running CorEx with a range of topic parameters and an otherwise fixed set of parameters
# I did this with a ton of anchor combos
anchors = [["hotel", "lodge"], ["aurora", "northern lights"]]

corex_records = []
for n_topics in tqdm(range(3, 28)):
    topic_model = ct.Corex(n_hidden=n_topics, words=words, seed=10)
    topic_model.fit(tfidf, words=words, docs=corpus, anchors=anchors)

    # Readable summary per topic: up to 10 n-grams, keeping only those whose
    # second tuple entry (the score returned by get_topics) is positive.
    topic_words = []
    for x, topic_ngrams in enumerate(topic_model.get_topics(n_words=10)):
        topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
        topic_words.append("Topic #{}: {}".format(x + 1, ", ".join(topic_ngrams)))

    corex_records.append({
        "Topics": n_topics,
        # Copy so every row owns its anchor lists instead of sharing one object.
        "Anchors": [list(group) for group in anchors],
        "Topic words": topic_words,
        "TC": topic_model.tc,
        "TCs": topic_model.tcs,
        "Labels": topic_model.labels,
        "Clusters": topic_model.clusters,
        "Alpha": topic_model.alpha,
        "Mis": topic_model.mis,
        "P(y)|x": topic_model.p_y_given_x,
        "Words": topic_model.words,
        # Read the settings from the fitted vectorizer so the record cannot
        # drift from what was actually used to build `tfidf`.
        "N-gram range": vectorizer.ngram_range,
        "min_df": vectorizer.min_df,
        "max_df": vectorizer.max_df,
    })

# Store the runs. Previously each run's details were built and then discarded,
# leaving corex_df empty, so the corex_df.iloc[8] lookup in the next cell
# failed. Rows are added after any existing ones so that sweeps with other
# anchor combinations accumulate in the same table.
new_rows = pd.DataFrame(corex_records, columns=corex_df.columns)
if corex_df.empty:
    corex_df = new_rows
else:
    corex_df = pd.concat([corex_df, new_rows], ignore_index=True)

In [None]:
# Attach the chosen CorEx run (row 8 of corex_df) to main_df, aligning on the
# row index. The P(y|x) matrix goes in first and the labels second; both frames
# use the same integer column names, so the second merge suffixes the P(y|x)
# columns with "_x" and the label columns with "_y".
best_run = corex_df.iloc[8]
for attribute in ("P(y)|x", "Labels"):
    attribute_frame = pd.DataFrame(best_run[attribute])
    main_df = main_df.merge(attribute_frame, left_index=True, right_index=True)

In [None]:
# Give the merged columns readable names: "<k>_y" columns hold the labels and
# "<k>_x" columns hold p(y|x); topics are numbered from 1.
label_column_names = {f"{k}_y": f"Topic {k + 1} label" for k in range(11)}
probability_column_names = {f"{k}_x": f"Topic {k + 1} p(y|x)" for k in range(11)}
main_df.rename(columns=label_column_names, inplace=True)
main_df.rename(columns=probability_column_names, inplace=True)

In [None]:
# Checking topic counts
# Count the reviews assigned to each topic by summing the label column (True/1
# counts as one). The earlier value_counts()[1] lookup is positional when the
# labels are booleans, so it returned the count of the less frequent value, and
# it raised when a column held only one distinct value.
final_model_topic_counts = [
    f"Topic {topic_number}: {int(main_df[f'Topic {topic_number} label'].sum())}"
    for topic_number in range(1, 12)
]
final_model_topic_counts

In [None]:
# Derive "Season of stay" by passing each "Month of stay" value through
# akf.get_season (defined in Functions.py).
month_of_stay = main_df["Month of stay"]
main_df["Season of stay"] = month_of_stay.apply(akf.get_season)

In [None]:
# Split latitude into 5 equal-width bins, then show the mean Topic 5 p(y|x)
# within each bin.
# NOTE(review): lat_long_df is not defined in any cell visible here; it needs
# "Latitude" and "Topic 5 p(y|x)" columns -- confirm where it is built.
latitude_values = lat_long_df["Latitude"]
lat_long_df["Latitude bin"] = pd.cut(latitude_values, bins=5)
lat_long_df.groupby("Latitude bin")["Topic 5 p(y|x)"].mean()

# More analysis...