In [5]:
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.decomposition import TruncatedSVD, NMF
from nltk.corpus import stopwords
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
import nltk
from tqdm import tqdm
from nltk import word_tokenize, FreqDist
import matplotlib.pyplot as plt
import pkg_resources
from symspellpy import SymSpell, Verbosity
import spacy
import datetime as dt
import time, os

In [6]:
with open("newest_main_df.pickle", "rb") as to_read:
    main_df = pickle.load(to_read)

## Dropping NaNs in "Full review" and spell-checking all words longer than five letters
 - 111529 -> 94513 NaNs (possible that other kinds of cleaning/filtering helpef before all_hotels_df was unpickled here, however)
- 94295 reviews were changed by SymSpell spell-check function and 218 were not
    - worth noting that periods and spaces were updated so not all included actual typos or spelling errors

In [3]:
#main_df = spell_checker(all_hotels_df, pickling=True)

100%|██████████| 94513/94513 [01:45<00:00, 899.77it/s]


## Cleaning spell-checked reviews
Took 17 minutes to run!


In [4]:
# main_df = review_cleaner(main_df)

100%|██████████| 94513/94513 [17:09<00:00, 91.79it/s]


## Adding a month column from "Date of stay"

In [20]:
# main_df["Month of stay"] = pd.DatetimeIndex(main_df["Date of stay"]).month
# main_df["Year of stay"] = pd.DatetimeIndex(main_df["Date of stay"]).year

In [28]:
# main_df["Month of stay"].value_counts()

7.0     22284
8.0     20829
6.0     19069
9.0      9875
5.0      7889
3.0      2819
10.0     2420
4.0      2021
2.0      2015
1.0      1555
11.0     1535
12.0     1517
Name: Month of stay, dtype: int64

In [156]:
with open("all_alaska_hotels_spell_checked.pickle", "wb") as to_write:
     pickle.dump(main_df, to_write)

## Checking top words
- Sorted by month, they are all essentially and concern hotel matters
- Ran LSA and NMF with 2 topics each and didn't get much
    - In LSA, explained variance ratios were 0.0029 and 0.0075 respectively and both related to hotel matters

In [4]:
top_words(main_df, separater="Month of stay")


Most common words: 10.0:
1. ('room', 2385)
2. ('stay', 2180)
3. ('hotel', 1696)
4. ('great', 1075)
5. ('staff', 1057)
6. ('good', 1021)
7. ('clean', 1019)
8. ('breakfast', 914)
9. ('place', 849)
10. ('nice', 745)

Most common words: 9.0:
1. ('room', 10119)
2. ('stay', 8302)
3. ('hotel', 5439)
4. ('great', 4307)
5. ('good', 4246)
6. ('clean', 3867)
7. ('night', 3683)
8. ('breakfast', 3478)
9. ('staff', 3447)
10. ('place', 3443)

Most common words: 6.0:
1. ('room', 19387)
2. ('stay', 15908)
3. ('hotel', 9214)
4. ('great', 8210)
5. ('good', 7657)
6. ('clean', 7194)
7. ('night', 7131)
8. ('breakfast', 7116)
9. ('place', 7061)
10. ('one', 6202)

Most common words: 7.0:
1. ('room', 21867)
2. ('stay', 18648)
3. ('hotel', 10437)
4. ('great', 9706)
5. ('good', 8903)
6. ('place', 8727)
7. ('clean', 8454)
8. ('night', 8219)
9. ('breakfast', 8047)
10. ('nice', 7136)

Most common words: 8.0:
1. ('room', 20802)
2. ('stay', 16972)
3. ('hotel', 10108)
4. ('great', 9057)
5. ('good', 8542)
6. ('night',

In [4]:
run_model(main_df, "lsa", 2)


Topic 1
['staff', 'nice', 'clean', 'good', 'great', 'hotel']
Explained variance ratio: 0.0028742653671233478

Topic 2
['anchorage', 'downtown', 'staff', 'shuttle', 'airport', 'hotel']
Explained variance ratio: 0.007528947079170521


In [5]:
run_model(main_df, "nmf", 2)


Topic 1
['breakfast', 'nice', 'clean', 'good', 'staff', 'hotel']

Topic 2
['make', 'view', 'place', 'lodge', 'great', 'cabin']


In [4]:
results_df = run_model(main_df, "nmf", 2, results_df=True)


Topic 1
['nice', 'clean', 'good', 'staff', 'hotel']

Topic 2
['view', 'place', 'lodge', 'great', 'cabin']


In [10]:
results_df.sort_values(by=0, ascending=False)

Unnamed: 0,0,1,Full review,Cleaned review
15745,0.084707,0.000000,We really enjoyed our stay at this hotel. All...,really enjoy stay hotel hotel staff nice room ...
93338,0.080808,0.000000,"We stayed here, downtown, for 3 nights. There ...",stay downtown night hotel shuttle ferry expect...
14629,0.078912,0.000000,"Although this hotel isn't located downtown, it...",although hotel be not locate downtown nice loc...
2401,0.078685,0.000000,My husband and I have stayed here several time...,husband stay several time seem like old hotel ...
7778,0.075515,0.000000,This hotel is very convenient to the airport. ...,hotel convenient airport good shuttle service ...
...,...,...,...,...
11119,0.000000,0.039544,"After all day driving and sightseeing, Larry w...",day drive sightseeing larry quick get check co...
54754,0.000000,0.038839,,
11117,0.000000,0.037809,We stayed for two nights and found it to be on...,stay two night find one good cabin stay kitche...
54752,0.000000,0.047527,,


In [61]:
results_df = run_model(main_df, "nmf", 5, results_df=True)


Topic 1
['shuttle', 'restaurant', 'good', 'staff', 'hotel']

Topic 2
['wonderful', 'view', 'breakfast', 'place', 'great']

Topic 3
['small', 'creek', 'park', 'denali', 'cabin']

Topic 4
['day', 'trip', 'fish', 'fishing', 'lodge']

Topic 5
['nice', 'small', 'bathroom', 'night', 'bed']


In [46]:
results_df.sort_values(by=2, ascending=False)

Unnamed: 0,0,1,2,Full review,Cleaned review
65984,0.000000,0.000000,0.141851,,
60270,0.000000,0.000000,0.122540,,
40175,0.000000,0.000000,0.122437,We stayed at McKinley Creekside Cabins on 7-30...,stay mckinley creekside cabin cabin clean love...
64795,0.000000,0.000000,0.122111,,
40065,0.000000,0.000000,0.121195,Four adults stayed for two nights in the Carlo...,four adult stay two night carlo cabin staff ex...
...,...,...,...,...,...
76420,0.025359,0.008739,0.000000,Favorite stay of our 10-day Alaskan adventure....,favourite stay day alaskan adventure view bb s...
76419,0.029318,0.000000,0.000000,I did a lot of searching on TripAdvisor before...,lot search tripadvisor book bed breakfast even...
60890,0.040815,0.000000,0.000000,,
31566,0.033260,0.014950,0.000000,"Well located, clean and comfortable. Good opt...",well locate clean good option fisherman overni...


## Trying Vader

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
main_df = vader_scores(main_df)

100%|██████████| 94513/94513 [00:43<00:00, 2187.31it/s]


In [11]:
main_df.groupby("Month of stay")["Vader +"].mean()

Month of stay
1.0     0.333032
2.0     0.329967
3.0     0.326659
4.0     0.333819
5.0     0.314747
6.0     0.307647
7.0     0.306632
8.0     0.304542
9.0     0.312721
10.0    0.333707
11.0    0.332612
12.0    0.332488
Name: Vader +, dtype: float64

In [12]:
main_df.groupby("Month of stay")["Vader -"].mean()

Month of stay
1.0     0.043122
2.0     0.040432
3.0     0.041645
4.0     0.041874
5.0     0.040397
6.0     0.042529
7.0     0.042481
8.0     0.043819
9.0     0.041674
10.0    0.041539
11.0    0.042252
12.0    0.040735
Name: Vader -, dtype: float64

## Trying to get hotel rates

In [13]:
with open("hotel_urls.pickle", "rb") as to_read:
    hotel_urls = pickle.load(to_read)

In [35]:
def get_hotel_rates(list_of_urls):
    '''Takes a list of URLs and grabs the rate for each, returning it as the value to the
    property name key.
    '''
    rate_dict = {}

    chromedriver = "/Applications/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    driver.get("https://www.tripadvisor.com/Hotels-g28923-Alaska-Hotels.html")
    hotels_toggle = driver.find_element_by_xpath('/html/body/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[5]/div/div/div[2]/div[2]/div[2]/div[4]/div/label')
    hotels_toggle.click()
    for i in range(99):
        print(i)
        time.sleep(7)
        soup = BeautifulSoup(driver.page_source)
        listings = soup.find("div", id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0").find_all("div", class_="prw_rup prw_meta_hsx_responsive_listing ui_section listItem")
        for listing in listings:
            property = listing.find("div", class_="listing_title").text
            print(property)
            try:
                rate = int("".join(listing.find("div", class_="price-wrap").text.split("$")[1].split(",")))
            except AttributeError:
                rate = np.NaN
            rate_dict[property] = rate
        next_button = driver.find_element_by_xpath('//*[@id="taplc_main_pagination_bar_dusty_hotels_resp_0"]/div/div/div/span[2]')
        next_button.click()
        with open("hotel_rates.pickle", "wb") as to_write:
            pickle.dump(rate_dict, to_write)

    return rate_dict

In [36]:
rate_dict = get_hotel_rates(hotel_urls)

0
      Harbor 360 Hotel
      Alyeska Resort
      Pike's Waterfront Lodge
      Aurora Denali Lodge
      The Hotel Captain Cook
      Denali Lakeview Inn
      Land's End Resort
      The Lakefront Anchorage
      Embassy Suites by Hilton Anchorage
      Chena Hot Springs Resort
      Denali Fireside Cabins & Suites
      La Quinta Inn & Suites by Wyndham Anchorage Airport
      La Quinta Inn & Suites by Wyndham Fairbanks Airport
      SpringHill Suites by Marriott Fairbanks
      Best Western Golden Lion Hotel
      The Voyager Inn
      Wedgewood Resort
      Best Western Grandma'S Feather Bed
      Sophie Station Suites
      Cape Fox Lodge
      Puffin Inn of Anchorage
      Aptel Studio Hotel
      Northern Sky Lodge
      Aspen Suites Hotel Homer
      Hotel Seward
      Aspen Suites Hotel Anchorage
      Lake Louise Lodge
      Coast Inn at Lake Hood
      Aspen Hotel Soldotna
      Driftwood Hotel
1
      Dimond Center Hotel
      Sheep Mountain Lodge
      Talkeetna Hideawa

IndexError: list index out of range

In [37]:
with open("hotel_rates.pickle", "rb") as to_read:
    rates_dict = pickle.load(to_read)

## Inputting lats and longs as floats

In [29]:
for_mapping_df = main_df[main_df["Lat, long"].notna()]
lats = []
longs = []
for i in range(for_mapping_df.shape[0]):
    lats.append(float(for_mapping_df.iloc[i]["Lat, long"].split(",")[0]))
    longs.append(float(for_mapping_df.iloc[i]["Lat, long"].split(",")[1]))
for_mapping_df["Latitude"] = lats
for_mapping_df["Longitude"] = longs

In [30]:
for_mapping_df.to_csv("Updated df.csv")

## Cleaning out more stopwords
NLTK's 'english' stopwords don't seem to have done an adequate job, so I'm using sci-kit learn's

In [75]:
def clean_again(review):

    cleaned_review = ''
    cleaned_words = 0
    for word in review.split():
        if word not in ENGLISH_STOP_WORDS:
            cleaned_review += word + ' '
        else:
            cleaned_words += 1
    print(cleaned_words)

    return cleaned_review

In [76]:
main_df["Cleaned review v2"] = main_df["Cleaned review"].apply(lambda x: clean_again(x))

2
7
9
8
1
6
7
2
1
2
8
3
9
2
11
6
0
9
10
5
5
17
17
18
5
13
3
7
6
8
15
12
8
6
9
4
4
2
6
7
9
5
3
17
12
8
1
5
5
10
1
10
11
15
10
5
1
7
11
13
12
7
10
17
11
12
4
8
14
3
4
4
18
7
2
5
6
12
5
11
4
5
10
11
6
1
13
8
6
10
11
3
7
11
11
13
6
13
2
7
1
8
11
12
8
8
9
6
1
9
14
11
18
7
9
14
12
7
13
7
10
2
9
7
7
16
6
1
5
5
6
10
7
13
4
10
10
5
18
8
0
15
2
12
15
6
10
8
14
5
5
5
9
3
8
5
4
2
2
11
3
5
3
4
5
3
2
5
17
6
4
5
10
11
10
1
4
9
4
4
12
12
0
8
6
4
15
5
3
2
2
14
8
6
9
5
10
8
4
5
10
17
6
6
6
14
20
3
18
8
7
9
10
5
6
15
5
1
4
6
6
13
11
2
17
2
8
9
6
1
8
3
7
11
10
6
2
12
2
3
15
20
1
9
2
6
6
14
12
12
11
8
2
2
5
4
12
4
13
6
12
4
4
4
2
23
8
4
7
17
12
4
6
13
12
8
8
1
10
8
4
5
1
6
7
5
4
5
17
6
6
2
11
2
11
8
9
7
2
2
0
16
7
7
5
2
5
5
6
10
7
5
3
0
14
11
4
8
5
10
5
2
6
6
11
1
3
2
10
2
4
4
1
10
2
16
5
7
9
6
9
2
13
8
7
9
4
7
9
4
11
7
8
6
7
10
9
4
4
4
5
4
7
5
14
1
1
8
8
5
14
9
6
9
16
9
7
12
5
4
6
12
9
7
8
1
12
6
25
5
5
9
7
13
5
1
5
6
14
1
4
11
8
4
7
6
15
8
13
6
13
1
7
10
9
3
7
4
5
1
4
12
4
2
3
11
5
0
4
9
9
7
7
8
10
2
8
5

In [85]:
results_df = run_model(main_df, "nmf", 8, results_df=True, ngram_range=(1,2))


Topic 1
['small', 'nice', 'bathroom', 'night', 'bed']

Topic 2
['wonderful', 'view', 'breakfast', 'place', 'great']

Topic 3
['owner', 'creek', 'stay cabin', 'cabin clean', 'cabin']

Topic 4
['day', 'trip', 'fish', 'lodge', 'fishing']

Topic 5
['view', 'restaurant', 'lodge', 'park', 'denali']

Topic 6
['clean', 'staff friendly', 'helpful', 'friendly', 'staff']

Topic 7
['downtown', 'restaurant', 'stay hotel', 'anchorage', 'hotel']

Topic 8
['downtown', 'service', 'free', 'airport', 'shuttle']


In [86]:
top_words(main_df, 100)



1. ('room', 95733)
2. ('stay', 79677)
3. ('hotel', 49682)
4. ('great', 41285)
5. ('good', 38315)
6. ('clean', 36405)
7. ('place', 35089)
8. ('breakfast', 34819)
9. ('night', 33833)
10. ('staff', 32329)
11. ('nice', 30199)
12. ('one', 28315)
13. ('would', 27430)
14. ('get', 26978)
15. ('well', 25153)
16. ('bed', 24154)
17. ('make', 23851)
18. ('view', 23321)
19. ('go', 23163)
20. ('day', 23061)
21. ('cabin', 22246)
22. ('lodge', 22197)
23. ('time', 21468)
24. ('not', 21174)
25. ('friendly', 20651)
26. ('restaurant', 19088)
27. ('alaska', 18971)
28. ('area', 18586)
29. ('location', 17372)
30. ('could', 16969)
31. ('like', 16776)
32. ('trip', 15885)
33. ('two', 15366)
34. ('need', 15366)
35. ('helpful', 15311)
36. ('walk', 15309)
37. ('food', 15055)
38. ('take', 14970)
39. ('back', 14161)
40. ('service', 13909)
41. ('small', 13200)
42. ('want', 13130)
43. ('see', 12947)
44. ('really', 12375)
45. ('anchorage', 12315)
46. ('find', 12030)
47. ('bathroom', 11913)
48. ('beautiful', 11710)
49

## Trying CorEx

In [2]:
from corextopic import corextopic as ct

In [8]:
corpus = main_df['Cleaned review v2']
tfidf = TfidfVectorizer(stop_words="english", max_df=0.5, max_features=None, ngram_range=(1, 3), norm=None, binary=True, sublinear_tf=False)
df_vectorized = tfidf.fit_transform(corpus)
vocab = tfidf.get_feature_names()

In [5]:
df_vectorized.toarray().shape

(94513, 3964545)

In [9]:
# This is for comparing with the results in the cell below

corex = ct.Corex(n_hidden=5, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors=[["hotel", "lodge"], ["aurora", "northern lights"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

KeyboardInterrupt: 

In [104]:
corex = ct.Corex(n_hidden=5, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: hotel, shuttle, desk, lobby, restaurant, airport
Topic #2: aurora, light, boreal, northern, like, aurora boreal, know, guest, want, northern light
Topic #3: fishing, fish, guide, salmon, catch, halibut, trip, bear, boat, experience
Topic #4: coffee, microwave, bed, bathroom, fruit, egg, fridge, tv, kitchen, cereal
Topic #5: outdoor, email, cool, write, young, furnish, kind, arrangement, creamer, landscape


In [138]:
corex = ct.Corex(n_hidden=10, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = ["hotel", "lodge", "cabin"])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: hotel, desk, shuttle, staff, lobby, restaurant, service, stay hotel
Topic #2: lodge, fishing, fish, guide, national park, national, river, denali, salmon, denali national
Topic #3: cabin, view, beautiful, mountain, lake, glacier, deck, enjoy, cabin clean, stay cabin
Topic #4: day, trip, alaska, dinner, meal, spend, trip alaska, lunch, week, adventure
Topic #5: breakfast, bb, living, fruit, bedroom, egg, house, delicious, kitchen, living room
Topic #6: bed, floor, microwave, tv, fridge, smell, small, carpet, wall, door
Topic #7: arrive, check, tell, say, review, book, ask, late, leave, reservation
Topic #8: make, home, feel, experience, make feel, thank, wonderful, family, year, feel like
Topic #9: bathroom, shower, main, outside, water, building, large, window, chair, table
Topic #10: downtown, airport, walk, distance, walk distance, locate, train, free, close, location


In [139]:
corex_details = {"Topics": 10, "Anchors": [["hotel", "lodge", "cabin"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

In [140]:
corex_df = corex_df.append(corex_details, ignore_index=True)

In [146]:
with open("corex_df.pickle", "wb") as to_write:
    pickle.dump(corex_df, to_write)

In [141]:
corex = ct.Corex(n_hidden=5, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["staff"], ["nature"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: hotel, desk, floor, shuttle, cruise, lobby, bad, princess
Topic #2: cabin, host, bb, beautiful, house, owner, denali, view
Topic #3: fishing, fish, guide, trip, salmon, bear, experience, boat, catch, halibut
Topic #4: walk, downtown, distance, restaurant, microwave, free, bed, small, walk distance, shop
Topic #5: arrive, tell, say, check, day, leave, book, didst, night, review


In [145]:
for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: hotel, desk, floor, shuttle, cruise, lobby, bad, princess
Topic #2: cabin, host, bb, beautiful, house, owner, denali, view
Topic #3: fishing, fish, guide, trip, salmon, bear, experience, boat, catch, halibut
Topic #4: walk, downtown, distance, restaurant, microwave, free, bed, small, walk distance, shop
Topic #5: arrive, tell, say, check, day, leave, book, didst, night, review


In [148]:
corex = ct.Corex(n_hidden=5, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["nature", "beautiful"], ["breakfast", "cereal"], ["check", "staff"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: hotel, floor, didst, open, tell, bad, door, window, night, pay
Topic #2: fishing, fish, guide, trip, experience, home, salmon, make, bear, alaska
Topic #3: coffee, breakfast, bed, microwave, kitchen, fridge, bedroom, bathroom, fruit, tv
Topic #4: cabin, host, bb, owner, denali, park, house


In [152]:
corex = ct.Corex(n_hidden=5, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["nature", "beautiful"], ["breakfast", "cereal"], ["check", "staff"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: hotel, floor, restaurant, shuttle, cruise, lobby, small, princess, window, walk
Topic #2: fishing, fish, guide, trip, salmon, experience, bear, amazing, home, beautiful
Topic #3: coffee, bed, microwave, breakfast, kitchen, fridge, bathroom, bedroom, fruit, tv
Topic #4: cabin, host, bb, denali, owner, park, house
Topic #5: arrive, tell, say, book, didst, review, leave, ask, day, want


In [157]:
corex_details = {"Topics": 5, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

Topic #1: hotel, floor, restaurant, shuttle, cruise, lobby, small, princess, window, walk
Topic #2: fishing, fish, guide, trip, salmon, experience, bear, amazing, home, beautiful
Topic #3: coffee, bed, microwave, breakfast, kitchen, fridge, bathroom, bedroom, fruit, tv
Topic #4: cabin, host, bb, denali, owner, park, house
Topic #5: arrive, tell, say, book, didst, review, leave, ask, day, want


In [154]:
corex_df = corex_df.append(corex_details, ignore_index=True)

In [None]:
corex = ct.Corex(n_hidden=4, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 4, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

corex_df = corex_df.append(corex_details, ignore_index=True)

In [158]:
corex = ct.Corex(n_hidden=5, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 5, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, restaurant, airport, desk, downtown, lobby, cruise, free, staff
Topic #2: fishing, fish, guide, trip, salmon, catch, boat, bear, halibut, experience
Topic #3: cabin, denali, view, park, mountain, beautiful, river, national park, national, deck
Topic #4: host, home, bb, make, wonderful, feel, delicious, make feel, house, amazing
Topic #5: bed, bathroom, floor, didst, tell, night, door, tv, shower, window


In [159]:
corex = ct.Corex(n_hidden=6, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 6, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, desk, staff, airport, restaurant, lobby
Topic #2: fishing, fish, guide, salmon, trip, catch, bear, boat, halibut, day
Topic #3: make, home, experience, wonderful, feel, thank, amazing, family, make feel, alaska
Topic #4: breakfast, view, bb, beautiful, mountain, delicious, enjoy, fruit, deck, egg
Topic #5: bathroom, bed, small, shower, tv, microwave, window, floor, fridge, coffee
Topic #6: tell, say, check, arrive, didst, review, bad, ask, dirty, book


In [160]:
corex = ct.Corex(n_hidden=7, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 7, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, desk, shuttle, staff, lobby, airport
Topic #2: denali, park, national park, national, kitchen, fruit, living, bedroom, egg, denali national
Topic #3: arrive, check, tell, say, book, review, leave, early, late, day
Topic #4: bed, bathroom, shower, microwave, floor, tv, small, window, fridge, door
Topic #5: downtown, walk, distance, shop, restaurant, walk distance, locate, location, free, town
Topic #6: view, beautiful, mountain, enjoy, glacier, deck, bay, homer, wonderful, amazing
Topic #7: fishing, fish, guide, trip, salmon, bear, experience, catch, boat, halibut


In [161]:
corex = ct.Corex(n_hidden=8, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 8, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, desk, staff, lobby, airport, service
Topic #2: kitchen, arrive, night, bedroom, book, review, living, late, early, queen
Topic #3: cabin, denali, park, national park, national, river, talkeetna, denali national, mckinley, cabin clean
Topic #4: view, beautiful, mountain, enjoy, wonderful, amazing, glacier, bay, lake, homer
Topic #5: fishing, fish, guide, trip, salmon, experience, catch, bear, halibut, boat
Topic #6: breakfast, coffee, microwave, fridge, continental, fruit, continental breakfast, egg, bed, cereal
Topic #7: floor, bathroom, window, open, door, tell, shower, smell, bad, didst
Topic #8: downtown, walk, distance, shop, restaurant, walk distance, locate, location, town, street


In [162]:
corex = ct.Corex(n_hidden=9, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 9, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, staff, desk, shuttle, lobby
Topic #2: view, mountain, beautiful, glacier, deck, lake, bay, enjoy, bear, spectacular
Topic #3: food, river, dinner, tour, princess, lunch, meal, land, eat, holland
Topic #4: floor, bathroom, window, shower, smell, door, wall, carpet, open, noise
Topic #5: arrive, check, tell, say, late, book, ask, leave, reservation, early
Topic #6: fishing, fish, guide, trip, salmon, catch, experience, halibut, boat, fishing trip
Topic #7: downtown, walk, distance, shop, walk distance, restaurant, locate, location, train, town
Topic #8: bed, coffee, microwave, kitchen, fridge, bedroom, tv, breakfast, queen, living
Topic #9: park, denali, national, national park, rv, denali national, road, site, rv park, entrance


In [163]:
corex = ct.Corex(n_hidden=10, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 10, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 2), "Words": corex.words, "Other": "min_df=5"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, desk, staff, airport, lobby, free, stay hotel
Topic #2: bathroom, bed, kitchen, microwave, tv, shower, fridge, bedroom, coffee, queen
Topic #3: fishing, fish, guide, trip, experience, salmon, bear, make, catch, home
Topic #4: denali, tour, park, national park, national, princess, dinner, denali national, mckinley, bus
Topic #5: breakfast, bb, delicious, fruit, egg, cereal, house, juice, fresh, sausage
Topic #6: floor, noise, bad, window, dirty, carpet, smell, open, door, wall
Topic #7: arrive, check, tell, say, book, review, late, leave, early, day
Topic #8: downtown, walk, distance, shop, restaurant, walk distance, locate, location, town, close
Topic #9: view, mountain, beautiful, great view, deck, overlook, view room, beautiful view, sit, main
Topic #10: cabin, rv, homer, cabin clean, stay cabin, campground, rv park, beach, creek, pit


In [165]:
corex = ct.Corex(n_hidden=4, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 4, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 3), "Words": corex.words, "Other": "min_df=7, max_df=0.4"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, desk, lobby, restaurant, airport, cruise
Topic #2: bathroom, bed, small, shower, tv, microwave, park, kitchen, night, denali
Topic #3: fishing, fish, guide, trip, make, home, experience, beautiful, salmon, bear
Topic #4: breakfast, bb, continental, continental breakfast, downtown, fruit, buffet, breakfast good, egg, morning


In [166]:
corex = ct.Corex(n_hidden=5, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 5, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 3), "Words": corex.words, "Other": "min_df=7, max_df=0.4"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, floor, desk, check, didst, tell, open, bad, door, bathroom
Topic #2: cabin, host, bb, owner, home, kitchen, house, feel
Topic #3: fishing, fish, guide, trip, view, salmon, bear, boat, beautiful, experience
Topic #4: breakfast, continental, buffet, breakfast good, fruit, continental breakfast, egg, bed breakfast, morning, breakfast morning
Topic #5: walk, downtown, distance, park, restaurant, shop, walk distance, locate, denali, microwave


In [167]:
corex = ct.Corex(n_hidden=6, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 6, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 3), "Words": corex.words, "Other": "min_df=7, max_df=0.4"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, restaurant, airport, downtown, cruise, walk, distance, staff, walk distance
Topic #2: host, home, make, wonderful, bb, trip, feel, experience, make feel, amazing
Topic #3: cabin, denali, fishing, river, view, fish, park, national park, national, guide
Topic #4: breakfast, continental, buffet, breakfast good, fruit, continental breakfast, egg, bed breakfast, morning, breakfast morning
Topic #5: bed, bathroom, floor, microwave, tv, shower, window, small, fridge, sink
Topic #6: arrive, check, tell, say, book, review, desk, ask, leave, didst


In [168]:
corex = ct.Corex(n_hidden=7, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 7, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 3), "Words": corex.words, "Other": "min_df=7, max_df=0.4"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, restaurant, airport, downtown, walk, cruise, distance, staff, walk distance
Topic #2: host, home, make, wonderful, bb, feel, trip, alaska, make feel, experience
Topic #3: cabin, denali, fishing, fish, river, park, guide, national park, national, salmon
Topic #4: breakfast, continental, buffet, breakfast good, fruit, continental breakfast, egg, bed breakfast, breakfast morning, morning
Topic #5: bathroom, bed, kitchen, microwave, shower, tv, fridge, bedroom, small, queen
Topic #6: floor, window, noise, door, smell, open, carpet, wall, dirty, bad
Topic #7: arrive, check, tell, say, desk, book, review, late, reservation, ask


In [169]:
corex = ct.Corex(n_hidden=8, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 8, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 3), "Words": corex.words, "Other": "min_df=7, max_df=0.4"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, airport, desk, cruise, lobby, shuttle service, stay hotel, hotel room, free
Topic #2: make, home, feel, alaska, make feel, feel like, make sure, day, family, time
Topic #3: fishing, fish, guide, salmon, trip, bear, boat, catch, halibut, experience
Topic #4: breakfast, continental, buffet, breakfast good, fruit, continental breakfast, egg, bed breakfast, morning, breakfast morning
Topic #5: bed, bathroom, floor, microwave, tv, window, shower, door, open, didst
Topic #6: host, bb, kitchen, house, owner, homer, private, wonderful, beautiful
Topic #7: cabin, denali, park, view, mountain, national park, national, river, main, denali national
Topic #8: downtown, walk, restaurant, distance, shop, walk distance, location, locate, close, street


In [170]:
corex = ct.Corex(n_hidden=9, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 9, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 3), "Words": corex.words, "Other": "min_df=7, max_df=0.4"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, restaurant, staff, airport, cruise, lobby, desk, free
Topic #2: host, home, make, bb, feel, wonderful, make feel, house, alaska, family
Topic #3: fishing, fish, guide, trip, salmon, bear, boat, catch, halibut, experience
Topic #4: breakfast, continental, buffet, breakfast good, fruit, continental breakfast, egg, bed breakfast, morning, breakfast morning
Topic #5: cabin, denali, view, mountain, park, beautiful, national park, national, river, deck
Topic #6: arrive, check, say, tell, review, book, leave, reservation, late, ask
Topic #7: downtown, walk, distance, walk distance, locate, location, harbour, seward, town, easy
Topic #8: floor, window, noise, smell, carpet, door, open, dirty, wall, air
Topic #9: bathroom, bed, shower, microwave, tv, kitchen, fridge, small, queen, bedroom


In [171]:
corex = ct.Corex(n_hidden=10, seed=10)
corex = corex.fit(df_vectorized, words=vocab, anchors = [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]])

for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 10, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"], ["nature"], ["breakfast"]], "TC": corex.tc, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "N-gram range": (1, 3), "Words": corex.words, "Other": "min_df=7, max_df=0.4"}

corex_df = corex_df.append(corex_details, ignore_index=True)

Topic #1: hotel, shuttle, desk, airport, lobby, stay hotel, service, shuttle service
Topic #2: night, arrive, early, late, day, living, spend, living room, flight, stay night
Topic #3: fishing, fish, guide, trip, salmon, bear, boat, catch, halibut, lake
Topic #4: breakfast, continental, breakfast good, buffet, fruit, continental breakfast, egg, bed breakfast, breakfast morning, morning
Topic #5: tell, floor, didst, say, bad, open, window, door, smell, dirty
Topic #6: host, home, make, wonderful, bb, feel, make feel, amazing, beautiful, delicious
Topic #7: denali, park, tour, national park, national, princess, denali national, denali national park, mckinley, cruise
Topic #8: shop, walk, microwave, wifi, restaurant, rv, fridge, laundry, street, coffee
Topic #9: friendly, downtown, staff, helpful, staff friendly, friendly helpful, distance, walk distance, location, staff friendly helpful
Topic #10: bathroom, bed, kitchen, shower, bedroom, tub, queen, hot, tv, bath


In [180]:
corex_df

Unnamed: 0,Topics,Anchors,TC,Labels,Clusters,Alpha,Mis,N-gram range,Words,Other
0,5,"[[hotel, lodge], [aurora, northern lights]]",22.55491,"[[True, False, True, False, False], [False, Fa...","[0, 0, 4, 1, 2, 3, 1, 0, 4, 1, 0, 1, 3, 0, 4, ...","[[1.0, 1.0, 1.2850321286285925e-100, 1.4588700...","[[1.2628946668467853e-05, 1.8021980888024995e-...","(1, 2)","[aa, aa battery, aa cancel, aa coffee, aa eva,...",No min_df
1,6,"[[hotel, lodge, cabin]]",15.937173,"[[True, False, False, False, True, False], [Fa...","[5, 0, 5, 0, 0, 1, 5, 0, 1, 5, 5, 1, 3, 0, 0, ...","[[9.489863603125979e-09, 1.0, 6.91939798213093...","[[0.0, 0.00028351982125180477, 0.0, 8.14448452...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5
2,10,"[[hotel, lodge, cabin]]",18.50859,"[[True, False, False, False, False, True, Fals...","[8, 6, 6, 0, 6, 7, 6, 0, 1, 6, 8, 3, 6, 7, 7, ...","[[0.0002204576955178804, 8.867403279881514e-21...","[[9.274610673991358e-07, 0.0002438866909152305...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5
3,5,"[[hotel, lodge], [staff], [nature]]",14.33841,"[[True, False, False, True, False], [False, Fa...","[1, 0, 4, 0, 3, 2, 0, 1, 0, 0, 4, 0, 4, 0, 0, ...","[[0.000675428936708015, 1.0, 5.874546987592445...","[[0.0, 0.0005105964685640634, 4.77741167474052...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5
4,4,"[[hotel, lodge], [nature, beautiful], [breakfa...",13.616512,"[[True, False, True, False], [False, False, Fa...","[2, 0, 0, 0, 0, 1, 0, 3, 3, 0, 0, 0, 0, 1, 3, ...","[[2.1681666871403563e-19, 1.0, 1.0, 1.0, 1.0, ...","[[0.0, 0.0004732963146358142, 4.65524706903250...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5
5,5,"[[hotel, lodge], [nature, beautiful], [breakfa...",14.569744,"[[True, False, True, False, False], [False, Fa...","[2, 0, 4, 0, 0, 1, 4, 3, 3, 4, 4, 0, 4, 1, 3, ...","[[4.595070431147928e-08, 1.0, 8.83837432798248...","[[0.0, 0.0002733701384442919, 1.84170668594248...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5
6,5,"[[hotel, lodge], [aurora, northern lights]]",14.935271,"[[True, False, False, False, True], [False, Fa...","[1, 4, 4, 4, 4, 1, 3, 4, 2, 1, 4, 1, 2, 3, 3, ...","[[0.00011787584955745022, 6.924751116524441e-1...","[[0.0, 0.00019459935486793953, 0.0, 4.02261798...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5
7,6,"[[hotel, lodge], [aurora, northern lights]]",15.518759,"[[True, False, False, False, True, False], [Tr...","[1, 0, 0, 2, 0, 1, 0, 0, 4, 5, 4, 5, 5, 2, 0, ...","[[4.297547612002358e-27, 1.0, 1.0, 7.320295574...","[[0.0, 0.00027615930041775214, 7.5739327388305...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5
8,7,"[[hotel, lodge], [aurora, northern lights]]",16.81008,"[[True, False, False, True, True, False, False...","[2, 0, 0, 0, 3, 6, 0, 0, 1, 6, 3, 5, 2, 0, 0, ...","[[4.163679349699422e-30, 1.0, 1.0, 1.0, 3.3493...","[[0.0, 0.00031617683347760214, 6.3960752687036...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5
9,8,"[[hotel, lodge], [aurora, northern lights]]",17.540303,"[[True, False, False, False, False, True, True...","[4, 0, 0, 0, 6, 4, 0, 0, 2, 6, 6, 3, 1, 0, 0, ...","[[6.8489520677906e-08, 1.0, 1.0, 1.0, 1.133775...","[[0.0, 0.00037426124572499756, 7.1177424103968...","(1, 2)","[aa, aaa, aaa book, aaa discount, aaa rate, aa...",min_df=5


In [190]:
model_list = []
tcs_list = []
topics_list = []
tcs_list = []
p_y_list = []
tc_history_list = []
top_docs_list = []
for i in range(corex_df.shape[0]):
          
    topics = corex_df.iloc[i]["Topics"]
    anchors = corex_df.iloc[i]["Anchors"]
    ngrams = corex_df.iloc[i]["N-gram range"]
    if corex_df.iloc[i]["Other"] == "min_df=5":
        min_df = 5
        max_df = 0.5
    elif corex_df.iloc[i]["Other"] == "min_df=7, max_df=0.04":
        min_df = 7
        max_df = 0.4
    else:
        min_df = 0
        max_df = 0.5

    corpus = main_df['Cleaned review v2']
    tfidf = TfidfVectorizer(stop_words="english", max_df=max_df, min_df=min_df, max_features=None, ngram_range=ngrams, norm=None, binary=True, sublinear_tf=False)
    df_vectorized = tfidf.fit_transform(corpus)
    vocab = tfidf.get_feature_names()

    corex = ct.Corex(n_hidden=topics, seed=10)
    corex = corex.fit(df_vectorized, docs=corpus.values, words=vocab, anchors=anchors)
    model_list.append(corex)
    tcs_list.append(corex.tcs)
    p_y_list.append(corex.p_y_given_x)
    tc_history_list.append(corex.tc_history)
    topics = []
    top_docs_list.append(corex.get_top_docs())
    for i, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
        topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
        topics.append("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))
    topics_list.append(topics)

# New start

In [3]:
import pandas as pd
import numpy as np
import pickle
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
corpus = main_df['Cleaned review v2']
tfidf = TfidfVectorizer(stop_words="english", max_df=.5, min_df=5, ngram_range=(1, 2), binary=True)
df_vectorized = tfidf.fit_transform(corpus)
vocab = tfidf.get_feature_names()

In [4]:
new_corex_df = pd.DataFrame(columns=["Topics", "Anchors", "Topic words", "TC", "TCs", "Labels", "Clusters", "Alpha", "Mis", "P(y)|x", "Words", "N-gram range", "min_df", "max_df"])
for i in tqdm(range(3, 21)):
    corex = ct.Corex(n_hidden=i, seed=10)
    corex = corex.fit(df_vectorized, anchors=[["hotel", "lodge"], ["aurora", "northern lights"]], docs=corpus.values, words=vocab)
    topic_words = []
    for x, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
        topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
        topic_words.append("Topic #{}: {}".format(x+1, ", ".join(topic_ngrams)))

    corex_details = {"Topics": i, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "Topic words": topic_words, "TC": corex.tc, "TCs": corex.tcs, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "P(y)|x": corex.p_y_given_x, "Words": corex.words, "N-gram range": (1, 2), "min_df": 5, "max_df": 0.5}

    new_corex_df = new_corex_df.append(corex_details, ignore_index=True)

    with open("corex_df.pickle", "wb") as to_write:
        pickle.dump(new_corex_df, to_write)
    print(f'Finished with i: {i}')

  6%|▌         | 1/18 [04:27<1:15:46, 267.45s/it]Finished with i: 3
 11%|█         | 2/18 [09:32<1:14:20, 278.77s/it]Finished with i: 4
 17%|█▋        | 3/18 [15:12<1:14:14, 296.96s/it]Finished with i: 5
 22%|██▏       | 4/18 [21:24<1:14:33, 319.56s/it]Finished with i: 6
 28%|██▊       | 5/18 [28:13<1:15:02, 346.33s/it]Finished with i: 7
 33%|███▎      | 6/18 [34:49<1:12:15, 361.29s/it]Finished with i: 8
 39%|███▉      | 7/18 [39:33<1:01:59, 338.11s/it]Finished with i: 9
 44%|████▍     | 8/18 [45:04<55:59, 335.94s/it]  Finished with i: 10
 50%|█████     | 9/18 [47:48<42:41, 284.58s/it]Finished with i: 11
 56%|█████▌    | 10/18 [53:05<39:12, 294.04s/it]Finished with i: 12
 61%|██████    | 11/18 [59:46<38:03, 326.26s/it]Finished with i: 13
 67%|██████▋   | 12/18 [1:06:33<35:03, 350.54s/it]Finished with i: 14
 72%|███████▏  | 13/18 [1:13:57<31:32, 378.56s/it]Finished with i: 15
 78%|███████▊  | 14/18 [1:21:36<26:51, 402.78s/it]Finished with i: 16
 83%|████████▎ | 15/18 [1:29:22<21:04, 421

In [28]:
corpus = list(main_df['Cleaned review v2'])
tfidf = TfidfVectorizer(stop_words="english", max_df=.5, min_df=5, ngram_range=(1, 2), binary=True)
df_vectorized = tfidf.fit_transform(corpus)
vocab = tfidf.get_feature_names()


test = pd.DataFrame(columns=["Topics", "Anchors", "Topic words", "TC", "TCs", "Labels", "Clusters", "Alpha", "Mis", "P(y)|x", "Words", "N-gram range", "min_df", "max_df"])

corex = ct.Corex(n_hidden=9, seed=10)
corex = corex.fit(df_vectorized, anchors=[["hotel", "lodge"], ["aurora", "northern lights"]], words=vocab)
topic_words = []
for x, topic_ngrams in enumerate(corex.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    topic_words.append("Topic #{}: {}".format(x+1, ", ".join(topic_ngrams)))

corex_details = {"Topics": 9, "Anchors": [["hotel", "lodge"], ["aurora", "northern lights"]], "Topic words": topic_words, "TC": corex.tc, "TCs": corex.tcs, "Labels": corex.labels, "Clusters": corex.clusters, "Alpha": corex.alpha, "Mis": corex.mis, "P(y)|x": corex.p_y_given_x, "Words": corex.words, "N-gram range": (1, 2), "min_df": 5, "max_df": 0.5}

test = test.append(corex_details, ignore_index=True)

In [8]:
# topic_words

['Topic #1: credit card, credit, cancel, card, reservation, email, confirmation, cancellation, cancel reservation, refund',
 'Topic #2: fishing, fish, guide, catch, halibut, salmon, silver, captain, fishing trip, boat',
 'Topic #3: treat grand, unique locally, unpaved trail, pave unpaved, path pave, grand lobby, crowd talkeetna, deck include, town unique, chair glass',
 'Topic #4: meal photogenic, photogenic tasty, sparse hairdryers, unit similar, invite multiple, spa quality, decor sparse, configuration second, hairdryers coffee, toiletry spa',
 'Topic #5: caravan sonnet, blog caravan, sonnet',
 'Topic #6: egg, cereal, yogurt, fruit, juice, sausage, toast, bacon, biscuit, gravy',
 'Topic #7: ',
 'Topic #8: host, cabin, bb, wonderful, home, owner, beautiful',
 'Topic #9: rv, campground, site, rv park, hookup, gravel, park, sewer, dump station, camper']

In [None]:
topic_words

# Getting nouns and adjs as features

In [4]:
!pip install spacy[lookups]
import spacy
from collections import Counter
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [35]:
def get_nouns(df):

    reviews = list(df["Cleaned review v2"])
    review_nouns_list = []

    for review in tqdm(reviews):
        review = nlp(review)
        review_nouns = []
        for word in tqdm(review):
            if word.lemma_ != "ovid" and (word.pos_ == "NOUN" or word.pos_ == "PROPN"):
                review_nouns.append(word.lemma_)
        nouns_only = " ".join(review_nouns)
        print(nouns_only)
        review_nouns_list.append(nouns_only)

    df["Review nouns"] = review_nouns_list
    
    return df

In [None]:
def get_adjs(review):

    review = nlp(review)
    review_adjs = []
    for token in review:
        if token.lemma_ != "ovid" and (token.pos_ == "NOUN" or token.pos_ == "PROPN"):
            review_nouns.append(token.lemma_)

    return " ".join(review_adjs)

In [13]:
def get_nouns(review):

    review = nlp(review)
    review_nouns = []
    for token in review:
        if token.lemma_ != "ovid" and (token.pos_ == "NOUN" or token.pos_ == "PROPN"):
            review_nouns.append(token.lemma_)

    return " ".join(review_nouns)

In [230]:
def get_adjs(review):

    review = nlp(review)
    review_adjs = []
    for token in review:
        if token.pos_ == "ADJ":
            review_adjs.append(token.lemma_)

    return " ".join(review_adjs)

In [234]:
review_adjs_list = []
for i in range(main_df.shape[0]):
    # checker_list.append(i)
    print(i)
    review_adjs_list.append(get_adjs(main_df.iloc[i]["Cleaned review v2"]))

9
91180
91181
91182
91183
91184
91185
91186
91187
91188
91189
91190
91191
91192
91193
91194
91195
91196
91197
91198
91199
91200
91201
91202
91203
91204
91205
91206
91207
91208
91209
91210
91211
91212
91213
91214
91215
91216
91217
91218
91219
91220
91221
91222
91223
91224
91225
91226
91227
91228
91229
91230
91231
91232
91233
91234
91235
91236
91237
91238
91239
91240
91241
91242
91243
91244
91245
91246
91247
91248
91249
91250
91251
91252
91253
91254
91255
91256
91257
91258
91259
91260
91261
91262
91263
91264
91265
91266
91267
91268
91269
91270
91271
91272
91273
91274
91275
91276
91277
91278
91279
91280
91281
91282
91283
91284
91285
91286
91287
91288
91289
91290
91291
91292
91293
91294
91295
91296
91297
91298
91299
91300
91301
91302
91303
91304
91305
91306
91307
91308
91309
91310
91311
91312
91313
91314
91315
91316
91317
91318
91319
91320
91321
91322
91323
91324
91325
91326
91327
91328
91329
91330
91331
91332
91333
91334
91335
91336
91337
91338
91339
91340
91341
91342
91343
91344
91345
91

In [236]:
len(review_adjs_list)

94513

In [238]:
full_df["Review adjs"] = review_adjs_list

In [218]:
main_df

Unnamed: 0,Property name,Property address,"Lat, long",Reviewer,Review date,Date of stay,Review title,Full review,Review link,Spell-checked review,Cleaned review,Month of stay,Year of stay,Vader +,Vader -,Cleaned review v2,Review nouns
0,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",John M,October 2020,October 2020,Awesome hotel doing all the right things,I stay at lots of hotels (including during Cov...,https://www.tripadvisor.com/ShowUserReviews-g6...,I stay at lots of hotels including during ovid...,stay lot hotel include ovid harbour right grea...,10.0,2020.0,0.250,0.000,stay lot hotel include ovid harbour right grea...,lot hotel harbour hotel price excellent locati...
1,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",Kenai Drift Anglers,October 2020,October 2020,Great Local Get Away,My girlfriend just wanted to get away from tow...,https://www.tripadvisor.com/ShowUserReviews-g6...,My girlfriend just wanted to get away from tow...,girlfriend want get away town enjoy great even...,10.0,2020.0,0.480,0.049,girlfriend want away town enjoy great evening ...,girlfriend town evening restriction hotel brea...
2,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",TravelGirl2006,October 2020,October 2020,We love this hotel,Have stayed here twice on our last two trips t...,https://www.tripadvisor.com/ShowUserReviews-g6...,Have stayed here twice on our last two trips t...,stay twice last two trip seward view marina fa...,10.0,2020.0,0.480,0.000,stay twice trip seward view marina fantastic r...,trip seward view marina room spotless bathroom...
3,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",maduro44,October 2020,September 2020,Bed,Everything was really nice almost great if it ...,https://www.tripadvisor.com/ShowUserReviews-g6...,Everything was really nice almost great if it ...,everything really nice almost great haunt bed ...,9.0,2020.0,0.440,0.158,really nice great haunt bed bed horrible mattr...,haunt bed bed horrible mattress help year bed
4,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",rtfields2020,October 2020,September 2020,Great place,"Terrific place to stay. Comfortable and clean,...",https://www.tripadvisor.com/ShowUserReviews-g6...,Terrific place to stay. Comfortable and clean ...,terrific place clean friendly staff excellent ...,9.0,2020.0,0.401,0.000,terrific place clean friendly staff excellent ...,place staff breakfast sit balcony mountain har...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111306,Anchorage Suite Lodge,"441 E 15th Ave, Anchorage, AK 99501-5211","61.207958,-149.875854",BigBubbaSpokane,August 2007,August 2007,Horror in Anchorage,"This is the most disgusting, filthy, unsanitar...",https://www.tripadvisor.com/ShowUserReviews-g6...,This is the most disgusting filthy unsanitary ...,disgust filthy unsanitary rundown hole ever en...,8.0,2007.0,0.047,0.391,disgust filthy unsanitary rundown hole encount...,disgust filthy rundown hole encounter daughter...
111307,Anchorage Suite Lodge,"441 E 15th Ave, Anchorage, AK 99501-5211","61.207958,-149.875854",Angeles,July 2007,July 2007,Do not stay here,This hotel was booked for us as part of a grou...,https://www.tripadvisor.com/ShowUserReviews-g6...,This hotel was booked for us as part of a grou...,hotel book us part group three couple suppose ...,7.0,2007.0,0.000,0.138,hotel book group couple suppose stay night cou...,hotel book group couple stay night couple plac...
111308,Anchorage Suite Lodge,"441 E 15th Ave, Anchorage, AK 99501-5211","61.207958,-149.875854",754Lynn,July 2005,July 2005,Misleading-- Don't fall for this dump of a Hot...,I booked a hotel on the road using a well know...,https://www.tripadvisor.com/ShowUserReviews-g6...,I booked a hotel on the road using a well know...,book hotel road use well know hotel website bo...,7.0,2005.0,0.211,0.024,book hotel road use know hotel website book ho...,book hotel road use know hotel website book ho...
111309,Anchorage Suite Lodge,"441 E 15th Ave, Anchorage, AK 99501-5211","61.207958,-149.875854",Trek36846,June 2005,May 2005,Dump,This hotel was rated 3 star. What a joke! When...,https://www.tripadvisor.com/ShowUserReviews-g6...,This hotel was rated 3 star. What a joke! When...,hotel rate star joke try contact hotel phone d...,5.0,2005.0,0.074,0.118,hotel rate star joke try contact hotel phone d...,hotel rate star joke contact hotel phone day a...


In [229]:
len(full_df["Full review"].unique())

93439

In [241]:
with open("newest_main_df.pickle", "wb") as to_write:
     pickle.dump(full_df, to_write)

In [240]:
full_df

Unnamed: 0,Property name,Property address,"Lat, long",Reviewer,Review date,Date of stay,Review title,Full review,Review link,Spell-checked review,Cleaned review,Month of stay,Year of stay,Vader +,Vader -,Cleaned review v2,Review length,Review nouns,Review adjs
0,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",John M,October 2020,October 2020,Awesome hotel doing all the right things,I stay at lots of hotels (including during Cov...,https://www.tripadvisor.com/ShowUserReviews-g6...,I stay at lots of hotels including during ovid...,stay lot hotel include ovid harbour right grea...,10.0,2020.0,0.250,0.000,stay lot hotel include ovid harbour right grea...,97,lot hotel harbour hotel price excellent locati...,right great fair friendly clean reasonable
1,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",Kenai Drift Anglers,October 2020,October 2020,Great Local Get Away,My girlfriend just wanted to get away from tow...,https://www.tripadvisor.com/ShowUserReviews-g6...,My girlfriend just wanted to get away from tow...,girlfriend want get away town enjoy great even...,10.0,2020.0,0.480,0.049,girlfriend want away town enjoy great evening ...,52,girlfriend town evening restriction hotel brea...,great covid great
2,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",TravelGirl2006,October 2020,October 2020,We love this hotel,Have stayed here twice on our last two trips t...,https://www.tripadvisor.com/ShowUserReviews-g6...,Have stayed here twice on our last two trips t...,stay twice last two trip seward view marina fa...,10.0,2020.0,0.480,0.000,stay twice trip seward view marina fantastic r...,100,trip seward view marina room spotless bathroom...,fantastic large good adapt covid friendly grea...
3,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",maduro44,October 2020,September 2020,Bed,Everything was really nice almost great if it ...,https://www.tripadvisor.com/ShowUserReviews-g6...,Everything was really nice almost great if it ...,everything really nice almost great haunt bed ...,9.0,2020.0,0.440,0.158,really nice great haunt bed bed horrible mattr...,54,haunt bed bed horrible mattress help year bed,nice great
4,Harbor 360 Hotel,"1412 4th Ave., Seward, AK 99664","60.119797,-149.439346",rtfields2020,October 2020,September 2020,Great place,"Terrific place to stay. Comfortable and clean,...",https://www.tripadvisor.com/ShowUserReviews-g6...,Terrific place to stay. Comfortable and clean ...,terrific place clean friendly staff excellent ...,9.0,2020.0,0.401,0.000,terrific place clean friendly staff excellent ...,45,place staff breakfast sit balcony mountain har...,terrific clean friendly excellent incredible r...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111306,Anchorage Suite Lodge,"441 E 15th Ave, Anchorage, AK 99501-5211","61.207958,-149.875854",BigBubbaSpokane,August 2007,August 2007,Horror in Anchorage,"This is the most disgusting, filthy, unsanitar...",https://www.tripadvisor.com/ShowUserReviews-g6...,This is the most disgusting filthy unsanitary ...,disgust filthy unsanitary rundown hole ever en...,8.0,2007.0,0.047,0.391,disgust filthy unsanitary rundown hole encount...,77,disgust filthy rundown hole encounter daughter...,unsanitary late gross terrible bad
111307,Anchorage Suite Lodge,"441 E 15th Ave, Anchorage, AK 99501-5211","61.207958,-149.875854",Angeles,July 2007,July 2007,Do not stay here,This hotel was booked for us as part of a grou...,https://www.tripadvisor.com/ShowUserReviews-g6...,This hotel was booked for us as part of a grou...,hotel book us part group three couple suppose ...,7.0,2007.0,0.000,0.138,hotel book group couple suppose stay night cou...,153,hotel book group couple stay night couple plac...,awful second denali old large denali
111308,Anchorage Suite Lodge,"441 E 15th Ave, Anchorage, AK 99501-5211","61.207958,-149.875854",754Lynn,July 2005,July 2005,Misleading-- Don't fall for this dump of a Hot...,I booked a hotel on the road using a well know...,https://www.tripadvisor.com/ShowUserReviews-g6...,I booked a hotel on the road using a well know...,book hotel road use well know hotel website bo...,7.0,2005.0,0.211,0.024,book hotel road use know hotel website book ho...,149,book hotel road use know hotel website book ho...,fancy decent clean affordable continental loca...
111309,Anchorage Suite Lodge,"441 E 15th Ave, Anchorage, AK 99501-5211","61.207958,-149.875854",Trek36846,June 2005,May 2005,Dump,This hotel was rated 3 star. What a joke! When...,https://www.tripadvisor.com/ShowUserReviews-g6...,This hotel was rated 3 star. What a joke! When...,hotel rate star joke try contact hotel phone d...,5.0,2005.0,0.074,0.118,hotel rate star joke try contact hotel phone d...,64,hotel rate star joke contact hotel phone day a...,light bad
