# Set-up
I built my project around data scraped from Tripadvisor, specifically from the listings of (approximately) every Alaska property on the site. Below, I've pieced together my workflow as best I can, hopefully in a way that anyone reading it can reasonably follow.

# Imports

In [None]:
# Standard library
import datetime
import itertools
import os
import pickle
import re
import string
import time
from collections import Counter

# Third-party
import nltk
import numpy as np
import pandas as pd
import pkg_resources
import requests
import spacy
import spacy.cli
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from nltk import FreqDist, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from sklearn import datasets
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from symspellpy import SymSpell, Verbosity
from tqdm import tqdm

# Project-local helper module `Functions`, referenced as `akf` throughout.
# (The original `from Functions as akf` is not valid import syntax.)
import Functions as akf

# Resource downloads needed by the NLP steps below.
nltk.download('stopwords')
# Programmatic equivalent of the shell command
# `python -m spacy download en_core_web_sm`, which cannot be written as a
# bare line of Python.
spacy.cli.download("en_core_web_sm")


# Build a header dict carrying a randomly chosen User-Agent string
# (presumably passed along with the scraping requests -- confirm against usage).
ua = UserAgent()
user_agent = {'User-agent': ua.random}
# Machine-specific location of the ChromeDriver binary; it is also published
# through the "webdriver.chrome.driver" environment variable.
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# Building main dataset
I first collected the URLs of all the property listings and then passed that list of scraped URLs to the main review-scraping function to collect my data. While navigating Selenium hiccups, I skipped the following indices in my URL list: 138, 165, 188, 228, 324, 348, 349, 350, 351, 352, 431, 453, and 488.

In [None]:
# Getting URLs
# NOTE(review): the return value is discarded here, and the next cell reads
# "hotel_urls.pickle" -- presumably this helper writes that file; confirm in
# the Functions module.
akf.get_reviews_hotel_urls()

In [None]:
# Reload the scraped hotel URLs from disk.
# NOTE: pickle.load should only be used on files produced by this project.
with open("hotel_urls.pickle", "rb") as url_pickle:
    url_list = pickle.load(url_pickle)

In [None]:
# Getting reviews
# The scraping helpers are only in scope through the `akf` module alias, so
# the call must be qualified; a bare `get_reviews` is an undefined name here.
# NOTE(review): the return value is discarded and the next cell reads
# "alaska_hotels_df.pickle" -- presumably the helper writes that file; confirm.
akf.get_reviews(url_list)

In [None]:
# Reload the main review dataset from disk.
# NOTE: pickle.load should only be used on files produced by this project.
with open("alaska_hotels_df.pickle", "rb") as df_pickle:
    main_df = pickle.load(df_pickle)

# EDA
The cells below illustrate a few of the many steps I took when first examining the scraped data as a whole.

In [None]:
# Keep only the rows whose "Full review" value is present (not NaN/None)
has_full_review = main_df['Full review'].notna()
main_df = main_df.loc[has_full_review]

In [None]:
# Quick look at corpus size: join every review into one string, split it on
# whitespace, and report the total and distinct token counts.
X = main_df["Full review"]
all_strings = " ".join(X)
splits = all_strings.split()
total_strings = len(splits)
unique_strings = len(set(splits))
print(f'Number of strings: {total_strings}')
print(f'Number of unique strings: {unique_strings}')

In [None]:
# ...and at the top-appearing words (to be repeated many times after further pre-processing)...
# freq_splits maps each whitespace-delimited string to its occurrence count.
freq_splits = FreqDist(splits)
top_25 = freq_splits.most_common(25)
print(f'25 most common strings: \n{top_25}')

In [None]:
# ...and at the top-appearing words < 4 characters (to be repeated many times after further pre-processing)...
# Pair each distinct short string with its corpus frequency, most frequent first.
short = sorted(
    ((s, freq_splits[s]) for s in set(s for s in splits if len(s) < 4)),
    key=lambda pair: pair[1],
    reverse=True,
)
print(f'25 most common short strings:{short[:25]}')

In [None]:
# ...and at the top-appearing words > 15 characters (to be repeated many times after further pre-processing)...
# Pair each distinct long string with its corpus frequency, most frequent first.
long = {s for s in splits if len(s) > 15}
long = [(s, freq_splits[s]) for s in long]
long.sort(key=lambda x: x[1], reverse=True)
# The label previously read "short strings", copied from the cell above.
print(f'25 most common long strings:{long[:25]}')

In [None]:
# Identifying all digits in the corpus
# The pattern r"\d" matches a single digit character.
# NOTE(review): how akf.summarize applies the pattern and what it reports is
# defined in the Functions module -- confirm there.
akf.summarize(r"\d", splits, freq_splits)

In [None]:
# Identifying all words with - in the corpus
# The pattern matches word characters joined by one or more hyphens
# (e.g. "well-kept").
# NOTE(review): how akf.summarize applies the pattern and what it reports is
# defined in the Functions module -- confirm there.
akf.summarize(r"\w+-+\w+", splits, freq_splits)

In [None]:
# Identifying all words with extra intentional letters
# Keep every string that akf.find_outlaw flags (truthy result), report their
# share of the corpus, then list the distinct ones by descending frequency.
outlaws = list(filter(akf.find_outlaw, splits))
outlaw_share = len(outlaws) / len(splits)
print("{} strings which is {:.2%} of total".format(len(outlaws), outlaw_share))
outlaw_freq = sorted(
    ((s, freq_splits[s]) for s in set(outlaws)),
    key=lambda pair: pair[1],
    reverse=True,
)
print(outlaw_freq)

# Cleaning/pre-processing
Below are the primary means by which I cleaned up my corpus ahead of topic modeling.

In [None]:
# Spell-checking and -replacing
# Operate on main_df, the frame loaded and filtered above; the previously
# referenced `all_hotels_df` is not defined anywhere in this workflow.
# NOTE(review): `pickling=True` presumably makes the helper save its result
# to disk -- confirm in the Functions module.
main_df = akf.spell_checker(main_df, pickling=True)

In [None]:
# Cleaning (using Regex) and lemmatizing review
# NOTE(review): later cells read a "Cleaned review" column, so this helper
# presumably adds it to the returned frame -- confirm in the Functions module.
main_df = akf.review_cleaner(main_df)

In [None]:
# Second cleaning pass: run akf.clean_again over each "Cleaned review" value
main_df["Cleaned review v2"] = main_df["Cleaned review"].apply(akf.clean_again)

In [None]:
# Third cleaning pass: run akf.third_clean over each "Cleaned review v2" value
main_df["Cleaned review v3"] = main_df["Cleaned review v2"].apply(akf.third_clean)

In [None]:
# Cleaning city names from reviews
# Take the second ", "-separated field of each unique property address as its
# city. Entries that are not strings (e.g. NaN) or that have no ", "
# separator are skipped instead of aborting the loop.
city_names = []
for address in tqdm(list(main_df["Property address"].unique())):
    try:
        city_names.append(address.split(', ')[1])
    except (AttributeError, IndexError):
        continue
# Keep the cities that occur in more than 15 unique addresses. Counter tallies
# them in one pass instead of calling list.count() once per element.
city_counts = Counter(city_names)
cities = [city.lower() for city, count in city_counts.items() if count > 15]
# "king salmon" is deliberately excluded from the city list; guard the removal
# so the cell does not fail when that city did not pass the threshold.
if "king salmon" in cities:
    cities.remove("king salmon")
# NOTE(review): `cities` is not passed to akf.city_clean -- confirm how the
# helper obtains the list of city names it strips.
main_df["Clean review no cities"] = main_df["Cleaned review v3"].apply(akf.city_clean)

In [None]:
# For adding a column of reviews as only nouns
# Run akf.get_nouns over every city-stripped review, in row order.
review_nouns_list = [
    akf.get_nouns(review) for review in main_df["Clean review no cities"]
]
# Assign to main_df, the frame the reviews were read from; the previously
# referenced `full_df` is not defined anywhere in this workflow.
main_df["Review nouns"] = review_nouns_list

In [None]:
# Adding a column of reviews as only adjectives
# Run akf.get_adjs over every city-stripped review, in row order.
review_adjs_list = [
    akf.get_adjs(review) for review in main_df["Clean review no cities"]
]
# Assign to main_df, the frame the reviews were read from; the previously
# referenced `full_df` is not defined anywhere in this workflow.
main_df["Review adjs"] = review_adjs_list

In [None]:
# Adding a "Review length" column by running akf.review_length over each full review
main_df["Review length"] = main_df["Full review"].apply(akf.review_length)

# Adding metadata, etc., to main_df
At a number of points in my process I added metadata to and cleaned metadata in my main dataset. Among them:

In [None]:
# Adding numeric month and year columns derived from "Date of stay"
# Parse the dates once and read both components from the same index.
stay_dates = pd.DatetimeIndex(main_df["Date of stay"])
main_df["Month of stay"] = stay_dates.month
main_df["Year of stay"] = stay_dates.year

In [None]:
# Splitting "Lat, long" and adding "Latitude" and "Longitude" columns with float values
# .copy() gives for_mapping_df its own data, so the column assignments below
# do not write into a filtered slice of main_df (which triggers pandas'
# SettingWithCopyWarning and may not update the intended object).
for_mapping_df = main_df[main_df["Lat, long"].notna()].copy()
# Each value is a "lat,long" string: split it once and convert both halves.
coord_pairs = [value.split(",") for value in for_mapping_df["Lat, long"]]
lats = [float(pair[0]) for pair in coord_pairs]
longs = [float(pair[1]) for pair in coord_pairs]
for_mapping_df["Latitude"] = lats
for_mapping_df["Longitude"] = longs

# Pickling

In [None]:
# Persist the fully processed dataset, overwriting the pickle loaded earlier
with open("alaska_hotels_df.pickle", "wb") as df_pickle:
    pickle.dump(main_df, df_pickle)