In [1]:
import numpy as np
import pandas as pd

# pd.set_option('max_columns', None)
# pd.set_option('max_rows', None)

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import en_core_web_lg
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

from textblob import TextBlob

from datetime import datetime

In [3]:
# Load the cleaned hotel listings from a CSV file in the working directory.
data = pd.read_csv('all_hotels_cleaned.csv')

In [4]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,rewards,brand,name,rating,description,street,city,state,postal,url,class_id,category
0,0,Hilton,Hampton Inn,Hampton Inn Alexander City,4.0,"We’re right off Highway 280, 25 minutes away f...",1551 Elkahatchee Road,Alexander City,AL,35010,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale
1,1,Hilton,Hampton Inn,Hampton Inn Wetumpka,5.0,"We’re on the banks of the Coosa River, a short...",350 South Main Street,Wetumpka,AL,36092,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale
2,2,Hilton,Hampton Inn,Hampton Inn Auburn,4.0,"We're off I-85, under 10 minutes from Chewacla...",2430 S. College St.,Auburn,AL,36832,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale
3,3,Hilton,Tru,Tru by Hilton Auburn,4.0,"Make this Tru by Hilton your own. We’ll help, ...",2411 W Pace Blvd,Auburn,AL,36830,https://tru3.hilton.com/en/hotels/alabama/tru-...,5,Budget
4,4,Hilton,Hilton Garden Inn,Hilton Garden Inn Auburn/Opelika,4.0,"Our hotel is just off I-85, minutes away from ...",2555 Hilton Garden Drive,Auburn,AL,36830,https://hiltongardeninn3.hilton.com/en/hotels/...,3,Full-Service Entry


In [5]:
# Build one free-text string per hotel by concatenating its descriptive
# columns; this is the text that is parsed with spaCy further down.
columns = ['rewards', 'brand', 'description', 'city', 'category']

# Missing values become empty strings and everything is cast to str, so a
# single NaN / non-string cell no longer aborts the build with a TypeError.
text_parts = data[columns].fillna('').astype(str)

# Every value is followed by one space (the last one included), which keeps
# the same text layout as appending `value + ' '` column by column.
bagofwords = [
    ' '.join(values) + ' '
    for values in text_parts.itertuples(index=False, name=None)
]

data['Bag_of_words'] = bagofwords

In [6]:
# Load the en_core_web_lg spaCy pipeline; `nlp` is used below to parse each
# hotel's bag-of-words text.
nlp = en_core_web_lg.load()

In [7]:
# Check that the new 'Bag_of_words' column was added (first five rows).
data.head(5)

Unnamed: 0.1,Unnamed: 0,rewards,brand,name,rating,description,street,city,state,postal,url,class_id,category,Bag_of_words
0,0,Hilton,Hampton Inn,Hampton Inn Alexander City,4.0,"We’re right off Highway 280, 25 minutes away f...",1551 Elkahatchee Road,Alexander City,AL,35010,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,Hilton Hampton Inn We’re right off Highway 280...
1,1,Hilton,Hampton Inn,Hampton Inn Wetumpka,5.0,"We’re on the banks of the Coosa River, a short...",350 South Main Street,Wetumpka,AL,36092,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,Hilton Hampton Inn We’re on the banks of the C...
2,2,Hilton,Hampton Inn,Hampton Inn Auburn,4.0,"We're off I-85, under 10 minutes from Chewacla...",2430 S. College St.,Auburn,AL,36832,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,"Hilton Hampton Inn We're off I-85, under 10 mi..."
3,3,Hilton,Tru,Tru by Hilton Auburn,4.0,"Make this Tru by Hilton your own. We’ll help, ...",2411 W Pace Blvd,Auburn,AL,36830,https://tru3.hilton.com/en/hotels/alabama/tru-...,5,Budget,Hilton Tru Make this Tru by Hilton your own. W...
4,4,Hilton,Hilton Garden Inn,Hilton Garden Inn Auburn/Opelika,4.0,"Our hotel is just off I-85, minutes away from ...",2555 Hilton Garden Drive,Auburn,AL,36830,https://hiltongardeninn3.hilton.com/en/hotels/...,3,Full-Service Entry,Hilton Hilton Garden Inn Our hotel is just off...


In [8]:
# Creating spacydocs for each bag of words for each hotel.
# nlp.pipe feeds the texts through the pipeline in batches instead of making
# one nlp() call per row, and yields one Doc per input text in input order.
spacydocs = list(nlp.pipe(data['Bag_of_words']))

data['spacydocs'] = spacydocs

In [9]:
# Check that the parsed 'spacydocs' column was added.
data.head()

Unnamed: 0.1,Unnamed: 0,rewards,brand,name,rating,description,street,city,state,postal,url,class_id,category,Bag_of_words,spacydocs
0,0,Hilton,Hampton Inn,Hampton Inn Alexander City,4.0,"We’re right off Highway 280, 25 minutes away f...",1551 Elkahatchee Road,Alexander City,AL,35010,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,Hilton Hampton Inn We’re right off Highway 280...,"(Hilton, Hampton, Inn, We, ’re, right, off, Hi..."
1,1,Hilton,Hampton Inn,Hampton Inn Wetumpka,5.0,"We’re on the banks of the Coosa River, a short...",350 South Main Street,Wetumpka,AL,36092,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,Hilton Hampton Inn We’re on the banks of the C...,"(Hilton, Hampton, Inn, We, ’re, on, the, banks..."
2,2,Hilton,Hampton Inn,Hampton Inn Auburn,4.0,"We're off I-85, under 10 minutes from Chewacla...",2430 S. College St.,Auburn,AL,36832,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,"Hilton Hampton Inn We're off I-85, under 10 mi...","(Hilton, Hampton, Inn, We, 're, off, I-85, ,, ..."
3,3,Hilton,Tru,Tru by Hilton Auburn,4.0,"Make this Tru by Hilton your own. We’ll help, ...",2411 W Pace Blvd,Auburn,AL,36830,https://tru3.hilton.com/en/hotels/alabama/tru-...,5,Budget,Hilton Tru Make this Tru by Hilton your own. W...,"(Hilton, Tru, Make, this, Tru, by, Hilton, you..."
4,4,Hilton,Hilton Garden Inn,Hilton Garden Inn Auburn/Opelika,4.0,"Our hotel is just off I-85, minutes away from ...",2555 Hilton Garden Drive,Auburn,AL,36830,https://hiltongardeninn3.hilton.com/en/hotels/...,3,Full-Service Entry,Hilton Hilton Garden Inn Our hotel is just off...,"(Hilton, Hilton, Garden, Inn, Our, hotel, is, ..."


In [10]:
# Pull the document-level vector out of every parsed spaCy Doc and store it
# next to the hotel it was computed for.
vectorlist = [parsed.vector for parsed in data['spacydocs']]
data['vectors'] = vectorlist

In [13]:
# (rows, columns) of the frame after the text, Doc and vector columns were added.
data.shape

(21248, 16)

In [53]:
# Write the full frame to CSV.
# NOTE(review): 'spacydocs' and 'vectors' hold Python objects (spaCy Docs and
# their vectors), so the file stores their string representations rather than
# values that can be read back as-is — confirm this is intended.
data.to_csv('data_for_app6.csv')

In [2]:
# data = pd.read_csv('data_for_app5.csv', delimiter='\t')

In [14]:
# Assemble a single-line "street, city, state postal" address per hotel,
# to be used for getting geolocations.
addresses = [
    street + ', ' + city + ', ' + state + ' ' + postal
    for street, city, state, postal in zip(
        data['street'], data['city'], data['state'], data['postal']
    )
]
data['address'] = addresses

In [15]:
# Shorter "city, state postal" variant of the address, without the street.
addresses2 = [
    city + ', ' + state + ' ' + postal
    for city, state, postal in zip(data['city'], data['state'], data['postal'])
]
data['address2'] = addresses2

In [16]:
# Check the new 'address' and 'address2' columns.
data.head()

Unnamed: 0.1,Unnamed: 0,rewards,brand,name,rating,description,street,city,state,postal,url,class_id,category,Bag_of_words,spacydocs,vectors,address,address2
0,0,Hilton,Hampton Inn,Hampton Inn Alexander City,4.0,"We’re right off Highway 280, 25 minutes away f...",1551 Elkahatchee Road,Alexander City,AL,35010,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,Hilton Hampton Inn We’re right off Highway 280...,"(Hilton, Hampton, Inn, We, ’re, right, off, Hi...","[0.110708795, 0.18874672, -0.016256696, -0.054...","1551 Elkahatchee Road, Alexander City, AL 35010","Alexander City, AL 35010"
1,1,Hilton,Hampton Inn,Hampton Inn Wetumpka,5.0,"We’re on the banks of the Coosa River, a short...",350 South Main Street,Wetumpka,AL,36092,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,Hilton Hampton Inn We’re on the banks of the C...,"(Hilton, Hampton, Inn, We, ’re, on, the, banks...","[0.022146616, 0.18830003, -0.057727393, -0.104...","350 South Main Street, Wetumpka, AL 36092","Wetumpka, AL 36092"
2,2,Hilton,Hampton Inn,Hampton Inn Auburn,4.0,"We're off I-85, under 10 minutes from Chewacla...",2430 S. College St.,Auburn,AL,36832,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,"Hilton Hampton Inn We're off I-85, under 10 mi...","(Hilton, Hampton, Inn, We, 're, off, I-85, ,, ...","[0.032046665, 0.18124907, 0.040364493, -0.0403...","2430 S. College St., Auburn, AL 36832","Auburn, AL 36832"
3,3,Hilton,Tru,Tru by Hilton Auburn,4.0,"Make this Tru by Hilton your own. We’ll help, ...",2411 W Pace Blvd,Auburn,AL,36830,https://tru3.hilton.com/en/hotels/alabama/tru-...,5,Budget,Hilton Tru Make this Tru by Hilton your own. W...,"(Hilton, Tru, Make, this, Tru, by, Hilton, you...","[0.025436003, 0.105327815, -0.09880912, -0.104...","2411 W Pace Blvd, Auburn, AL 36830","Auburn, AL 36830"
4,4,Hilton,Hilton Garden Inn,Hilton Garden Inn Auburn/Opelika,4.0,"Our hotel is just off I-85, minutes away from ...",2555 Hilton Garden Drive,Auburn,AL,36830,https://hiltongardeninn3.hilton.com/en/hotels/...,3,Full-Service Entry,Hilton Hilton Garden Inn Our hotel is just off...,"(Hilton, Hilton, Garden, Inn, Our, hotel, is, ...","[0.12775657, 0.13718104, -0.0058974824, -0.131...","2555 Hilton Garden Drive, Auburn, AL 36830","Auburn, AL 36830"


In [48]:
# Build a slimmer frame: remove the 'Unnamed: 0' column, the raw location and
# text fields, and the intermediate text / spaCy Doc columns.
dataclean = data.drop(
    ['Unnamed: 0', 'street', 'brand', 'city', 'postal',
     'Bag_of_words', 'spacydocs', 'description'],
    axis=1,
)

In [24]:
# Save the slim frame to CSV (default settings, so the row index is written too).
dataclean.to_csv('data_for_app6_slim.csv')

In [25]:
# Preview the slim frame.
dataclean.head()

Unnamed: 0,rewards,name,rating,url,class_id,category,vectors,address,address2
0,Hilton,Hampton Inn Alexander City,4.0,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,"[0.110708795, 0.18874672, -0.016256696, -0.054...","1551 Elkahatchee Road, Alexander City, AL 35010","Alexander City, AL 35010"
1,Hilton,Hampton Inn Wetumpka,5.0,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,"[0.022146616, 0.18830003, -0.057727393, -0.104...","350 South Main Street, Wetumpka, AL 36092","Wetumpka, AL 36092"
2,Hilton,Hampton Inn Auburn,4.0,https://hamptoninn3.hilton.com/en/hotels/alaba...,4,Limited-Service Mid-Scale,"[0.032046665, 0.18124907, 0.040364493, -0.0403...","2430 S. College St., Auburn, AL 36832","Auburn, AL 36832"
3,Hilton,Tru by Hilton Auburn,4.0,https://tru3.hilton.com/en/hotels/alabama/tru-...,5,Budget,"[0.025436003, 0.105327815, -0.09880912, -0.104...","2411 W Pace Blvd, Auburn, AL 36830","Auburn, AL 36830"
4,Hilton,Hilton Garden Inn Auburn/Opelika,4.0,https://hiltongardeninn3.hilton.com/en/hotels/...,3,Full-Service Entry,"[0.12775657, 0.13718104, -0.0058974824, -0.131...","2555 Hilton Garden Drive, Auburn, AL 36830","Auburn, AL 36830"


In [27]:
# List the distinct category labels present in the data.
data['category'].unique()

array(['Limited-Service Mid-Scale', 'Budget', 'Full-Service Entry',
       'Extended-Stay, All-Suite', 'Full-Service Mid-Scale',
       'Lifestyle, Boutique', 'Full-Service Upper-Scale',
       'Vacation Club, Condos, Villas', 'Luxury'], dtype=object)

In [42]:
# Numeric code assigned to each hotel category label
# (1 = 'Luxury' through 9 = 'Budget').
catreplace = {
    'Luxury': 1,
    'Vacation Club, Condos, Villas': 2,
    'Lifestyle, Boutique': 3,
    'Full-Service Upper-Scale': 4,
    'Full-Service Mid-Scale': 5,
    'Full-Service Entry': 6,
    'Extended-Stay, All-Suite': 7,
    'Limited-Service Mid-Scale': 8,
    'Budget': 9,
}

In [43]:
# Swap each category label for its numeric code from catreplace; values that
# are not keys of the mapping are left unchanged.
dataclean['category'] = dataclean['category'].replace(catreplace)

In [31]:
# Save the slim frame again after the category recoding; this writes to the
# same 'data_for_app6_slim.csv' path as the earlier save and overwrites it.
dataclean.to_csv('data_for_app6_slim.csv')

In [49]:
# Reduce each hotel's vector to a single number: the mean of its components.
vectormeanlist = [embedding.mean() for embedding in dataclean['vectors']]
dataclean['vectormean'] = vectormeanlist

In [50]:
# Drop the raw 'vectors' and 'class_id' columns from the slim frame.
dataclean = dataclean.drop(['vectors', 'class_id'], axis=1)

In [52]:
# Final save of the slim frame (now with 'vectormean', without 'vectors' and
# 'class_id'), again to 'data_for_app6_slim.csv'.
dataclean.to_csv('data_for_app6_slim.csv')