# Sentiment analysis with hotel reviews

In [4]:
# Load the hotel reviews from CSV
import pandas as pd
import time
# importing time so the start and end time can be used to calculate file loading time
print("Loading data file now, this could take a while depending on file size")
start = time.time()
# df is 'DataFrame' - make sure you downloaded the file to the data folder
df = pd.read_csv('Hotel_Reviews.csv')
end = time.time()
print("Loading took " + str(round(end - start, 2)) + " seconds")

Loading data file now, this could take a while depending on file size
Loading took 14.41 seconds


In [8]:
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags',
       'days_since_review', 'lat', 'lng'],
      dtype='object')

In [9]:

def replace_address(row):
    if "Netherlands" in row["Hotel_Address"]:
              return "Amsterdam, Netherlands"
    elif "Barcelona" in row["Hotel_Address"]:
              return "Barcelona, Spain"
    elif "United Kingdom" in row["Hotel_Address"]:
              return "London, United Kingdom"
    elif "Milan" in row["Hotel_Address"]:        
              return "Milan, Italy"
    elif "France" in row["Hotel_Address"]:
              return "Paris, France"
    elif "Vienna" in row["Hotel_Address"]:
              return "Vienna, Austria" 
      
      # Replace all the addresses with a shortened, more useful form
    df["Hotel_Address"] = df.apply(replace_address, axis = 1)
      # The sum of the value_counts() should add up to the total number of reviews
    print(df["Hotel_Address"].value_counts())


In [10]:
display(df.groupby("Hotel_Address").agg({"Hotel_Name": "nunique"}))


Unnamed: 0_level_0,Hotel_Name
Hotel_Address,Unnamed: 1_level_1
s Gravesandestraat 55 Oost 1092 AA Amsterdam Netherlands,1
1 15 Templeton Place Earl s Court Kensington and Chelsea London SW5 9NB United Kingdom,1
1 2 Serjeant s Inn Fleet Street City of London London EC4Y 1LL United Kingdom,1
1 3 Queens Garden Westminster Borough London W2 3BA United Kingdom,1
1 3 Rue d Argentine 16th arr 75116 Paris France,1
...,...
Wiedner Hauptstra e 44 04 Wieden 1040 Vienna Austria,1
Wildpretmarkt 5 01 Innere Stadt 1010 Vienna Austria,1
Wilton Place Knightsbridge Westminster Borough London SW1X 7RL United Kingdom,1
Wrights Lane Kensington and Chelsea London W8 5SP United Kingdom,1


In [11]:
 # Drop `Additional_Number_of_Scoring`
df.drop(["Additional_Number_of_Scoring"], axis = 1, inplace=True)
  # Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values
df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')
df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)

In [12]:
# Remove opening and closing brackets
df.Tags = df.Tags.str.strip("[']")
# remove all quotes too
df.Tags = df.Tags.str.replace(" ', '", ",", regex = False)

In [13]:
# Process the Tags into new columns
# The file Hotel_Reviews_Tags.py, identifies the most important tags
# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, 
# Family with young children, Family with older children, With a pet
df["Leisure_trip"] = df.Tags.apply(lambda tag: 1 if "Leisure trip" in tag else 0)
df["Couple"] = df.Tags.apply(lambda tag: 1 if "Couple" in tag else 0)
df["Solo_traveler"] = df.Tags.apply(lambda tag: 1 if "Solo traveler" in tag else 0)
df["Business_trip"] = df.Tags.apply(lambda tag: 1 if "Business trip" in tag else 0)
df["Group"] = df.Tags.apply(lambda tag: 1 if "Group" in tag or "Travelers with friends" in tag else 0)
df["Family_with_young_children"] = df.Tags.apply(lambda tag: 1 if "Family with young children" in tag else 0)
df["Family_with_older_children"] = df.Tags.apply(lambda tag: 1 if "Family with older children" in tag else 0)
df["With_a_pet"] = df.Tags.apply(lambda tag: 1 if "With a pet" in tag else 0)