# Thesis - Data Cleaning

## 1. Import Needed Packages

In [2]:
import pickle
import json
import os
from collections import Counter
import itertools
import operator
from operator import itemgetter

## 2. Import Datasets

### 2.1 TripAdvisor

#### 2.2.1 Open all separate files, and add them together.

In [282]:
path_to_jsonfiles = 'TripAdvisorHotels'
allhotels_tripadvisor = []
for file in os.listdir(path_to_jsonfiles):
    full_filename = "%s/%s" % (path_to_jsonfiles, file)
    with open(full_filename,'r') as fi:
        data = json.load(fi)
        hotel = data['_source']
        allhotels_tripadvisor.append(hotel)

In [514]:
print('There are {} files, and the length of the combined files should be the same: {}'.format(len(allhotels_tripadvisor), (len(allhotels_tripadvisor))))

There are 402 files, and the length of the combined files should be the same: 402


In [515]:
allhotels_tripadvisor[0].keys()

dict_keys(['ranking', 'overall_rating', 'reviews', 'link', 'name', 'doctype', 'META', 'reviewquantity'])

In [516]:
allreviews = []
for hotel in allhotels_tripadvisor:
    allreviews.append(len(hotel['reviews']))
print('There are {} reviews'.format(sum(allreviews)))

There are 146585 reviews


### 2.2 Expedia

#### 2.2.1 Open all separate files, and add them together.

In [8]:
pkl_file1 = open('ExpediaPage1.pkl', 'rb')
expedia1 = pickle.load(pkl_file1)
pkl_file1.close()

In [9]:
pkl_file3 = open('ExpediaPage3.pkl', 'rb')
expedia3 = pickle.load(pkl_file3)
pkl_file3.close()

In [10]:
pkl_file4 = open('ExpediaPage4.pkl', 'rb')
expedia4 = pickle.load(pkl_file4)
pkl_file4.close()

In [11]:
pkl_file5 = open('ExpediaPage5.pkl', 'rb')
expedia5 = pickle.load(pkl_file5)
pkl_file5.close()

In [12]:
pkl_file6 = open('ExpediaPage6.pkl', 'rb')
expedia6 = pickle.load(pkl_file6)
pkl_file6.close()

In [13]:
allhotels_expedia = expedia1+expedia3+expedia4+expedia5+expedia6

In [14]:
print("The lengths of the separate datasets are:",len(expedia1),len(expedia3),len(expedia4),len(expedia5),len(expedia6),
      "\nTherefore, the length of the final dataset should have length:",(len(expedia1)+len(expedia3)+len(expedia4)+len(expedia5)+len(expedia6)),
      "\nWhich is:",len(allhotels_expedia)==(len(expedia1)+len(expedia3)+len(expedia4)+len(expedia5)+len(expedia6)))

The lengths of the separate datasets are: 54 51 54 54 54 
Therefore, the length of the final dataset should have length: 267 
Which is: True


In [15]:
allhotels_expedia[0].keys()

dict_keys(['stars', 'rating', 'name', 'url', 'review_quantity', 'doctype', 'reviews', 'id', 'META'])

## 3. Clean the Datasets

### 3.1 TripAdvisor

#### 3.1.1 Check for duplicates.

In [517]:
allhotel_names_trip = []
for hotel in allhotels_tripadvisor:
    allhotel_names_trip.append(hotel['name'])
Counter(allhotel_names_trip)

Counter({'377 House - Amsterdam': 1,
         'A-Train Hotel': 2,
         'Aalborg Hotel Amsterdam': 2,
         'Ambassade Hotel': 3,
         'Amstel Botel': 1,
         'Amsterdam Downtown Hotel': 1,
         'Amsterdam Hostel Centre': 1,
         'Amsterdam Hotel Parklane': 1,
         'Amsterdam House Hotel': 2,
         'Amsterdam Marriott Hotel': 1,
         'Amsterdam Teleport Hotel': 2,
         'Amsterdam Tropen Hotel': 1,
         'Amsterdam Wiechmann Hotel': 1,
         'Anco Hotel': 1,
         'Andaz Amsterdam Prinsengracht': 2,
         'Anna Youth Hostel': 1,
         'Annemarie Hotel': 1,
         'Apollo Hotel Amsterdam, a Tribute portfolio': 1,
         'Apollo Museumhotel Amsterdam City Centre': 1,
         'Apollofirst boutique hotel Amsterdam': 3,
         'Armada Hotel': 2,
         "Art'otel Amsterdam": 1,
         'Aston City Hotel': 1,
         'Atlanta Hotel': 1,
         'Avenue Hotel': 1,
         'BackStage Hotel Amsterdam': 1,
         'Banks Mansion': 1

There are duplicates in the dataset, these have to be removed. How many duplicates are there?

In [518]:
unique_hotels_trip = len(Counter(allhotel_names_trip))
seen_trip = set()
uniq_trip = []
not_uniq_trip = []
for hotel in allhotels_tripadvisor:
    if hotel['name'] not in seen_trip:
        uniq_trip.append(hotel['name'])
        seen_trip.add(hotel['name'])
    else:
        not_uniq_trip.append(hotel['name'])
print("There are {} unique hotels, which should be the same as all hotels calculated by Counter: {}. \nThere are {} hotels in the dataset.".format(len(uniq_trip),(unique_hotels_trip==len(uniq_trip)),len(allhotels_tripadvisor)))

There are 280 unique hotels, which should be the same as all hotels calculated by Counter: True. 
There are 402 hotels in the dataset.


In [519]:
shouldbechecked = []
shouldnotbechecked = []
correcthotels_tripadvisor = []
checkinghotels_tripadvisor = []
for hotel in allhotels_tripadvisor:
    if hotel['name'] in not_uniq_trip:
        shouldbechecked.append(hotel['name'])
        checkinghotels_tripadvisor.append(hotel)
    else:
        shouldnotbechecked.append(hotel['name'])
        correcthotels_tripadvisor.append(hotel)
len(shouldbechecked)

217

In [520]:
len(correcthotels_tripadvisor)

185

For all duplicate hotels, check whether they have the same amount or reviews:

In [521]:
for hotel in checkinghotels_tripadvisor:
    print(hotel['name'],len(hotel['reviews']))

INK Hotel Amsterdam - MGallery by Sofitel 700
Sir Adam Hotel 238
Luxury Suites Amsterdam 214
Bastion Hotel Amsterdam Amstel 290
Pillows Anna van den Vondel 88
XO Hotels Park West 700
European Apartments Reservations 19
Euphemia Hotel 116
Andaz Amsterdam Prinsengracht 700
The Arcade Hotel 59
Quentin England Hotel 330
Royal Plaza Hotel 12
Hotel Seven one Seven 402
Prinsengracht Hotel 282
Pulitzer Amsterdam 700
Hampton by Hilton Amsterdam / Arena Boulevard 700
Amsterdam Teleport Hotel 431
Ambassade Hotel 700
The Alfred Hotel 486
Hotel Nes 230
Euphemia Hotel 116
WestCord Fashion Hotel Amsterdam 700
Hotel Cordial 385
Ibis Styles Amsterdam Amstel 211
Hotel De Munck 107
Camp Inn Hotel 47
misc eatdrinksleep 592
Hotel Cordial 385
Dream Hotel Amsterdam 99
Nova Apartments Amsterdam 117
Hotel de Westertoren 68
A-Train Hotel 631
Apollofirst boutique hotel Amsterdam 188
Hotel Agora 188
France Hotel Amsterdam 124
Hampton by Hilton Amsterdam Centre East 302
DoubleTree by Hilton Hotel Amsterdam Centraa

All duplicate hotels have the same amount of reviews, so the most recent scraped hotel is selected to make sure we have the most recent data of that hotel. Order to data by date, and select the first occurence of the hotel. 

In [522]:
checkinghotels_tripadvisor.sort(key=lambda item:item['META']['ADDED'], reverse=True)
seen = []
included_hotels = []
deleted_hotels = []
for hotel in checkinghotels_tripadvisor:
    if hotel['name'] not in seen:
        seen.append(hotel['name'])
        included_hotels.append(hotel)
    else:
        deleted_hotels.append(hotel)
print("From the {} duplicate hotels, {} hotels are unique.".format(len(checkinghotels_tripadvisor), len(included_hotels)))

From the 217 duplicate hotels, 95 hotels are unique.


In [523]:
correcthotels_tripadvisor.extend(included_hotels)

In [524]:
print('There is a total of {} unique hotels that was scraped.'.format(len(correcthotels_tripadvisor)))

There is a total of 280 unique hotels that was scraped.


#### 3.1.2 Put all reviews from the hotels in one dataset.

In [525]:
for hotel in correcthotels_tripadvisor:
    print(len(hotel['reviews']),hotel['name'])

193 377 House - Amsterdam
166 Bastion Hotel Amsterdam Zuidwest
1 Tsjerk Hiddes Hotel Amsterdam
700 Park Plaza Amsterdam Airport
0 MOXY by Marriott Amsterdam Houthavens
554 Hampshire Hotel - Theatre District Amsterdam
513 Hotel Manofa
700 Bilderberg Garden Hotel
202 Hyatt Regency Amsterdam
1 Van der Valk Hotel Amsterdam-Amstel
700 Hotel Estherea
3 Hotel Nottinghill Amsterdam
219 Jupiter Hotel
74 Van Gelder Hotel
138 Hotel Hestia
1 Royal Tulip Amsterdam
127 Hotel Titus
50 Hotel Pax
700 Apollo Hotel Amsterdam, a Tribute portfolio
507 Hotel Museum Lane
63 Hotel Patou
178 Hotel Neutraal
18 Hotel Impala
700 Avenue Hotel
111 Hotel Monopole
700 NH Amsterdam Schiller
663 Singel Hotel Amsterdam
700 Hilton Amsterdam
16 Hotel Lowell
391 Hotel Brouwer
291 Urban Lodge Hotel
4 Anna Youth Hostel
1 J&B
700 Holiday Inn Express Amsterdam-Sloterdijk Station
36 Hemp Hotel
219 Hotel Continental
700 Amstel Botel
235 Hotel Hoksbergen
527 Swissotel Amsterdam
207 Princess Hostel Amsterdam
1 Hotel Oostzaan-Amste

In [526]:
allreviews_tripadvisor = []
amount_reviews_hotel = []
for hotel in correcthotels_tripadvisor:
    amount_reviews_hotel.append(len(hotel['reviews']))
    reviews = hotel['reviews']
    for review in reviews:
        review['hotel'] = hotel['name'] # add hotel name to be able to retrieve all reviews for a hotel
    allreviews_tripadvisor.extend(reviews)
print("There should be",sum(amount_reviews_hotel),"reviews, which is:",(len(allreviews_tripadvisor)==sum(amount_reviews_hotel)))

There should be 96993 reviews, which is: True


In [527]:
allreviews_tripadvisor[0].keys()

dict_keys(['headline', 'username', 'contributions', 'responder', 'images', 'rating', 'review', 'date_of_stay', 'mobile', 'votes', 'response', 'location', 'hotel', 'travel_company', 'date', 'specific_ratings', 'partnership', 'response_date'])

Delete empty reviews:

In [528]:
emptystring_trip = []
none_trip = []
unretrievable = []
for review in allreviews_tripadvisor:
    if review['review'] == "":
        emptystring_trip.append(review)
    if review['review'] is None:
        none_trip.append(review)
    if review['review'] == 'UNRETRIEVABLE REVIEW':
        unretrievable.append(review)
print('There are {} none type reviews, {} empty strings, and {} unretrievable reviews'.format(len(none_trip),len(emptystring_trip),len(unretrievable)))

There are 0 none type reviews, 0 empty strings, and 4229 unretrievable reviews


Is there bias in the unretrievable reviews?

In [529]:
sorted_unretrievable = sorted(unretrievable, key=itemgetter('hotel'))
for key, group in itertools.groupby(sorted_unretrievable, key=lambda x:x['hotel']):
    print(key,len(list(group)))

A-Train Hotel 53
Aalborg Hotel Amsterdam 39
Amstel Botel 14
Amsterdam Downtown Hotel 3
Amsterdam House Hotel 16
Amsterdam Teleport Hotel 1
Amsterdam Tropen Hotel 7
Amsterdam Wiechmann Hotel 58
Andaz Amsterdam Prinsengracht 121
Annemarie Hotel 6
Apollo Hotel Amsterdam, a Tribute portfolio 32
Apollofirst boutique hotel Amsterdam 4
Armada Hotel 23
Art'otel Amsterdam 218
Aston City Hotel 27
Atlanta Hotel 9
Avenue Hotel 12
BackStage Hotel Amsterdam 9
Bastion Hotel Amsterdam Amstel 11
Bastion Hotel Amsterdam Noord 10
Bastion Hotel Amsterdam Zuidwest 6
Belfort Hotel 16
Bema Rentals 26
Best Western Delphi Hotel 8
Bilderberg Garden Hotel 6
Bilderberg Hotel Jan Luyken 7
Campanile Amsterdam 2
Cityden Museum District City Suites 14
Corendon Vitality Hotel Amsterdam 1
Courtyard Amsterdam Arena Atlas 2
Crowne Plaza Amsterdam South 1
Dam Hotel 9
Delta Hotel Amsterdam City Centre 36
Die Port van Cleve 9
DoubleTree by Hilton Hotel Amsterdam - NDSM Wharf 1
Dutch Design Hotel Artemis 5
Euphemia Hotel 5
E

What is the percentage of unretrievable reviews per hotel?

In [530]:
high_unretrievables = []
for hotel in correcthotels_tripadvisor:
    amount_all = []
    amount_unretrievable = []
    amount_normal = []
    for review in hotel['reviews']:
        amount_all.append(review)
        if review['review'] == 'UNRETRIEVABLE REVIEW':
            amount_unretrievable.append(review)
        else:
            amount_normal.append(review)
    if len(amount_unretrievable) != 0:
        percentage_unretrievable = round(len(amount_unretrievable)/len(amount_all)*100)
        if percentage_unretrievable > 5:
            thishotel = {}
            thishotel['name'] = hotel['name']
            thishotel['percentage'] = percentage_unretrievable
            high_unretrievables.append(thishotel)
            print("{} has {}/{} unretrievable reviews and {}/{} retrievable reviews, which is {}%.".format(hotel['name'],len(amount_unretrievable),len(amount_all),len(amount_normal),len(amount_all),percentage_unretrievable))

Tsjerk Hiddes Hotel Amsterdam has 1/1 unretrievable reviews and 0/1 retrievable reviews, which is 100%.
Park Plaza Amsterdam Airport has 277/700 unretrievable reviews and 423/700 retrievable reviews, which is 40%.
Hampshire Hotel - Theatre District Amsterdam has 48/554 unretrievable reviews and 506/554 retrievable reviews, which is 9%.
Hotel Manofa has 72/513 unretrievable reviews and 441/513 retrievable reviews, which is 14%.
Hyatt Regency Amsterdam has 25/202 unretrievable reviews and 177/202 retrievable reviews, which is 12%.
Jupiter Hotel has 16/219 unretrievable reviews and 203/219 retrievable reviews, which is 7%.
Hotel Hestia has 24/138 unretrievable reviews and 114/138 retrievable reviews, which is 17%.
Royal Tulip Amsterdam has 1/1 unretrievable reviews and 0/1 retrievable reviews, which is 100%.
Hotel Titus has 15/127 unretrievable reviews and 112/127 retrievable reviews, which is 12%.
Hotel Patou has 12/63 unretrievable reviews and 51/63 retrievable reviews, which is 19%.
Ho

In [465]:
len(high_unretrievables)

[{'name': 'Tsjerk Hiddes Hotel Amsterdam', 'percentage': 100},
 {'name': 'Park Plaza Amsterdam Airport', 'percentage': 40},
 {'name': 'Hampshire Hotel - Theatre District Amsterdam', 'percentage': 9},
 {'name': 'Hotel Manofa', 'percentage': 14},
 {'name': 'Hyatt Regency Amsterdam', 'percentage': 12},
 {'name': 'Jupiter Hotel', 'percentage': 7},
 {'name': 'Hotel Hestia', 'percentage': 17},
 {'name': 'Royal Tulip Amsterdam', 'percentage': 100},
 {'name': 'Hotel Titus', 'percentage': 12},
 {'name': 'Hotel Patou', 'percentage': 19},
 {'name': 'Hotel Neutraal', 'percentage': 22},
 {'name': 'Singel Hotel Amsterdam', 'percentage': 7},
 {'name': 'J&B', 'percentage': 100},
 {'name': 'Hotel Hoksbergen', 'percentage': 8},
 {'name': 'Princess Hostel Amsterdam', 'percentage': 9},
 {'name': 'Hotel Damrak', 'percentage': 8},
 {'name': 'Cityden Museum District City Suites', 'percentage': 6},
 {'name': 'Trianon Hotel', 'percentage': 9},
 {'name': 'Hotel Di-Ann', 'percentage': 16},
 {'name': 'Thorbecke H

# WHAT TO DO WITH THIS?

Delete the unretrievable reviews:

In [531]:
print("There were {} reviews".format(len(allreviews_tripadvisor)))
allreviews_tripadvisor[:] = [review for review in allreviews_tripadvisor if review.get('review') != 'UNRETRIEVABLE REVIEW']
print("There are now {} reviews.".format(len(allreviews_tripadvisor)))

There were 96993 reviews
There are now 92764 reviews.


In [467]:
seen_review = []
included_reviews_trip = []
deleted_reviews_trip = []
for review in allreviews_tripadvisor:
    if review['review'] not in seen_review:
        seen_review.append(review['review'])
        included_reviews_trip.append(review)
    else:
        deleted_reviews_trip.append(review)
print("From the {} Expedia reviews, {} reviews are unique.".format(len(allreviews_tripadvisor), len(included_reviews_trip)))

From the 92764 Expedia reviews, 92751 reviews are unique.


In [533]:
len(included_reviews_trip)

92751

In [534]:
amount_hotels_trip = []
sorted_included_reviews = sorted(included_reviews_trip, key=itemgetter('hotel'))
for key, group in itertools.groupby(sorted_included_reviews, key=lambda x:x['hotel']):
    print(key,len(list(group)))
    amount_hotels_trip.append(key)
print('There are reviews for {} hotels.'.format(len(amount_hotels_trip)))

377 House - Amsterdam 193
A-Train Hotel 578
Aalborg Hotel Amsterdam 275
Ambassade Hotel 700
Amstel Botel 686
Amsterdam Downtown Hotel 105
Amsterdam Hostel Centre 91
Amsterdam Hotel Parklane 55
Amsterdam House Hotel 226
Amsterdam Marriott Hotel 108
Amsterdam Teleport Hotel 430
Amsterdam Tropen Hotel 430
Amsterdam Wiechmann Hotel 582
Anco Hotel 145
Andaz Amsterdam Prinsengracht 579
Anna Youth Hostel 4
Annemarie Hotel 134
Apollo Hotel Amsterdam, a Tribute portfolio 668
Apollo Museumhotel Amsterdam City Centre 700
Apollofirst boutique hotel Amsterdam 184
Armada Hotel 152
Art'otel Amsterdam 482
Aston City Hotel 128
Atlanta Hotel 311
Avenue Hotel 688
BackStage Hotel Amsterdam 284
Banks Mansion 700
Bastion Hotel Amsterdam Amstel 279
Bastion Hotel Amsterdam Noord 198
Bastion Hotel Amsterdam Zuidwest 160
BeByme 5
Belfort Hotel 206
Bema Rentals 99
Best Western Delphi Hotel 257
Bilderberg Garden Hotel 694
Bilderberg Hotel Jan Luyken 693
Botel 263
Botel Zebra 46
Camp Inn Hotel 47
Campanile Amsterd

delete hotels with 0 reviews:

In [550]:
correcthotels_tripadvisor[0].keys()

dict_keys(['ranking', 'overall_rating', 'reviews', 'link', 'name', 'doctype', 'META', 'reviewquantity'])

In [553]:
correcthotels_trip = []
for hotel in correcthotels_tripadvisor:
    if len(hotel['reviews']) != 0:
        correcthotels_trip.append(hotel)

In [556]:
len(correcthotels_trip)

279

### 3.2 Expedia

#### 3.2.1 Check for duplicates.

In [491]:
reviews_exp = []
for hotel in allhotels_expedia:
    reviews_exp.append(len(hotel['reviews']))
print("There were {} reviews scraped from {} hotels.".format(sum(reviews_exp),len(allhotels_expedia)))

There were 1604018 reviews scraped from 267 hotels


In [242]:
allhotel_names_exp = []
for hotel in allhotels_expedia:
    allhotel_names_exp.append(hotel['name'])
Counter(allhotel_names_exp)

Counter({'A Train Hotel': 3,
         'Acostar Hotel': 2,
         'Allure Garden Apartments': 1,
         'Alp Hotel Amsterdam': 1,
         'Amadi Panorama Hotel': 1,
         'Amadi Park Hotel': 1,
         'Amsterdam Canal Residence': 1,
         'Amsterdam Downtown Hotel': 2,
         'Amsterdam Forest Hotel': 1,
         'Amsterdam Harbour Apartments': 1,
         'Amsterdam Teleport Hotel': 1,
         'Apple Inn Hotel': 1,
         'Armada Hotel': 2,
         'Arty House': 1,
         'Atlantis Hotel Amsterdam': 1,
         'BackStage Hotel Amsterdam': 1,
         'Banks Mansion': 1,
         'Bastion Hotel Zaandam': 2,
         'Best Western Delphi Hotel': 1,
         'Best Western Plus Amedia Amsterdam Airport': 6,
         'Best Western Plus Amsterdam Airport Hotel': 1,
         'Best Western Zaan Inn': 2,
         'Bilderberg Garden Hotel': 1,
         'Blue Mansion Hotel': 1,
         'Boutique Hotel La Belle Vue': 2,
         'Boutique Hotel Maxime': 1,
         'Breitner

There are duplicates in the dataset, these have to be removed. How many duplicates are there?

In [252]:
unique_hotels_exp = len(Counter(allhotel_names_exp))
seen_exp = set()
uniq_exp = []
not_uniq_exp = []
for hotel in allhotels_expedia:
    if hotel['name'] not in seen_exp:
        uniq_exp.append(hotel['name'])
        seen_exp.add(hotel['name'])
    else:
        not_uniq_exp.append(hotel['name'])
print("There are {} unique hotels, which should be the same as all hotels calculated by Counter: {}. \nThere are {} hotels in the dataset.".format(len(uniq_exp),(unique_hotels_exp==len(uniq_exp)),len(allhotels_expedia)))

There are 203 unique hotels, which should be the same as all hotels calculated by Counter: True. 
There are 267 hotels in the dataset.


In [258]:
shouldbechecked_exp = []
shouldnotbechecked_exp = []
correcthotels_expedia = []
checkinghotels_expedia = []
for hotel in allhotels_expedia:
    if hotel['name'] in not_uniq_exp:
        shouldbechecked_exp.append(hotel['name'])
        checkinghotels_expedia.append(hotel)
    else:
        shouldnotbechecked_exp.append(hotel['name'])
        correcthotels_expedia.append(hotel)
print('There are {} hotels that are duplicates and should be checked.\nThere are {} hotels that do not have duplicates and are added to the list of correct hotels.'.format(len(shouldbechecked_exp),len(correcthotels_expedia)))

There are 114 hotels that are duplicates and should be checked.
There are 153 hotels that do not have duplicates and are added to the list of correct hotels.


For all duplicate hotels, check whether they have the same amount of reviews:

In [265]:
checkinghotels_expedia[50].keys()

dict_keys(['stars', 'rating', 'name', 'url', 'review_quantity', 'doctype', 'subratings', 'reviews', 'id', 'META'])

In [268]:
for hotel in checkinghotels_expedia:
    print(hotel['name'],len(hotel['reviews']),hotel['META']['ADDED'])

A Train Hotel 7059 2018-05-18 11:59:10.348924
A Train Hotel 7059 2018-05-18 18:33:27.788155
A Train Hotel 7059 2018-05-18 15:10:58.173501
Acostar Hotel 2978 2018-05-18 11:04:49.536528
Acostar Hotel 2978 2018-05-18 03:21:53.338373
Amsterdam Downtown Hotel 559 2018-05-16 14:18:50.036431
Amsterdam Downtown Hotel 559 2018-05-18 12:50:04.662882
Armada Hotel 778 2018-05-18 11:06:27.281744
Armada Hotel 778 2018-05-18 03:57:31.379753
Bastion Hotel Zaandam 1358 2018-05-16 11:02:01.107107
Bastion Hotel Zaandam 1399 2018-05-18 11:32:10.252994
Best Western Plus Amedia Amsterdam Airport 1096 2018-05-16 10:35:41.346488
Best Western Plus Amedia Amsterdam Airport 1096 2018-05-18 11:10:07.579635
Best Western Plus Amedia Amsterdam Airport 1096 2018-05-18 14:39:11.038111
Best Western Plus Amedia Amsterdam Airport 1096 2018-05-16 10:47:37.070256
Best Western Plus Amedia Amsterdam Airport 1096 2018-05-18 11:29:07.632480
Best Western Plus Amedia Amsterdam Airport 1096 2018-05-18 04:33:08.413388
Best Western

Sort based on the amount of reviews. CHECK THIS!!!

In [271]:
checkinghotels_expedia.sort(key=lambda item:len(item['reviews']), reverse=True)

In [273]:
for hotel in checkinghotels_expedia:
    print(hotel['name'],len(hotel['reviews']),hotel['META']['ADDED'])

Hilton Amsterdam 29087 2018-05-18 16:50:34.537795
Hilton Amsterdam 29087 2018-05-18 13:52:09.569303
Hilton Amsterdam 29087 2018-05-18 03:10:53.512832
Hyatt Place Amsterdam Airport 27823 2018-05-16 12:54:29.228247
Hyatt Place Amsterdam Airport 27718 2018-05-18 13:37:04.096368
Rho Hotel 22566 2018-05-18 11:40:20.606766
Hotel de Paris Amsterdam 16457 2018-05-16 13:37:19.830546
Hotel de Paris Amsterdam 16405 2018-05-18 12:20:27.295521
The Toren 15892 2018-05-18 00:53:06.264752
The Toren 15892 2018-05-18 14:27:23.586565
The Student Hotel Amsterdam West 15799 2018-05-16 14:07:02.967900
The Student Hotel Amsterdam West 15768 2018-05-18 12:12:43.929780
The Student Hotel Amsterdam West 15768 2018-05-18 02:24:54.891604
Hotel V Frederiksplein 13594 2018-05-16 13:44:39.584424
Hotel V Frederiksplein 13548 2018-05-18 00:20:47.830479
Hotel Sebastian's 13277 2018-05-18 11:27:12.919434
Hotel Sebastian's 13277 2018-05-18 17:35:40.567518
Conscious Hotel Museum Square 12801 2018-05-18 15:33:59.111220
Cons

In [274]:
seen_exp = []
included_hotels_exp = []
deleted_hotels_exp = []
for hotel in checkinghotels_expedia:
    if hotel['name'] not in seen_exp:
        seen_exp.append(hotel['name'])
        included_hotels_exp.append(hotel)
    else:
        deleted_hotels_exp.append(hotel)
print("From the {} duplicate hotels, {} hotels are unique.".format(len(checkinghotels_expedia), len(included_hotels_exp)))

From the 114 duplicate hotels, 50 hotels are unique.


In [275]:
correcthotels_expedia.extend(included_hotels_exp)

In [276]:
print('The final list contains {} unique hotels.'.format(len(correcthotels_expedia)))

The final list contains 203 unique hotels.


#### 3.2.2 Put all reviews from the hotels in one dataset.

In [278]:
allreviews_expedia = []
amount_reviews_hotel = []
for hotel in correcthotels_expedia:
    amount_reviews_hotel.append(len(hotel['reviews']))
    reviews = hotel['reviews']
    for review in reviews:
        review['hotel'] = hotel['name'] # add hotel name to be able to retrieve all reviews for a hotel
    allreviews_expedia.extend(reviews)
print("There should be",sum(amount_reviews_hotel),"reviews, which is:",(len(allreviews_expedia)==sum(amount_reviews_hotel)))

There should be 1266918 reviews, which is: True


Remove empty reviews:

In [494]:
emptystring = []
none = []
for review in allreviews_expedia:
    if review['review'] == "":
        emptystring.append(review)
    if review['review'] is None:
        none.append(review)
print('There are {} none type reviews, {} empty strings'.format(len(none),len(emptystring)))

There are 39347 none type reviews, 443 empty strings


In [495]:
allreviews_expedia[0].keys()

dict_keys(['upvotes', 'username', 'review', 'hotel', 'rating', 'date', 'country', 'title'])

In [497]:
print("There were {} reviews".format(len(allreviews_expedia)))
allreviews_expedia[:] = [review for review in allreviews_expedia if review.get('review') != '']
print("There are now {} reviews.".format(len(allreviews_expedia)))

There were 1266918 reviews
There are now 1266475 reviews.


In [498]:
print("There were {} reviews".format(len(allreviews_expedia)))
allreviews_expedia[:] = [review for review in allreviews_expedia if review.get('review') is not None]
print("There are now {} reviews.".format(len(allreviews_expedia)))

There were 1266475 reviews
There are now 1227128 reviews.


Is there bias in the empty and None reviews?

In [409]:
sorted_emptyreviews = sorted(emptystring, key=itemgetter('hotel'))
for key, group in itertools.groupby(sorted_emptyreviews, key=lambda x:x['hotel']):
    print(key,len(list(group)))

Bilderberg Garden Hotel 63
Hotel De Hallen 31
Hotel Nicolaas Witsen 19
Hotel Vondel 61
Hotel Zwanenburg 91
NH Amsterdam Schiphol Airport 61
Qbic Hotel Amsterdam WTC 71
Quentin Arrive 46


In [506]:
high_empty = []
for hotel in correcthotels_expedia:
    amount_all = []
    amount_empty = []
    amount_normal = []
    for review in hotel['reviews']:
        amount_all.append(review)
        if review['review'] == '':
            amount_empty.append(review)
        else:
            amount_normal.append(review)
    if len(amount_empty) != 0:
        percentage_empty = round(len(amount_empty)/len(amount_all)*100)
        if percentage_empty > 5:
            thishotel = {}
            thishotel['name'] = hotel['name']
            thishotel['percentage'] = percentage_empty
            high_empty.append(thishotel)
            print("{} has {}/{} empty reviews and {}/{} not-empty reviews, which is {}%.".format(hotel['name'],len(amount_empty),len(amount_all),len(amount_normal),len(amount_all),percentage_empty))

In [507]:
len(high_empty)

0

In [410]:
sorted_nonereviews = sorted(none, key=itemgetter('hotel'))
for key, group in itertools.groupby(sorted_nonereviews, key=lambda x:x['hotel']):
    print(key,len(list(group)))

A Train Hotel 76
Alp Hotel Amsterdam 96
Amadi Panorama Hotel 29
Amadi Park Hotel 71
Amsterdam Downtown Hotel 17
Amsterdam Forest Hotel 59
Amsterdam Harbour Apartments 4
Amsterdam Teleport Hotel 201
Apple Inn Hotel 206
Armada Hotel 31
Atlantis Hotel Amsterdam 44
Banks Mansion 330
Bastion Hotel Zaandam 61
Best Western Delphi Hotel 35
Best Western Plus Amedia Amsterdam Airport 126
Best Western Plus Amsterdam Airport Hotel 825
Best Western Zaan Inn 213
Bilderberg Garden Hotel 633
Blue Mansion Hotel 5
Boutique Hotel La Belle Vue 160
Canal Boutique Rooms & Apartments 83
Canal House 20
City Hotel Amsterdam 23
Cityden Museum Square Hotel Apartments 3
Cityden Up Amsterdam South Hotel Apartments 23
Conscious Hotel Museum Square 621
Conservatorium Hotel 69
Courtyard by Marriott Amsterdam Airport 135
Damrak Inn 119
De Baronie B&B 3
De Jonker Urban Studio's & Suites 1
De L'Europe Amsterdam 72
De Rustende Jager 26
Delta Hotel City Center 92
Die Port van Cleve Hotel 555
Fletcher Hotel Amsterdam 673
F

In [508]:
high_none = []
for hotel in correcthotels_expedia:
    amount_all = []
    amount_none = []
    amount_normal = []
    for review in hotel['reviews']:
        amount_all.append(review)
        if review['review'] is None:
            amount_none.append(review)
        else:
            amount_normal.append(review)
    if len(amount_none) != 0:
        percentage_none = round(len(amount_none)/len(amount_all)*100)
        if percentage_none > 5:
            thishotel = {}
            thishotel['name'] = hotel['name']
            thishotel['percentage'] = percentage_none
            high_none.append(thishotel)
            print("{} has {}/{} none-type reviews and {}/{} normal reviews, which is {}%.".format(hotel['name'],len(amount_none),len(amount_all),len(amount_normal),len(amount_all),percentage_none))

Amadi Panorama Hotel has 29/524 none-type reviews and 495/524 normal reviews, which is 6%.
Amsterdam Forest Hotel has 59/969 none-type reviews and 910/969 normal reviews, which is 6%.
Damrak Inn has 119/2107 none-type reviews and 1988/2107 normal reviews, which is 6%.
De Jonker Urban Studio's & Suites has 1/5 none-type reviews and 4/5 normal reviews, which is 20%.
Fletcher Hotel Amsterdam has 673/7927 none-type reviews and 7254/7927 normal reviews, which is 8%.
Het Hart van Weesp has 60/678 none-type reviews and 618/678 normal reviews, which is 9%.
Holiday Inn Amsterdam has 1204/12223 none-type reviews and 11019/12223 normal reviews, which is 10%.
Holiday Inn Express Amsterdam - City Hall has 1/1 none-type reviews and 0/1 normal reviews, which is 100%.
Hotel Ajax has 65/606 none-type reviews and 541/606 normal reviews, which is 11%.
Hotel Campanile Amsterdam Zuidoost has 188/2382 none-type reviews and 2194/2382 normal reviews, which is 8%.
Hotel City Garden Amsterdam has 1625/19937 non

In [509]:
len(high_none)

36

Check for duplicates:

In [499]:
seen_review = []
included_reviews_exp = []
deleted_reviews_exp = []
for review in allreviews_expedia:
    if review['review'] not in seen_review:
        seen_review.append(review['review'])
        included_reviews_exp.append(review)
    else:
        deleted_reviews_exp.append(review)
print("From the {} Expedia reviews, {} reviews are unique.".format(len(allreviews_expedia), len(included_reviews_exp)))

From the 1227128 Expedia reviews, 32093 reviews are unique.


In [510]:
amount_hotels_exp = []
sorted_included_reviews = sorted(included_reviews_exp, key=itemgetter('hotel'))
for key, group in itertools.groupby(sorted_included_reviews, key=lambda x:x['hotel']):
    print(key,len(list(group)))
    amount_hotels_exp.append(key)
print('There are reviews for {} hotels.'.format(len(amount_hotels_exp)))

A Train Hotel 216
Acostar Hotel 158
Allure Garden Apartments 18
Alp Hotel Amsterdam 95
Amadi Panorama Hotel 48
Amadi Park Hotel 122
Amsterdam Canal Residence 8
Amsterdam Downtown Hotel 46
Amsterdam Forest Hotel 73
Amsterdam Harbour Apartments 69
Amsterdam Teleport Hotel 243
Apple Inn Hotel 540
Armada Hotel 69
Atlantis Hotel Amsterdam 63
BackStage Hotel Amsterdam 47
Banks Mansion 743
Bastion Hotel Zaandam 71
Best Western Delphi Hotel 96
Best Western Plus Amedia Amsterdam Airport 80
Best Western Plus Amsterdam Airport Hotel 578
Best Western Zaan Inn 273
Bilderberg Garden Hotel 540
Blue Mansion Hotel 37
Boutique Hotel La Belle Vue 98
Boutique Hotel Maxime 27
Breitner House 31
Canal Boutique Rooms & Apartments 119
Canal House 210
Canal House Suites at Sofitel Legend The Grand Amsterdam 3
City Hotel Amsterdam 129
City Park Apartments 14
Cityden Museum Square Hotel Apartments 59
Cityden Up Amsterdam South Hotel Apartments 100
Conscious Hotel Museum Square 296
Conscious Hotel Westerpark 4
Con

delete hotels with 0 reviews!

In [542]:
correcthotels_exp = []
for hotel in correcthotels_expedia:
    if len(hotel['reviews']) != 0:
        correcthotels_exp.append(hotel)
print('There are {} hotels with reviews'.format(len(correcthotels_exp)))

There are 198 hotels with reviews


## 4. Save cleaned Datasets

### 4.1 TripAdvisor

#### 4.2.1 Save the hotels in a file.

In [557]:
with open('allhotels_tripadvisor.pkl', 'wb') as fo:
    pickle.dump(correcthotels_trip, fo)

#### 4.2.2 Save the reviews in a file.

In [537]:
with open('allreviews_tripadvisor.pkl', 'wb') as fo:
    pickle.dump(included_reviews_trip, fo)

### 4.2 Expedia

#### 4.2.1 Save the hotels in a file.

In [558]:
with open('allhotels_expedia.pkl', 'wb') as fo:
    pickle.dump(correcthotels_exp, fo)

#### 4.2.2 Save the reviews in a file.

In [539]:
with open('allreviews_expedia.pkl', 'wb') as fo:
    pickle.dump(included_reviews_exp, fo)