In [1]:
import pandas as pd
import json
import re

In [2]:
# Read the training CSV into a DataFrame and preview its first three rows.
data_train = pd.read_csv(filepath_or_buffer='../input/sf-booking/hotels_train.csv')
data_train.head(n=3)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,0,14,7.5,"[' Leisure trip ', ' Solo traveler ', ' Modern...",289 day,48.845377,2.325643


In [3]:
# Read the test CSV into a DataFrame and preview its first three rows.
data_test = pd.read_csv(filepath_or_buffer='../input/sf-booking/hotels_test.csv')
data_test.head(n=3)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,tags,days_since_review,lat,lng
0,Via Senigallia 6 20161 Milan Italy,904,7/21/2017,8.1,Hotel Da Vinci,United Kingdom,Would have appreciated a shop in the hotel th...,52,16670,Hotel was great clean friendly staff free bre...,62,1,"[' Leisure trip ', ' Couple ', ' Double Room '...",13 days,45.533137,9.171102
1,Arlandaweg 10 Westpoort 1043 EW Amsterdam Neth...,612,12/12/2016,8.6,Urban Lodge Hotel,Belgium,No tissue paper box was present at the room,10,5018,No Positive,0,7,"[' Leisure trip ', ' Group ', ' Triple Room ',...",234 day,52.385649,4.834443
2,Mallorca 251 Eixample 08008 Barcelona Spain,46,11/26/2015,8.3,Alexandra Barcelona A DoubleTree by Hilton,Sweden,Pillows,3,351,Nice welcoming and service,5,15,"[' Business trip ', ' Solo traveler ', ' Twin ...",616 day,41.393192,2.16152


In [4]:
# Work on copies so the frames loaded from disk are never modified.
df_train, df_test = (frame.copy() for frame in (data_train, data_test))

In [5]:
# Every distinct, whitespace-trimmed tag found in the train and test frames.
all_tags = set()

# Stack both frames row-wise so tags from either one are collected.
df_all = pd.concat([df_train, df_test], axis=0)


def process_tags_col(tags):
    """Add all tags from one raw ``tags`` cell to the global ``all_tags`` set.

    ``tags`` is the string form of a list, e.g. "[' Leisure trip ', ' Couple ']".
    Single quotes are swapped for double quotes so the text can be decoded as
    JSON; a tag that itself contains a quote character would fail to parse.
    Returns ``None`` -- the function is used only for its side effect.
    """
    parsed = json.loads(tags.replace("'", '"'))
    all_tags.update(item.strip() for item in parsed)


# The Series returned by ``apply`` is discarded; only ``all_tags`` matters.
df_all['tags'].apply(process_tags_col)

all_tags

{'Superior Queen Suite with Sofa Bed 2 Adults',
 'Premium Double or Twin Room with Garden View',
 'Privilege Double Room',
 'Petite Double Room',
 'Standard Double or Twin Room with Balcony 1 Adult',
 'Superior Double Twin Room with Free Parking',
 'Art Deco Room Balcony',
 'Twin Guest Room',
 'Queen Superior Room',
 'Superior room Design Style Free Wifi',
 'Double Room Relax',
 'Garden King Room',
 'Queen or Twin Room',
 'Comfort Double Room',
 'Executive King Room with Sofa Bed Non Smoking',
 'Corner Double Room',
 'Standard Queen or Twin Room Lower Ground Floor',
 'Westminster Suite with King Bed',
 'Premium Room Grand Pigalle',
 'Special Offer Double Room Park and Sleep inklusive Breakfast',
 'Executive Double or Twin Room 3 Adults',
 'Couture Junior Suite',
 'Deluxe Balcony Room',
 'Prestige Double or Twin Room',
 'Comfort Twin Double Room',
 'Classic Junior Suite',
 'Privilege Room With View',
 'Superior Double Room with Sofa Bed and Pool View',
 'Double Room with Balcony',
 'And

In [6]:
# Number of distinct tags collected in ``all_tags``.
len(all_tags)

2428

In [7]:
# Start every known tag at zero; ``count_tags`` then tallies the occurrences.
tags_counts = dict.fromkeys(all_tags, 0)


def count_tags(tags):
    """Increment ``tags_counts`` once per tag found in one raw ``tags`` cell.

    ``tags`` is the string form of a list; it is decoded the same way as in
    ``process_tags_col`` (single quotes replaced by double quotes, then JSON).
    A tag that is not already a key of ``tags_counts`` raises ``KeyError``.
    """
    for raw_tag in json.loads(tags.replace("'", '"')):
        tags_counts[raw_tag.strip()] += 1


# Run for the side effect on ``tags_counts``; the returned Series is discarded.
df_all['tags'].apply(count_tags)

tags_counts

{'Superior Queen Suite with Sofa Bed 2 Adults': 29,
 'Premium Double or Twin Room with Garden View': 8,
 'Privilege Double Room': 228,
 'Petite Double Room': 14,
 'Standard Double or Twin Room with Balcony 1 Adult': 72,
 'Superior Double Twin Room with Free Parking': 33,
 'Art Deco Room Balcony': 1,
 'Twin Guest Room': 2197,
 'Queen Superior Room': 43,
 'Superior room Design Style Free Wifi': 13,
 'Double Room Relax': 13,
 'Garden King Room': 92,
 'Queen or Twin Room': 70,
 'Comfort Double Room': 2641,
 'Executive King Room with Sofa Bed Non Smoking': 23,
 'Corner Double Room': 34,
 'Standard Queen or Twin Room Lower Ground Floor': 64,
 'Westminster Suite with King Bed': 1,
 'Premium Room Grand Pigalle': 21,
 'Special Offer Double Room Park and Sleep inklusive Breakfast': 3,
 'Executive Double or Twin Room 3 Adults': 6,
 'Couture Junior Suite': 2,
 'Deluxe Balcony Room': 11,
 'Prestige Double or Twin Room': 23,
 'Comfort Twin Double Room': 285,
 'Classic Junior Suite': 25,
 'Privilege 

In [8]:
# Tag -> count mapping ordered from the most to the least frequent tag.
all_tags_sorted = dict(
    sorted(tags_counts.items(), key=lambda pair: pair[1], reverse=True)
)
all_tags_sorted

{'Leisure trip': 417778,
 'Submitted from a mobile device': 307640,
 'Couple': 252294,
 'Stayed 1 night': 193645,
 'Stayed 2 nights': 133937,
 'Solo traveler': 108545,
 'Stayed 3 nights': 95821,
 'Business trip': 82939,
 'Group': 65392,
 'Family with young children': 61015,
 'Stayed 4 nights': 47817,
 'Double Room': 35207,
 'Standard Double Room': 32248,
 'Superior Double Room': 31393,
 'Family with older children': 26349,
 'Deluxe Double Room': 24823,
 'Double or Twin Room': 22393,
 'Stayed 5 nights': 20845,
 'Standard Double or Twin Room': 17483,
 'Classic Double Room': 16989,
 'Superior Double or Twin Room': 13570,
 '2 rooms': 12393,
 'Stayed 6 nights': 9776,
 'Standard Twin Room': 9745,
 'Single Room': 9670,
 'Twin Room': 8321,
 'Stayed 7 nights': 7399,
 'Executive Double Room': 6425,
 'Classic Double or Twin Room': 6107,
 'Superior Twin Room': 6064,
 'Deluxe Double or Twin Room': 5998,
 'Club Double Room': 5908,
 'Queen Room': 5472,
 'Deluxe King Room': 5357,
 'Superior Queen Room

In [9]:
# Regular expressions used to turn the free-form ``tags`` column into features.
#
# For the keys passed to process_tags_feature() ('purpose', 'group_size',
# 'rooms_count', 'rooms_quality', 'from_mobile') the tags of a review are scanned
# in order and the feature value is the 1-based position of the first pattern
# that matches a tag (0 when nothing matches), so the order of the patterns in
# each list defines the encoded value.
#
# The 'options' patterns are looked up one at a time by process_tags_options()
# using a 1-based index; each pattern backs its own 0/1 column
# (options_1 ... options_9, in list order).
tags_patterns = {
    'purpose': [
        r'Leisure trip',
        r'Business trip'
    ],
    'group_size': [
        r'Solo traveler',
        r'Couple',
        r'Group',
        r'Family with young children',
        r'Family with older children'
    ],
    'rooms_count': [
        r'(Single Room)|(One Bedroom)',
        r'(Double Room)|(Twin Room)|(2 rooms)|(Twin Guest Room)|(Twin Hilton Guest Room)|(Double Hilton Guestroom)',
        r'(Triple Room)|(3 rooms)',
        r'Quadruple Room'
    ],
    'rooms_quality': [
        r'(Basic)|(Small)|(Compact)',
        r'(Economy)|(Standard)|(Classic)',
        r'(Club)|(Comfort)',
        # Any tag containing 'Junior Suite' is already matched by the 'Suite' alternative.
        r'(Superior)|(Executive)|(Suite)|(Junior Suite)',
        r'(Deluxe)|(Luxury)|(Premier)'
    ],
    'from_mobile': [
        r'Submitted from a mobile device'
    ],
    'options': [
        # options_1
        r'Cosy',
        # options_2
        r'(without Window)|(No Window)',
        # options_3
        r'With a pet',
        # options_4
        r'Non Smoking',
        # options_5: any kind of view
        r'(with Landmark View)|(with Winter Garden View)|(with Mountain View)|(with canal view)|(with Water View)|(with Garden View)|(with Iconic View)|(with View)|(with Internal View)|(City View)|(Stadium View)|(Bridge View)|(Street View)|(Courtyard View)|(with Sea View)|(with River View)|(with Park View)|(Eiffel Tower View)|(with State Opera View)|(with View of Big Ben)',
        # options_6: spa / pool
        r'(Spa Bath)|(Spa Access)|(with Spa Fitness Access)|(\sSpa\s)|(Pool)',
        # options_7
        r'(Balcony)|(Terrace)',
        # options_8: extra or specific beds
        r'(Sofa Bed)|(with Extra Bed)|(Double Bed)|(Two Double Beds)|(with Two Queen Beds)',
        # options_9
        r'(Queen)|(King)|(Royal)'
    ]
}

In [10]:
# Fresh deep copy of the training frame that will receive the tag-derived features.
df = data_train.copy(deep=True)

In [11]:
# Create the tag-derived columns up front, initialised to 0; the cells below
# overwrite them with the values computed from the ``tags`` column.
for column_name in (
    'purpose',
    'group_size',
    'rooms_count',
    'rooms_quality',
    'from_mobile',
    'options_1',
    'options_2',
    'options_3',
    'options_4',
    'options_5',
    'options_6',
    'options_7',
    'options_8',
    'options_9',
):
    df[column_name] = 0

In [12]:
def process_tags_feature(tags, f_name):
    """Encode one raw ``tags`` cell as an integer code for feature ``f_name``.

    Parameters
    ----------
    tags : str
        String form of a list of tags, e.g. "[' Leisure trip ', ' Couple ']".
        Single quotes are replaced by double quotes so it can be decoded as JSON.
    f_name : str
        Key of the global ``tags_patterns`` dict selecting the pattern list.

    Returns
    -------
    int
        Tags are scanned in order; for the first tag that matches any pattern,
        the 1-based position of the first matching pattern is returned.
        0 is returned when no tag matches any pattern.
    """
    for tag in json.loads(tags.replace("'", '"')):
        for position, pattern in enumerate(tags_patterns[f_name], start=1):
            if re.search(pattern, tag) is None:
                continue
            return position

    return 0

In [13]:
def process_tags_options(tags, o_index):
    """Return 1 if any tag of a review matches the selected 'options' pattern.

    Parameters
    ----------
    tags : str
        String form of a list of tags, e.g. "[' Couple ', ' Twin Room ']".
        Single quotes are replaced by double quotes so it can be decoded as JSON.
    o_index : int
        1-based position of the pattern inside ``tags_patterns['options']``.

    Returns
    -------
    int
        1 when ``re.search`` finds the pattern in at least one tag, otherwise 0.

    Raises
    ------
    IndexError
        If ``o_index`` is not within ``1 .. len(tags_patterns['options'])``.
    """
    option_patterns = tags_patterns['options']

    # Reject 0 and negative values explicitly: ``o_index - 1`` would otherwise
    # wrap around and silently test a pattern taken from the end of the list.
    if not 1 <= o_index <= len(option_patterns):
        raise IndexError(
            'o_index must be between 1 and {}, got {!r}'.format(
                len(option_patterns), o_index
            )
        )

    # The pattern does not depend on the tag, so resolve it once up front.
    pattern = option_patterns[o_index - 1]

    parsed_tags = json.loads(tags.replace("'", '"'))
    return int(any(re.search(pattern, tag) for tag in parsed_tags))

In [14]:
# Encode each single-valued tag feature as an integer code (0 = no pattern matched).
for feature_name in ('purpose', 'group_size', 'rooms_count', 'rooms_quality', 'from_mobile'):
    df[feature_name] = df['tags'].apply(process_tags_feature, args=(feature_name,))

In [15]:
# One 0/1 flag column per 'options' pattern: options_1 ... options_9.
for option_number in range(1, 10):
    df['options_' + str(option_number)] = df['tags'].apply(
        process_tags_options, args=(option_number,)
    )

In [16]:
# One-hot encode the integer tag codes; each listed column is replaced by
# indicator columns named '<column>_<code>'.
df = pd.get_dummies(
    data=df,
    columns=['purpose', 'group_size', 'rooms_count', 'rooms_quality'],
)

In [17]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,...,rooms_count_1,rooms_count_2,rooms_count_3,rooms_count_4,rooms_quality_0,rooms_quality_1,rooms_quality_2,rooms_quality_3,rooms_quality_4,rooms_quality_5
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,...,0,0,0,0,0,0,0,0,1,0
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,...,0,1,0,0,0,0,1,0,0,0
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,...,0,1,0,0,1,0,0,0,0,0
3,216 Avenue Jean Jaures 19th arr 75019 Paris Fr...,34,9/22/2015,7.5,Mercure Paris 19 Philharmonie La Villette,United Kingdom,No Negative,0,607,Friendly staff quiet comfortable room spotles...,...,0,0,0,0,0,0,1,0,0,0
4,Molenwerf 1 1014 AG Amsterdam Netherlands,914,3/5/2016,8.5,Golden Tulip Amsterdam West,Poland,Torn sheets,4,7586,The staff was very friendly and helpful Break...,...,0,1,0,0,0,0,1,0,0,0
