In [1]:
#coding: utf-8
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import datetime
from tqdm import tqdm#_notebook as tqdm
import multiprocessing
import gzip

In [2]:
def getterm(term):
    r = requests.get('https://www.hotels.com/siteindex_www_hotels_com.xml')
    todf = []
    citylinks = [x.text for x in BeautifulSoup(r.content,'lxml').find_all('loc') if '/%s_www' % term in x.text]
    for city in tqdm(citylinks):
        r2 = requests.get(city)
        soup = BeautifulSoup(gzip.decompress(r2.content),'lxml')
        todf.append(pd.DataFrame([{'name':x.find('loc').text.split('/')[-2],
                                   'id':x.find('loc').text.split('/')[-3][2:],
                                   'url':x.find('loc').text
                                  } for x in soup.find_all('url')])
        )
    return pd.concat(todf)

In [3]:
def recmerge(df1,df2,collist,targetcol):
    out = []
    for col in collist:
        if len(out) < 1:
            out.append(df1.merge(df2[[col,targetcol]],on=col,how='left'))
        else:
            out.append(out[-1][out[-1][targetcol].isnull()].drop(targetcol,axis=1
                       ).merge(df2[[col,targetcol]],on=col,how='left'))
    return pd.concat([x[~x['id'].isnull()] for x in out])

In [4]:
fullhotel = getterm('HOTEL')

100%|██████████| 21/21 [01:56<00:00,  4.93s/it]


In [12]:
def process_hotel(hotelid):
    hotelid = str(hotelid)
    result = {'hotelid':hotelid}
    soup = BeautifulSoup(requests.get('https://www.hotels.com/ho' + hotelid,
                                     params={'locale':'en_IE'}).content,'html5lib')
    for vcard in soup.find_all('div',{'class':'property-description'}):
        for fun, name in [[lambda x: x.find('h1').text,'name'],
                         [lambda x: x.find('span',{'class':'hotel-coordinates'}
                                          ).find_all('meta')[0]['content'],'lat'],
                         [lambda x: x.find('span',{'class':'hotel-coordinates'}
                                          ).find_all('meta')[1]['content'],'lon'],
                         [lambda x: x.find('span',{'class','star-rating-text'}).text,'star'],
                         [lambda x: x.find('span',{'class','street-address'}).text,'street'],
                         [lambda x: x.find('span',{'class','locality'}).text,'locality'],
                         [lambda x: x.find('span',{'class','region'}).text,'region'],
                         [lambda x: x.find('span',{'class','postal-addr'}).text,'addr'],
                         [lambda x: x.find('span',{'class','postal-code'}).text,'zip'],
                         [lambda x: x.find('span',{'class','country-name'}).text,'country'],
                         [lambda x: x.find('div',{'class','tagline'}).text,'tagline']]:
            try:
                result[name] = fun(vcard)
            except:
                #print('FUCKED ----------- %s --- %s' % (name,hotelid))
                pass
    try:
        result['dest-id'] = soup.find('span',{'class':'back-link'}).find('a')['href'].split(
            'destination-id=')[-1].split('&')[-1]
    except:
        #print('FUCKED ----------- %s --- %s' % ('dest-id',hotelid))
        pass
    for findict,name in [[{'data-overview-section-type':"HOTEL_FEATURE"},'base'],
                        [{'data-overview-section-type':"FAMILY_FRIENDLY_SECTION"},'family'],
                        [{'class':'key-facts-container'},'key'],
                        [{'class':'travelling-container'},'travel'],
                        [{'class':'transport-container'},'transport'],
                        [{'class':'badges-and-services-container'},'badges']]:
        baselist = soup.find('div',findict)
        try:
            lis = baselist.find_all('li')
        except:
            #print('FUCKED ----------- %s --- %s' % (name,hotelid))
            lis = []
            result[name + '_missing'] = 1
        for li in lis:
            num_found = False
            for reg,num_name in [['This hotel has (\d+)','rooms'],
                        ['This hotel is arranged over (\d+)','floors'],
                        ['(\d+) restaurants','restaurants'],
                        ['(\d+) outdoor pools','outdoor pools'],
                        ['(\d+) spa tubs','spa tubs'],
                        ['(\d+) bars/lounges','bars/lounges'],
                        ['(\d+) poolside bars','poolside bars'],
                        ['(\d+) poolside bars','poolside bars'],
                        ['(\d) smoke-free guestrooms','smoke-free guestrooms'],
                        ['(\d) guestrooms','guestrooms'],
                        ['(\d) apartments','apartments'],
                        ['(\d) smoke-free apartments','smoke-free apartments'],
                        ['(\d) inch flat-screen TV','inch flat-screen TV'],
                        ['(\d) inch LCD TV','inch LCD TV'],
                        ['(\d) inch TV','inch TV'],
                        ['(\d) Smart TV','Smart TV'],
                        ['(\d) LED TV','LED TV'],
                        ['(\d) inch plasma TV','inch plasma TV'],
                        ['(\d) villas','villas'],
                        ['(\d) smoke-free accommodations','smoke-free accommodations']]:
                ref = re.findall(reg,li.text)
                if len(ref) > 0:
                    result[num_name] = ref[0]
                    num_found = True
            if num_found:
                continue
            if name == 'key':
                if 'Check-in time' in li.text:
                    result['Check-in time'] = li.text.split('time ')[-1]
                    continue
                if 'Check-out time' in li.text:
                    result['Check-out time'] = li.text.split('time ')[-1]
                    continue
            result[name + '_' + li.text] = 1


    for findict,name in [[{'class':'fact-sheets in-the-property-module'},'inhotel'],
                         [{'class':'fact-sheets in-the-room-module'},'inroom']]:    
        inhotel = soup.find('div',findict)
        try:
            lis = inhotel.find_all('div',{'class':'fact-sheet-table-row'})
        except:
            #print('FUCKED ----------- %s --- %s' % (name,hotelid))
            result[name + '_missing'] = 1
            lis = []
        for li in lis:
            head = li.find('div',{'class':'fact-sheet-table-header'})
            try:
                head = head.text
                cells = li.find_all('li')
            except:
                #print('FUCKED ----------- %s --- %s' % (name,hotelid))
                cells = []
            for item in cells:
                result[name + '_' + head + '_' + item.text] = 1
    return result

#@ray.remote
#def ray_hotel_list(hlist):
#    return pd.DataFrame([process_hotel(h) for h in hlist])

def mp_hotel_list(hlist):
    return pd.DataFrame([process_hotel(h) for h in tqdm(hlist)])

In [14]:
splitnum = multiprocessing.cpu_count() * 4
#hotel_atts = pd.concat(ray.get([ray_hotel_list.remote(hlist) for
#                        hlist in np.array_split(
#                        fullhotel['id'].sample(100).values,splitnum)])).set_index('hotelid')

pool = multiprocessing.Pool(splitnum)

hotel_atts = pd.concat(pool.map(mp_hotel_list,[hlist for
                        hlist in np.array_split(
                        fullhotel['id'].sample(1000).values,splitnum)])).set_index('hotelid')

100%|██████████| 62/62 [01:20<00:00,  1.07s/it]
100%|██████████| 62/62 [01:21<00:00,  1.23s/it]
100%|██████████| 63/63 [01:21<00:00,  1.19s/it]
100%|██████████| 62/62 [01:21<00:00,  1.21s/it]
100%|██████████| 62/62 [01:23<00:00,  1.21s/it]
100%|██████████| 62/62 [01:23<00:00,  1.16s/it]
100%|██████████| 63/63 [01:23<00:00,  1.11s/it]
100%|██████████| 62/62 [01:23<00:00,  1.20s/it]
100%|██████████| 63/63 [01:24<00:00,  1.00s/it]
100%|██████████| 63/63 [01:24<00:00,  1.09it/s]
100%|██████████| 63/63 [01:25<00:00,  1.01it/s]
 92%|█████████▏| 57/62 [01:25<00:04,  1.02it/s]
100%|██████████| 63/63 [01:25<00:00,  1.01s/it]
100%|██████████| 62/62 [01:27<00:00,  1.05s/it]
100%|██████████| 63/63 [01:28<00:00,  1.05it/s]
100%|██████████| 62/62 [01:30<00:00,  1.03it/s]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


In [20]:
hotel_atts.to_csv('../data/external/hotel-atts.csv',index='hotelid')

In [17]:
hotel_atts['region'].value_counts()

England,                     51
FL,                          30
CA,                          25
Scotland,                    11
NSW,                         11
QLD,                         11
CO,                          10
NC,                           8
TX,                           8
NY,                           8
AL,                           8
Bali,                         8
TN,                           7
VIC,                          7
OR,                           6
MN,                           6
SC,                           6
Florida,                      6
PA,                           6
HI,                           5
BC,                           5
ON,                           5
Western Cape,                 5
NA,                           5
QROO,                         5
Shaanxi,                      4
Surat Thani,                  4
WA,                           4
AZ,                           4
Zhejiang,                     4
                             ..
Santorin

In [469]:
testdf2.to_csv('../data/external/hotel-atts.csv')

In [370]:
hotelcols = testdf.columns

In [461]:
doubles = pd.Series(hotelcols).apply(lambda x: re.findall('(.*?)\d+(.*)',x))
doubles = doubles[doubles.str.len() > 0]
doubles = doubles.apply(lambda x: pd.Series(x[0]).apply(lambda x: x.strip()))#.drop_duplicates()
doubles.apply(lambda x: '-'.join(x),axis=1).value_counts().reset_index()

0                            base_-smoke-free guestrooms
1                                       base_-guestrooms
2                                       base_-apartments
3                            base_-smoke-free apartments
4            inroom_Be entertained_--inch flat-screen TV
5      travel_Children (-years old and younger) stay ...
6      travel_One child (-years old and younger) stay...
7                    inroom_Be entertained_--inch LCD TV
8                           key_Minimum check-in age is-
9      travel_Children (-years old and younger) not a...
10                       inroom_Be entertained_--inch TV
11                   inhotel_Food and drink_-restaurants
12                 inroom_Be entertained_--inch Smart TV
13                   inroom_Be entertained_--inch LED TV
14                                     base_-restaurants
15                        key_Check-in time starts at-PM
16                             key_Check-in time-PM-9 PM
17                             

In [420]:
testdf[~testdf['key_Monday - Sunday: 8 AM - 9 PM'].isnull()]['hotelid']

60    428319
Name: hotelid, dtype: object

In [352]:
rdate = requests.get('https://www.hotels.com/ho281147/?q-check-out=2018-10-17&tab=description&q-room-0-adults=2&YGF=3&q-check-in=2018-10-16&MGT=1&WOE=3&WOD=2&ZSX=0&SYE=3&q-room-0-children=0&locale=en_IE')

In [353]:
print(BeautifulSoup(rdate.content,'html5lib').find('div',{'class':'room-details resp-module'}).parent.parent.parent.prettify())#.find('form').find_all('input')

AttributeError: 'NoneType' object has no attribute 'parent'

In [327]:
fullcity = getterm('CITY')




In [329]:
fullcity['place'] = fullcity['name'].apply(lambda x: '-'.join(x.split('-')[1:]))
fullcity['place-1'] = fullcity['place'].apply(lambda x: x.split('-')[0])
fullcity['place-2'] = fullcity['place'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [None]:
c = requests.get('https://www.boldtuesday.com/pages/alphabetical-list-of-all-countries-and-capitals-shown-on-list-of-countries-poster').content
capitals = pd.read_html(c,header=0)[0]
c2 = requests.get('https://en.wikipedia.org/wiki/List_of_largest_cities').content
biguns = pd.read_html(c2,header=0)[1]
c3 = requests.get('https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population').content
bigus = pd.read_html(c3,header=0)[4]
c4 = requests.get('https://en.wikipedia.org/wiki/List_of_cities_in_the_European_Union_by_population_within_city_limits').content
bigeu = pd.read_html(c4,header=0)[0]
citydf = pd.concat([capitals.rename({'COUNTRY':'Nation','CAPITAL':'City'},axis='columns'),
           bigeu.rename({'Country':'Nation'},axis='columns')[['City','Nation']],
           pd.DataFrame({'City':bigus.iloc[:,[1,2]].apply(lambda x: ', '.join(x),axis=1),'Nation':'United States'}),
           biguns[['City','Nation']]]).dropna().apply(lambda x: x.apply(lambda y: y.title()))
citydf['City'] = citydf['City'].apply(lambda x: re.sub('\[.*?\]','',x))
citydf = citydf.drop_duplicates('City')

In [332]:
citydf['place'] = citydf.apply(lambda x: '-'.join(
    x[['City','Nation']]).lower().replace(' ','-'
                                 ).replace(',',''
                                 ).replace('-united-states',''
                                 ).replace("'",'-'
                                 ).replace('á','a')
                               ,axis=1)
citydf = citydf.drop_duplicates('place')
citydf['place-1'] = citydf.apply(lambda x: x['City'].lower(
                                 ).replace(' ','-'
                                 ).replace(',',''
                                 ).replace("'",''
                                 ).replace('á','a')
                               ,axis=1)
citydf['place-2'] = citydf['place-1']

merged_city = recmerge(citydf,fullcity,['place','place-1','place-2'],'id')

In [None]:
HEADERS = {'Accept-Language':'en-US,en;q=0.5',
           'Cookie':'SSLB=1; SSID=CABKoh3gAFAAAAAod3dZgmrACCh3d1kLAAAAAADhxdVatXboWQCSjOoMAAWdiQAAtXboWQEAGQ0ABzaLAABzJK1ZCgANDQAFi4oAALV26FkBACgNAAf3iwAAtXboWQEAGg0AAzmLAABzJK1ZCgDhDAAF54gAALV26FkBAMQMAAVfhwAAtXboWQEA5AsABXB8AAC1duhZAQC1DAAF5YUAALV26FkBAC4NAAVxjAAAtXboWQEALQ0ABW6MAAC1duhZAQAuDAAFFIEAALV26FkBAM0MAAWbhwAAtXboWQEAAw0AB0mKAABzJK1ZCgD9DAAFEYoAALV26FkBAAwNAAGFigAAtXboWQEAPAwAANIMAAAeDAAAwwwAAKsKAABXDAAAigwAABENAAC7DAAAwQwAALgMAAAjDQAAuQwAANcKAAAkDQAAOgwAAO8MAAAiDQAA8wsAAAQKAAA; SSRT=h4DoWQIDAQ; SSPV=fFoAAAAAACABBgAAAAAAAAAAAAQAAAAAAAA; user=QSplbl9JRXxIQ09NX0VNRUE.; mvthistory=eJxNlUuuHDEIRXdUMj9jPIuiSJk8ZQkZZw9ZfHBxcWfUxxjzpzrWI894%2FjBvkpXEm%2Bd46Mh0s3vC2DLGM1%2B1ZLa6ti0yC3mLn%2FsjnVvDHj5MtG2M14JZvnoVfHsM3Mder9Ox14z69dKPoWWabIfaK6MxAkDaAFeRPFdHQzS9wqFUosB7DoIGnwOXCtM5CPTXOejnRgZcKV2AOWV5IuBdBbHpVEiPbxtVwYRVkcph6BpVtuQodELVgxbDy3Kqh354ljC0gMeKNstEVEJy9CWzYVrdjVRhKs%2FMs7JhYa7rbPztpxy%2B1WRRvJI5AEtgdPPJhjEvltaogmVDS9mWAUIbEP4cXPOXbeYp3fNzY4JD8kRizuszmgzz7nIj9TbsUQXlhQHkJYgiUH0OzFq2mBsUYN5VkZF5OMZBxt2ItQCx7j6QSAnP4twlGeU4IQD0eaKCMcz25AHvFcGKooiiTl2PZNwae4WW5RMzZOGCGFd1%2B32zGKajenhqmHySyNvotKIWYesYZU0J454QAOxsLoJ255Xm7ZeSj54jpWg7aI0yOqKsDrCW2JXABU%2Fv5qoMbMI8DLNZoY%2FCbOHqYqX07GsNDZ3D%2FP8A4%2FqqBYKOPCgiUkP22raNZhc12f%2FjVSyH4yNnugGacM9vst%2BCmcSHleBIsQ7zMIpnzre05gph13jia6OTUCpLZu1PRLL1XOucaLWP1YAOu9iN2adeXhgTDWnowAI7qrHKbH75F4DxH8BVw78%2Fv%2F%2F6%2Bv3j68e3fyveV0M%3D; guid=8577897d-e0dd-41b6-bb87-e0c7623e1ac6; s_fid=46E32F2FDCBD2CD8-26FE0E1C4D45ACA6; _ga=GA1.2.725440213.1501001517; s_vi=[CS]v1|2CBBBB9685316B18-4000010B400016D2[CE]; AMCV_C00802BE5330A8350A490D4C%40AdobeOrg=-1330315163%7CMCAID%7C2CBBBB9685316B18-4000010B400016D2%7CMCIDTS%7C17459%7CMCMID%7C58816101718994981334228947833632971511%7CMCAAMLH-1509011767%7C6%7CMCAAMB-1509011767%7CTGF0eZrks1DsVMCPfHyQDFnTcTvl83liCN-3ux4fEx6Rm-E%7CMCOPTOUT-1508414167s%7CNONE; 13211=2CBBBB9685316B18-4000010B400016D2; _cc=AdIP8KWCSu4DZRs5hysg6HJa; homepage_search_data=Vmllbm5hLCBBdXN0cmlh%2F%2F15%2F01%2F2018%2F%2F16%2F01%2F2018%2F%2F2%2F%2Fdd%2FMM%2Fyyyy%2F%2F39254%2F%2F; _vz=viz_59ad248236eca; akacd_pr_5=1510826165~rv=74~id=e4754031f85979d8cb7c31e0bfc6ef8b; _gid=GA1.2.1096243265.1508406967; searchReturningUser=true; propertyReturningUser=true'}

APIURL = 'https://www.hotels.com/search/listings.json'

def get_dict(destid,chout,chin):
    i = 1
    out = []
    while True:
        print i
        params = {'destination-id':destid,
                  'q-check-out=':chout,
                  'q-check-in':chin,
                  'q-room-0-adults':2,
                  'q-rooms':1,
                  'q-room-0-children':0,
                  'locale':'en_IE',
                  'pn':i}
        page = requests.get(APIURL,params=params,headers=HEADERS)
        datastring = page.content
        #datastring = script.text.strip().replace('&amp;','&').split(';')[0][4:]
        datastring = datastring.replace('\t','')
        datastring = datastring.replace('true','True')
        datastring = datastring.replace('false','False')
        exec('json = ' + datastring)
        #DO THE D'S
        pno = json['data']['body']['searchResults']['pagination']['currentPage']
        if pno < i:
            break
        i += 1
        res = json['data']['body']['searchResults']['results']
        dlist = make_list_of_dicts(res)
        out += copy.deepcopy(dlist)
    return out


def make_list_of_dicts(res):
    out = []
    for d in res:
        next_d = {}
        for key in d:
            if type(d[key]) == dict:
                for key2 in d[key]:
                    if type(d[key][key2]) == dict:
                        for key3 in d[key][key2]:
                            next_d[key + '-' + key2 + '-' + key3] = d[key][key2][key3]
                    else:
                        next_d[key + '-' + key2] = d[key][key2]
            elif type(d[key]) == list:
                j = 0
                for elem in d[key]:
                    j += 1
                    if type(elem) == dict:
                        for key2 in elem:
                            #print key + '-' + str(j) + '-' + key2
                            #print elem[key2]
                            next_d[key + '-' + str(j) + '-' + key2] = elem[key2]
                    else:
                        next_d[key + '-' + str(j)] = elem
            else:
                next_d[key] = d[key]
        out.append(copy.deepcopy(next_d))
    return out
    

'''
l = len(IDS)

full = []

l = 2

for k in range(l):
    print k, ' OF ', l
    for dates in DATES[:2]
        print dates
        dlist = get_dict(IDS[k],dates[0],dates[1])
        full += copy.deepcopy(dlist)

df = pd.DataFrame(full)

df.to_csv('out.csv')

'''

