In [1]:
#import csv for file saving
import csv

# import the Requests HTTP library
import requests

# import the Beautiful Soup module 
from bs4 import BeautifulSoup

# import pandas for reading the saved CSV back in
import pandas as pd

# 1\. Initial Tests to understand the HTML structure for Webscraping

"requests" lets us fetch the HTML of a given URL.  Then, BeautifulSoup is used to parse the HTML into a navigable, readable form.  Both steps are needed throughout the scraping process, so a function "soupify" is written.  

useful sources: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [2]:
r = requests.get("https://www.tripadvisor.com/Attraction_Review-g295424-d676922-Reviews-Burj_Khalifa-Dubai_Emirate_of_Dubai.html")

In [3]:
soup = BeautifulSoup(r.content, 'html.parser')

In [4]:
s = soup.prettify()

In [5]:
# call form of print: works on Python 2 and Python 3 alike
print(soup.title)

<title>Burj Khalifa (Dubai, United Arab Emirates): Top Tips Before You Go (with Photos) - TripAdvisor</title>


In [6]:
def soupify(url):
    """Fetch url and parse the response body with BeautifulSoup's 'html.parser'.

    url -- address passed to requests.get

    Returns a tuple (r, soup): the requests response and the parsed document.
    """
    r = requests.get(url)
    # No prettify() call here: it only returns a formatted string and leaves
    # the soup object unchanged, so calling it without using the result is wasted work.
    soup = BeautifulSoup(r.content, 'html.parser')
    return r, soup

In [7]:
r, soup = soupify("https://www.tripadvisor.com/Attraction_Review-g295424-d676922-Reviews-Burj_Khalifa-Dubai_Emirate_of_Dubai.html")

In [8]:
#testing to verify if the function has worked
soup.title

<title>Burj Khalifa (Dubai, United Arab Emirates): Top Tips Before You Go (with Photos) - TripAdvisor</title>

# 2\. Scraping TripAdvisor

For this project, the reviews on TripAdvisor are scraped.  The links to the reviews are anchor tags 'a' with the class label 'review_count', which is used to access the urls for reviews.  The review addresses are then stored in a list.  The function 'rev_address' is defined for this purpose.  As an initial step, the reviews for Botswana are scraped.

In [9]:
r, soup = soupify("https://www.tripadvisor.com/Attractions-g293766-Activities-Botswana.html")

In [10]:
def rev_address(r, soup):
    """Collect the href of every <a class="review_count"> element in soup.

    r    -- not used by this function
    soup -- parsed page exposing find_all(tag, css_class)

    Returns a list of href values in the order find_all yields them.
    """
    review_anchors = soup.find_all('a', 'review_count')
    return [anchor['href'] for anchor in review_anchors]

In [11]:
r_links = rev_address(r,soup)

In [12]:
r_links

[u'/Attraction_Review-g317055-d1643251-Reviews-Okavango_Delta-Maun_North_West_District.html#REVIEWS',
 u'/Attraction_Review-g316101-d553850-Reviews-Moremi_Wildlife_Reserve-Moremi_Game_Reserve_Okavango_Delta_North_West_District.html#REVIEWS',
 u'/Attraction_Review-g297318-d12172119-Reviews-Chobe_National_Park-Kasane_North_West_District.html#REVIEWS',
 u'/Attraction_Review-g472669-d478945-Reviews-Savute_Reserve-Chobe_National_Park_North_West_District.html#REVIEWS',
 u'/Attraction_Review-g293767-d3396948-Reviews-Three_Chiefs_Statues-Gaborone_South_East_District.html#REVIEWS',
 u'/Attraction_Review-g472673-d1171922-Reviews-Khwai_River_Bridge-Okavango_Delta_North_West_District.html#REVIEWS',
 u'/Attraction_Review-g480161-d311243-Reviews-Tsodilo_Hills-Shakawe_North_West_District.html#REVIEWS',
 u'/Attraction_Review-g293767-d3247057-Reviews-ISKCON_Gaborone-Gaborone_South_East_District.html#REVIEWS',
 u'/Attraction_Review-g317055-d1643251-Reviews-Okavango_Delta-Maun_North_West_District.html#RE

Since these urls are not in an acceptable format for requests, we need to concatenate the base url to these addresses.  Function 'format_url' is defined for this purpose

In [13]:
def format_url(r_links):
    """Turn site-relative review paths into absolute TripAdvisor URLs.

    r_links -- list of link strings; it is updated in place.

    Entries that already start with 'http://' or 'https://' are left untouched,
    so running this twice on the same list (e.g. re-executing a cell) does not
    prepend the base a second time.

    Returns the same list object that was passed in.
    """
    base = 'http://www.tripadvisor.com{}'
    for i, link in enumerate(r_links):
        if not link.startswith(('http://', 'https://')):
            r_links[i] = base.format(link)
    return r_links

In [14]:
#to confirm concatenation
r_links = format_url(r_links)

In [15]:
print("\n".join(r_links))

http://www.tripadvisor.com/Attraction_Review-g317055-d1643251-Reviews-Okavango_Delta-Maun_North_West_District.html#REVIEWS
http://www.tripadvisor.com/Attraction_Review-g316101-d553850-Reviews-Moremi_Wildlife_Reserve-Moremi_Game_Reserve_Okavango_Delta_North_West_District.html#REVIEWS
http://www.tripadvisor.com/Attraction_Review-g297318-d12172119-Reviews-Chobe_National_Park-Kasane_North_West_District.html#REVIEWS
http://www.tripadvisor.com/Attraction_Review-g472669-d478945-Reviews-Savute_Reserve-Chobe_National_Park_North_West_District.html#REVIEWS
http://www.tripadvisor.com/Attraction_Review-g293767-d3396948-Reviews-Three_Chiefs_Statues-Gaborone_South_East_District.html#REVIEWS
http://www.tripadvisor.com/Attraction_Review-g472673-d1171922-Reviews-Khwai_River_Bridge-Okavango_Delta_North_West_District.html#REVIEWS
http://www.tripadvisor.com/Attraction_Review-g480161-d311243-Reviews-Tsodilo_Hills-Shakawe_North_West_District.html#REVIEWS
http://www.tripadvisor.com/Attraction_Review-g293767-d

# 3\. Scraping Reviews for Okavango Delta trips from Botswana category

In [16]:
r_links[0]

'http://www.tripadvisor.com/Attraction_Review-g317055-d1643251-Reviews-Okavango_Delta-Maun_North_West_District.html#REVIEWS'

In [17]:
r, soup = soupify(r_links[0])

By inspecting the above url, I noticed that the reviews are within 'p' tags, so I select them: 

In [18]:
p_list = soup.find_all('p', 'partial_entry')

Below prints out an example entry of a review:

In [19]:
p_list[3]

<p class="partial_entry">Wild Africa at its best! This is an amazing place and it was a huge privilege to visit and stay within it at Gunns Concession.\nWe arrived by air from Maun - in a four seat plane (including the pilot)! It was so exciting! And...<span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickExpand',event,this);">More</span></p>

For each url, there are 10 reviews stored:

In [20]:
len(p_list)

10

Then, this is something we can use repeatedly as well, so we define a function:

In [21]:
def list_reviews(r, soup):
    """Return every <p class="partial_entry"> element found in soup.

    r    -- not used by this function
    soup -- parsed page exposing find_all(tag, css_class)
    """
    return soup.find_all('p', 'partial_entry')

In [22]:
reviews = list_reviews(r, soup)

In [23]:
#need to revise below:

In [24]:
# Replace each <p> element in p_list with its .string value (in place).
# NOTE: .string is None for a tag with more than one child, so truncated
# reviews that carry a trailing "More" <span> (like p_list[3] shown earlier)
# come out as None rather than text.
for i in range(len(p_list)):
    p_list[i] = p_list[i].string

In [25]:
p_list

[u'wonderfull delta, peace and quit, lots of birds, and elephants. Nice guide, nice walk, nice food, really good to do !',
 u'We camped overnight on an island in the delta. The locals provided small boats (mokoro) and poled us out to the camp. From the camp we did wildlife viewing walks. Simply amazing.',
 u'The place has a dreamy feel about it. The game is good and there is a variety of accommodation you can consider from high end to overland trucking/tenting. A flight over the delta gives you the best impression of its expanse.',
 None,
 u'The diversity of the area has to be seen and experienced (you will not be disappointed), as it is truly nature (and Africa) at its best.\nIt will leave you wanting more, much more!!!',
 None,
 None,
 None,
 None,
 u'Is a amazing experience! I did the airplane and the next day the helicopter and is speechless. I recommend this activities. Helicopter is more expensive but worth it!']

While looking for more reviews, I noticed that the reviews are split across separate urls, 10 per page, marked by 'Reviews-or10', 'Reviews-or20', 'Reviews-or30', and so on.  So we need to manipulate the urls to build the right links.  This is also something that will be done repeatedly, so we define a function.

In [26]:
ind = r_links[0].find('Reviews-')
ind

62

In [27]:
# splice the page tag in right after the 'Reviews-' marker located at index `ind`
marker_end = ind + len('Reviews-')
link_new = r_links[0][:marker_end] + 'or10-' + r_links[0][marker_end:]
print(link_new)

http://www.tripadvisor.com/Attraction_Review-g317055-d1643251-Reviews-or10-Okavango_Delta-Maun_North_West_District.html#REVIEWS


In [28]:
def pgs_reviews_url(link_old, page):
    """Build the URL of another review page by inserting a page tag after 'Reviews-'.

    link_old -- review URL containing the marker 'Reviews-'
    page     -- tag to insert, e.g. 'or10-'

    Returns the new URL string.
    Raises ValueError when link_old does not contain 'Reviews-'.
    """
    marker = 'Reviews-'
    # work on the argument itself, not on a notebook-level list
    ind = link_old.find(marker)
    if ind == -1:
        raise ValueError("no 'Reviews-' marker in url: {}".format(link_old))
    # split right after the marker and splice the page tag in between
    cut = ind + len(marker)
    return link_old[:cut] + page + link_old[cut:]

In [29]:
link_new = pgs_reviews_url(r_links[0], 'or10-')
# call form of print: works on Python 2 and Python 3 alike
print(link_new)

http://www.tripadvisor.com/Attraction_Review-g317055-d1643251-Reviews-or10-Okavango_Delta-Maun_North_West_District.html#REVIEWS


In [30]:
r, soup = soupify(link_new)

In [31]:
pg2_reviews = list_reviews(r, soup)

In [32]:
pg2_reviews[0]

<p class="partial_entry">From different people I heard a scenic flight over the Okavango Delta is a must do. After contacting some companies, which were fully booked, I ended up with Kavango Air. On the telephone they said a one hour flight costs P3500 for two people. That's...<span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickExpand',event,this);">More</span></p>

In [33]:
#to verify that the next page also contains 10 reviews
len(pg2_reviews)

10

In [34]:
def pg_urltag(endpg):
    '''Build the url tags used to address successive review pages.

    endpg is the exclusive upper bound of the page counter: the result
    starts with the empty string and is followed by 'or10-', 'or20-', ...
    for every i in range(1, endpg).
    '''
    return [''] + ['or{}0-'.format(page) for page in range(1, endpg)]

In [35]:
pg_url = pg_urltag(20)

In [36]:
pg_url

['',
 'or10-',
 'or20-',
 'or30-',
 'or40-',
 'or50-',
 'or60-',
 'or70-',
 'or80-',
 'or90-',
 'or100-',
 'or110-',
 'or120-',
 'or130-',
 'or140-',
 'or150-',
 'or160-',
 'or170-',
 'or180-',
 'or190-']

# Moving Global: Countries!

Eventually, we want to scrape for the selected countries.  I selected the below countries, and a map shows the selected regions in blue:
![Selected countries for this project](img/amCharts.png)

The countries with travel warnings are avoided: see the state department travel warnings for 2017.  Example countries are Syria (terrorism threats), Central Africa (wars), North Korea, etc. 
**(The map doesn't contain Gambia: Liberia was removed and Gambia added.)**

Places are picked using resources below:
   - 1000 Places to See Before You Die, Patricia Schultz
   - TripAdvisor's top 25 rising destinations of 2017
   - personal taste

Gathered lists of urls by continents are defined in this section.

In [37]:
oceania = ['https://www.tripadvisor.com/Attractions-g255055-Activities-Australia.html',
          'https://www.tripadvisor.com/Attractions-g294338-Activities-French_Polynesia.html',
          'https://www.tripadvisor.com/Attractions-g255104-Activities-New_Zealand.html',
          'https://www.tripadvisor.com/Attractions-g294115-Activities-Papua_New_Guinea.html'
          ]

In [38]:
len(oceania)

4

In [39]:
asia = ['https://www.tripadvisor.com/Attractions-g293939-Activities-Cambodia.html',
       'https://www.tripadvisor.com/Attractions-g294211-Activities-China.html',
       'https://www.tripadvisor.com/Attractions-g293860-Activities-India.html',
       'https://www.tripadvisor.com/Attractions-g294225-Activities-Indonesia.html',
       'https://www.tripadvisor.com/Attractions-g293977-Activities-Israel.html',
       'https://www.tripadvisor.com/Attractions-g294232-Activities-Japan.html',
       'https://www.tripadvisor.com/Attractions-g293951-Activities-Malaysia.html',
       'https://www.tripadvisor.com/Attractions-g293955-Activities-Mongolia.html',
       'https://www.tripadvisor.com/Attractions-g293889-Activities-Nepal.html',
       'https://www.tripadvisor.com/Attractions-g294006-Activities-Oman.html',
       'https://www.tripadvisor.com/Attractions-g294196-Activities-South_Korea.html',
       'https://www.tripadvisor.com/Attractions-g293915-Activities-Thailand.html',
       'https://www.tripadvisor.com/Attractions-g294012-Activities-United_Arab_Emirates.html'
       ]

In [40]:
len(asia)

13

In [41]:
africa = ['https://www.tripadvisor.com/Attractions-g293717-Activities-Algeria.html',
         'https://www.tripadvisor.com/Attractions-g293762-Activities-Angola.html',
         'https://www.tripadvisor.com/Attractions-g293766-Activities-Botswana.html',
         'https://www.tripadvisor.com/Attractions-g294192-Activities-Cote_d_Ivoire.html',
         'https://www.tripadvisor.com/Attractions-g294200-Activities-Egypt.html',
         'https://www.tripadvisor.com/Attractions-g294206-Activities-Kenya.html',
         'https://www.tripadvisor.com/Attractions-g293794-Activities-Gambia.html',
         'https://www.tripadvisor.com/Attractions-g293808-Activities-Madagascar.html',
         'https://www.tripadvisor.com/Attractions-g293730-Activities-Morocco.html',
         'https://www.tripadvisor.com/Attractions-g293820-Activities-Namibia.html',
         'https://www.tripadvisor.com/Attractions-g293830-Activities-Senegal.html',
         'https://www.tripadvisor.com/Attractions-g293832-Activities-Sierra_Leone.html',
         'https://www.tripadvisor.com/Attractions-g293740-Activities-South_Africa.html',
         'https://www.tripadvisor.com/Attractions-g293747-Activities-Tanzania.html'
         ]
#possibly get rid of Liberia?

In [42]:
len(africa)

14

In [43]:
sa = ['https://www.tripadvisor.com/Attractions-g294266-Activities-Argentina.html',
      'https://www.tripadvisor.com/Attractions-g294280-Activities-Brazil.html',
      'https://www.tripadvisor.com/Attractions-g294291-Activities-Chile.html',
      'https://www.tripadvisor.com/Attractions-g294311-Activities-Peru.html'
     ]

In [44]:
len(sa)

4

In [45]:
na = ['https://www.tripadvisor.com/Attractions-g291959-Activities-Belize.html',
      'https://www.tripadvisor.com/Attractions-g153339-Activities-Canada.html',
      'https://www.tripadvisor.com/Attractions-g147364-Activities-Cayman_Islands.html',
      'https://www.tripadvisor.com/Attractions-g291982-Activities-Costa_Rica.html',
      'https://www.tripadvisor.com/Attractions-g147270-Activities-Cuba.html',
      'https://www.tripadvisor.com/Attractions-g147277-Activities-Curacao.html',
      'https://www.tripadvisor.com/Attractions-g295111-Activities-Greenland.html',
      'https://www.tripadvisor.com/Attractions-g147295-Activities-Grenada.html',
      'https://www.tripadvisor.com/Attractions-g292016-Activities-Honduras.html',
      'https://www.tripadvisor.com/Attractions-g150768-Activities-Mexico.html',
      'https://www.tripadvisor.com/Attractions-g294477-Activities-Nicaragua.html',
      'https://www.tripadvisor.com/Attractions-g294479-Activities-Panama.html',
      'https://www.tripadvisor.com/Attractions-g191-Activities-United_States.html'
     ]
#cuba?!
#big countries - US, CANADA

In [46]:
len(na)

13

In [47]:
europe = ['https://www.tripadvisor.com/Attractions-g190410-Activities-Austria.html',
      'https://www.tripadvisor.com/Attractions-g294451-Activities-Bulgaria.html',
      'https://www.tripadvisor.com/Attractions-g274684-Activities-Czech_Republic.html',
      'https://www.tripadvisor.com/Attractions-g189512-Activities-Denmark.html',
      'https://www.tripadvisor.com/Attractions-g274952-Activities-Estonia.html',
      'https://www.tripadvisor.com/Attractions-g189896-Activities-Finland.html',
      'https://www.tripadvisor.com/Attractions-g187070-Activities-France.html',
      'https://www.tripadvisor.com/Attractions-g187275-Activities-Germany.html',
      'https://www.tripadvisor.com/Attractions-g189398-Activities-Greece.html',
      'https://www.tripadvisor.com/Attractions-g274881-Activities-Hungary.html',
      'https://www.tripadvisor.com/Attractions-g189952-Activities-Iceland.html',
      'https://www.tripadvisor.com/Attractions-g187768-Activities-Italy.html',
      'https://www.tripadvisor.com/Attractions-g190455-Activities-Norway.html',
      'https://www.tripadvisor.com/Attractions-g189100-Activities-Portugal.html',
      'https://www.tripadvisor.com/Attractions-g294457-Activities-Romania.html',
      'https://www.tripadvisor.com/Attractions-g294459-Activities-Russia.html',
      'https://www.tripadvisor.com/Attractions-g294471-Activities-Serbia.html',
      'https://www.tripadvisor.com/Attractions-g187427-Activities-Spain.html',
      'https://www.tripadvisor.com/Attractions-g189806-Activities-Sweden.html',
      'https://www.tripadvisor.com/Attractions-g293969-Activities-Turkey.html',
      'https://www.tripadvisor.com/Attractions-g186216-Activities-United_Kingdom.html'
     ]

In [48]:
len(europe)

21

In [49]:
len(oceania)+len(na)+len(sa)+len(africa)+len(asia)+len(europe)

69

In [50]:
69*10000

690000

# Data Organization

## Saving URLs to a CSV file

In [51]:
# One continent label per URL. The counts come from the URL lists themselves,
# so the labels stay aligned (zip would otherwise silently truncate) when a
# country is added to or removed from a list.
oceania_label = ['oceania'] * len(oceania)
asia_label = ['asia'] * len(asia)
africa_label = ['africa'] * len(africa)
sa_label = ['sa'] * len(sa)
na_label = ['na'] * len(na)
europe_label = ['europe'] * len(europe)

In [52]:
zip(oceania_label, oceania)

[('oceania',
  'https://www.tripadvisor.com/Attractions-g255055-Activities-Australia.html'),
 ('oceania',
  'https://www.tripadvisor.com/Attractions-g294338-Activities-French_Polynesia.html'),
 ('oceania',
  'https://www.tripadvisor.com/Attractions-g255104-Activities-New_Zealand.html'),
 ('oceania',
  'https://www.tripadvisor.com/Attractions-g294115-Activities-Papua_New_Guinea.html')]

In [53]:
test = open('test.csv', 'wb')

In [54]:
file_writer = csv.writer(test)

In [55]:
for x in zip(oceania_label, oceania):
    file_writer.writerow(x)
# close the handle so the buffered rows are flushed to test.csv
test.close()

In [56]:
def extract_country(x):
    """Take the part of x after its last '-' and return what precedes the first '.' in it."""
    tail = x.rpartition('-')[2]
    return tail.partition('.')[0]

In [57]:
# Derive the country name from every URL, one list per continent.
oceania_countries = [extract_country(url) for url in oceania]

asia_countries = [extract_country(url) for url in asia]

africa_countries = [extract_country(url) for url in africa]

na_countries = [extract_country(url) for url in na]

sa_countries = [extract_country(url) for url in sa]

europe_countries = [extract_country(url) for url in europe]

In [58]:
oceania_total = zip(oceania_label, oceania_countries, oceania)
asia_total = zip(asia_label, asia_countries, asia)
africa_total = zip(africa_label, africa_countries, africa)
sa_total = zip(sa_label, sa_countries, sa)
na_total = zip(na_label, na_countries, na)
europe_total = zip(europe_label, europe_countries, europe)

In [59]:
# Write the header row, then each continent's (continent, country, url) rows
# in the order oceania, asia, africa, sa, na, europe.
with open('world.csv','wb') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['continent','country','url'])
    for continent_rows in (oceania_total, asia_total, africa_total,
                           sa_total, na_total, europe_total):
        csv_out.writerows(continent_rows)

In [60]:
# NOTE(review): the writer cell earlier in this notebook saves 'world.csv' in the
# working directory, while this reads 'data/world.csv' — confirm the file was
# moved or re-saved there, otherwise this check does not look at the file just written.
pd.read_csv('data/world.csv')
#to check

Unnamed: 0,continent,country,url
0,oceania,Australia,https://www.tripadvisor.com/Attractions-g25505...
1,oceania,French_Polynesia,https://www.tripadvisor.com/Attractions-g29433...
2,oceania,New_Zealand,https://www.tripadvisor.com/Attractions-g25510...
3,oceania,Papua_New_Guinea,https://www.tripadvisor.com/Attractions-g29411...
4,asia,Cambodia,https://www.tripadvisor.com/Attractions-g29393...
5,asia,China,https://www.tripadvisor.com/Attractions-g29421...
6,asia,India,https://www.tripadvisor.com/Attractions-g29386...
7,asia,Indonesia,https://www.tripadvisor.com/Attractions-g29422...
8,asia,Israel,https://www.tripadvisor.com/Attractions-g29397...
9,asia,Japan,https://www.tripadvisor.com/Attractions-g29423...
