# Project 3 - 
## Deliverable 1: Scrape Titles Locations from Wikipedia

In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd 
import requests

In [2]:
browser = Browser('chrome')

In [3]:
# Visit website
url = 'https://en.wikipedia.org/wiki/List_of_National_Historic_Landmarks_in_New_York_City'
browser.visit(url)

### Step 2: Scrape the Website

Create a Beautiful Soup object and use it to extract text elements from the website.

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_National_Historic_Landmarks_in_New_York_City"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-typography-survey-disabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of National Historic Landmarks in New York City - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clien

In [6]:
# Extracting all location names and geographical coordinates
locations = []
geo_coords = []
desc_list = []
County = []
lat=[]
long=[]
location_elements = soup.find_all('span', class_='mapframe-coord-name')
for location_element in location_elements:
    location_name = location_element.a.text.strip()
    locations.append(location_name)
    
    geo_coords_element = location_element.find_next('span', class_='geo-default')
    if geo_coords_element:
        geo_coords_str = geo_coords_element.span.text.strip()
        geo_coords_list = geo_coords_str.split(' ')
        if len(geo_coords_list) >= 2:
            latitude, longitude = map(str, geo_coords_list[:2])
            lat.append(latitude)
            long.append(longitude)
            geo_coords.append({'Latitude': latitude, 'Longitude': longitude})
        else:
            geo_coords.append({'Latitude': 'N/A', 'Longitude': 'N/A'}) 
            
    else:
        geo_coords.append({'Latitude': 'N/A', 'Longitude': 'N/A'})
        
    description = location_element.find_next('td', class_='note').text
    desc_list.append(description)
    
    county_name = location_element.find_next('a', class_='mw-redirect').text
    County.append(county_name)

In [7]:
len(lat),len(long),len(locations),len(desc_list)

(116, 116, 116, 116)

In [8]:
# Creating a DataFrame
df = pd.DataFrame({
    'Location Name': locations,
    'Latitude': lat,
    'Longitude': long,
    'Description': desc_list,
    'County':County
})

In [9]:
import re
def convert_to_decimal(lat_long_string):
    match = re.match(r'(\d+)°(\d+)′(\d+)″([NSWE])', lat_long_string)
    if match:
        degrees, minutes, seconds, direction = match.groups()
        decimal_degrees = int(degrees) + int(minutes) / 60 + int(seconds) / 3600
        if direction in ['S', 'W']:
            decimal_degrees *= -1
        return decimal_degrees
    else:
        return pd.NA

In [10]:
df['Latitude']=df['Latitude'].apply(convert_to_decimal)

In [11]:
df['Latitude'].isna().sum()

1

In [12]:
df['Longitude']=df['Longitude'].apply(convert_to_decimal)

In [13]:
df['Latitude']

0      40.741667
1      40.892222
2      40.714444
3      40.704722
4      40.708889
         ...    
111    40.498889
112    40.889167
113    40.712222
114    40.644444
115    40.610833
Name: Latitude, Length: 116, dtype: object

In [15]:
df['Latitude'].value_counts()

Latitude
40.834444    2
40.706111    2
40.727500    2
40.764722    2
40.706389    2
            ..
40.762500    1
40.709722    1
40.748333    1
40.832222    1
40.610833    1
Name: count, Length: 103, dtype: int64

In [16]:
df.head()

Unnamed: 0,Location Name,Latitude,Longitude,Description,County
0,69th Regiment Armory,40.741667,-73.983611,"Home of the watershed Armory Show in 1913, whi...",New York
1,Admiral David Glasgow Farragut Gravesite,40.892222,-73.865833,Only intact known property directly associated...,Bronx
2,African Burial Ground,40.714444,-74.004444,"Dedicated as National Monument on October 5, 2...",New York
3,Ambrose (lightship),40.704722,-74.0025,"Lightship, several miles offshore, that marked...",Ambrose (lightship)
4,American Stock Exchange,40.708889,-74.0125,Former headquarters of the American Stock Exch...,New York


In [17]:
# Lists to store extracted information
locations = []
geo_coordinates = []
desc_list = []

In [18]:
# Extracting the location name
location_name = soup.find('span', class_='mapframe-coord-name').a.text.strip()
locations.append(location_name)

In [19]:
# Extracting the geographical coordinates
geo_coords_str = soup.find('span', class_='geo-default').span.text.strip()

In [20]:
description = soup.find('td', class_='note').a.text.strip()
desc_list.append(description)

In [21]:
# Splitting the geo_coords string into latitude and longitude
latitude, longitude = map(str, geo_coords_str.split(' '))

In [22]:
# Creating a DataFrame
df1 = pd.DataFrame({
    'Location Name': locations,
    'Latitude': [latitude],
    'Longitude': [longitude],
    'Description': description
})

In [23]:
df1

Unnamed: 0,Location Name,Latitude,Longitude,Description
0,69th Regiment Armory,40°44′30″N,73°59′01″W,Armory Show


In [24]:
# Extract all the landmark names
landmark_name = soup.find_all('span', class_='mapframe-coord-name')
landmark_name

[<span class="mapframe-coord-name"><a href="/wiki/69th_Regiment_Armory" title="69th Regiment Armory">69th Regiment Armory</a></span>,
 <span class="mapframe-coord-name"><a href="/wiki/Admiral_David_Glasgow_Farragut_Gravesite" title="Admiral David Glasgow Farragut Gravesite">Admiral David Glasgow Farragut Gravesite</a></span>,
 <span class="mapframe-coord-name"><a href="/wiki/African_Burial_Ground_National_Monument" title="African Burial Ground National Monument">African Burial Ground</a></span>,
 <span class="mapframe-coord-name"><a class="mw-redirect" href="/wiki/United_States_Lightship_LV-87" title="United States Lightship LV-87">Ambrose (lightship)</a></span>,
 <span class="mapframe-coord-name"><a href="/wiki/American_Stock_Exchange_Building" title="American Stock Exchange Building">American Stock Exchange</a></span>,
 <span class="mapframe-coord-name"><a href="/wiki/Louis_Armstrong_House" title="Louis Armstrong House">Louis Armstrong House</a></span>,
 <span class="mapframe-coord-n

In [25]:
# Extract all the location/borough 
location = soup.find_all('span', class_='label')
location

[<span class="label"><a href="/wiki/Manhattan" title="Manhattan">Manhattan</a></span>,
 <span class="label"><a href="/wiki/The_Bronx" title="The Bronx">Bronx</a></span>,
 <span class="label"><a href="/wiki/Manhattan" title="Manhattan">Manhattan</a></span>,
 <span class="label"><a href="/wiki/Manhattan" title="Manhattan">Manhattan</a></span>,
 <span class="label"><a href="/wiki/Manhattan" title="Manhattan">Manhattan</a></span>,
 <span class="label"><a href="/wiki/Corona,_Queens" title="Corona, Queens">Corona</a></span>,
 <span class="label"><a href="/wiki/Manhattan" title="Manhattan">Manhattan</a></span>,
 <span class="label"><a href="/wiki/Rosebank,_Staten_Island" title="Rosebank, Staten Island">Rosebank</a></span>,
 <span class="label"><a href="/wiki/Pelham_Bay_Park" title="Pelham Bay Park">Pelham Bay Park</a></span>,
 <span class="label"><a href="/wiki/Manhattan" title="Manhattan">Manhattan</a></span>,
 <span class="label"><a href="/wiki/Manhattan" title="Manhattan">Manhattan</a></sp

In [26]:
# Extract all the latitude + longitude  
lat = soup.find_all('span', class_='latitude')
lat

[<span class="latitude">40°44′30″N</span>,
 <span class="latitude">40°53′32″N</span>,
 <span class="latitude">40°42′52″N</span>,
 <span class="latitude">40°42′17″N</span>,
 <span class="latitude">40°42′32″N</span>,
 <span class="latitude">40°45′16″N</span>,
 <span class="latitude">40°44′34″N</span>,
 <span class="latitude">40°36′54″N</span>,
 <span class="latitude">40°52′18″N</span>,
 <span class="latitude">40°43′35″N</span>,
 <span class="latitude">40°44′13″N</span>,
 <span class="latitude">40°42′23″N</span>,
 <span class="latitude">40°41′48″N</span>,
 <span class="latitude">40°41′41″N</span>,
 <span class="latitude">40°42′23″N</span>,
 <span class="latitude">40°45′54″N</span>,
 <span class="latitude">40°47′04″N</span>,
 <span class="latitude">40°46′55″N</span>,
 <span class="latitude">40°45′35″N</span>,
 <span class="latitude">40°42′34″N</span>,
 <span class="latitude">40°45′06″N</span>,
 <span class="latitude">40°44′01″N</span>,
 <span class="latitude">40°42′46″N</span>,
 <span clas

In [27]:
long = soup.find_all('span', class_='longitude')
long

[<span class="longitude">73°59′01″W</span>,
 <span class="longitude">73°51′57″W</span>,
 <span class="longitude">74°00′16″W</span>,
 <span class="longitude">74°00′09″W</span>,
 <span class="longitude">74°00′45″W</span>,
 <span class="longitude">73°51′42″W</span>,
 <span class="longitude">73°58′56″W</span>,
 <span class="longitude">74°03′47″W</span>,
 <span class="longitude">73°48′20″W</span>,
 <span class="longitude">73°59′44″W</span>,
 <span class="longitude">74°00′36″W</span>,
 <span class="longitude">73°59′51″W</span>,
 <span class="longitude">73°59′48″W</span>,
 <span class="longitude">73°59′34″W</span>,
 <span class="longitude">73°50′13″W</span>,
 <span class="longitude">73°58′49″W</span>,
 <span class="longitude">73°57′28″W</span>,
 <span class="longitude">73°57′58″W</span>,
 <span class="longitude">73°58′14″W</span>,
 <span class="longitude">74°00′36″W</span>,
 <span class="longitude">73°58′31″W</span>,
 <span class="longitude">73°59′44″W</span>,
 <span class="longitude">74°00′2

In [28]:
# Extract all the descriptions of each landmark 
landmark_description = soup.find_all('td', class_='note')
landmark_description

[<td class="note">Home of the watershed <a href="/wiki/Armory_Show" title="Armory Show">Armory Show</a> in 1913, which introduced America to <a href="/wiki/Modern_art" title="Modern art">modern art</a>
 </td>,
 <td class="note">Only intact known property directly associated with Admiral <a href="/wiki/David_Farragut" title="David Farragut">David Farragut</a>
 </td>,
 <td class="note">Dedicated as National Monument on October 5, 2007; burial site in Lower <a href="/wiki/Manhattan" title="Manhattan">Manhattan</a> of over 419 Africans from 1690s to 1794
 </td>,
 <td class="note">Lightship, several miles offshore, that marked <a href="/wiki/Ambrose_Channel" title="Ambrose Channel">Ambrose Channel</a> into New York Harbor, now at <a href="/wiki/South_Street_Seaport" title="South Street Seaport">South Street Seaport</a> Museum.
 </td>,
 <td class="note">Former headquarters of the <a class="mw-redirect" href="/wiki/American_Stock_Exchange" title="American Stock Exchange">American Stock Exchan

In [29]:
all_elems = soup.find_all('tr', class_='vcard')
all_elems

[<tr class="vcard">
 <th style="background-color: #87CEEB;"><small>1</small>
 </th>
 <td><span class="mapframe-coord-name"><a href="/wiki/69th_Regiment_Armory" title="69th Regiment Armory">69th Regiment Armory</a></span>
 </td>
 <td><figure class="mw-halign-center mw-image-border" typeof="mw:File"><a class="mw-file-description" href="/wiki/File:69th_Regiment_Armory_(51710072774).jpg" title="69th Regiment Armory"><img alt="69th Regiment Armory" class="mw-file-element" data-file-height="3888" data-file-width="3995" decoding="async" height="117" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/e4/69th_Regiment_Armory_%2851710072774%29.jpg/120px-69th_Regiment_Armory_%2851710072774%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/e4/69th_Regiment_Armory_%2851710072774%29.jpg/180px-69th_Regiment_Armory_%2851710072774%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/e4/69th_Regiment_Armory_%2851710072774%29.jpg/240px-69th_Regiment_Armory_%2851710072774%29.jpg

In [30]:
# Create an empty list to store the dictionaries
newyorklandmarks_list = []

In [31]:
# Loop through elements
# Extract the landmark name, locaiton, lat & long, and description and preview text from the elements
# Store each title and preview pair in a dictionary
# Add the dictionary to the list

for l in all_elems:
        
    name = l.find('span', class_='mapframe-coord-name').text
        
    location = l.find('span', class_='label').text
    
    latitude = l.find('span', class_='latitude').text
    
    longitude = l.find('span', class_='longitude').text
    
    description = l.find('td', class_='note').text

        
    ny_dict = {
        "name": landmark_name,
        "location": location,
        "latitude": lat,
        "longitude": long, 
        "description": landmark_description
    }
        
    newyorklandmarks_list.append(ny_dict)

In [32]:
newyorklandmarks_list

[{'name': [<span class="mapframe-coord-name"><a href="/wiki/69th_Regiment_Armory" title="69th Regiment Armory">69th Regiment Armory</a></span>,
   <span class="mapframe-coord-name"><a href="/wiki/Admiral_David_Glasgow_Farragut_Gravesite" title="Admiral David Glasgow Farragut Gravesite">Admiral David Glasgow Farragut Gravesite</a></span>,
   <span class="mapframe-coord-name"><a href="/wiki/African_Burial_Ground_National_Monument" title="African Burial Ground National Monument">African Burial Ground</a></span>,
   <span class="mapframe-coord-name"><a class="mw-redirect" href="/wiki/United_States_Lightship_LV-87" title="United States Lightship LV-87">Ambrose (lightship)</a></span>,
   <span class="mapframe-coord-name"><a href="/wiki/American_Stock_Exchange_Building" title="American Stock Exchange Building">American Stock Exchange</a></span>,
   <span class="mapframe-coord-name"><a href="/wiki/Louis_Armstrong_House" title="Louis Armstrong House">Louis Armstrong House</a></span>,
   <span c

In [None]:
df.to_excel('Extracted_Landmark_data.xlsx')

In [None]:
browser.quit()