## Introduction

In this Jupyter notebook we scrape the contact details (name, email, phone and address) from the footer of the Singapore Institute of Management (SIM) website and save the result as a CSV file in the local folder.


### Load the libraries

In [16]:
# Run once if the packages are missing: %pip install beautifulsoup4 requests pandas

In [17]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Get the HTML page

Download the HTML of the SIM home page.

In [18]:
# Fetch the SIM home page. Without an explicit timeout, requests waits
# indefinitely on an unresponsive server and the notebook would hang.
sim_html = requests.get("https://www.sim.edu.sg/", timeout=30)

# Fail fast on HTTP errors (4xx/5xx) so an error page is never parsed as data.
sim_html.raise_for_status()

In [19]:
# Build the parse tree from the downloaded page text using the 'html.parser' backend.
soup = BeautifulSoup(markup=sim_html.text, features='html.parser')

##### Show some information about the HTML page

In [20]:
# Show the first <title> element of the page.
soup.find('title')

<title>Singapore Institute of Management | Degree &amp; Diploma Programmes | Professional Short Courses</title>

In [21]:
# Break the page title into its '|'-separated sections.
# Surrounding whitespace of each section is kept as-is.
page_title = soup.title.get_text()
title_lst = page_title.split('|')
title_lst

['Singapore Institute of Management ',
 ' Degree & Diploma Programmes ',
 ' Professional Short Courses']

#### Locate the Footer

In [22]:
# Locate the page footer; the following cells search it for titles and spans.
footer = soup.find('footer')

# find() returns None when nothing matches. Stop here with a clear message
# instead of an opaque AttributeError on footer.find_all(...) in a later cell.
if footer is None:
    raise RuntimeError("No <footer> element found - the page layout may have changed.")

#### Extract the title

In [23]:
# Every <p> carrying the CSS class "title" inside the footer.
titles = footer.find_all(name='p', attrs={'class': 'title'})
titles

[<p class="title"><strong>SIM Global Education</strong></p>,
 <p class="title">SIM Academy</p>]

##### Extract all the 'Span' tags

In [24]:
# Collect every <span> in the footer. The search is recursive, so a span nested
# inside another span is listed twice: within its parent and as its own entry.
spans = footer.find_all(name='span', recursive=True)
spans

[<span>Email: </span>,
 <span> <a href="mailto:study@sim.edu.sg">study@sim.edu.sg</a> </span>,
 <span>Phone: </span>,
 <span> <a href="tel:6248 9746">6248 9746</a><br/>
 <span class="font-small">(Mon to Fri, 9.00am - 5.00pm)<br/>
 (Closed on Sat, Sun, public holidays and SIM shutdown days - Christmas Eve, New Year’s Day Eve and Chinese New Year Eve, 3rd day of Chinese New Year)</span></span>,
 <span class="font-small">(Mon to Fri, 9.00am - 5.00pm)<br/>
 (Closed on Sat, Sun, public holidays and SIM shutdown days - Christmas Eve, New Year’s Day Eve and Chinese New Year Eve, 3rd day of Chinese New Year)</span>,
 <span>Address: </span>,
 <span>461 Clementi Road, Singapore 599491<br/>
 <span class="font-small">(Daily except public holidays, 6am to 11.59pm)</span><br/>
 <a href="https://g.page/simglobaleducation?share" target="_blank">Getting here</a></span>,
 <span class="font-small">(Daily except public holidays, 6am to 11.59pm)</span>,
 <span>Email: </span>,
 <span> <a href="mailto:simaca

##### Extract the contact information from the spans

In [25]:
# Parse the footer into parallel lists: one name per title, plus the value that
# follows each "Email", "Phone" and "Address" label span.
# NOTE: a contact block without one of these labels contributes nothing to the
# corresponding list, so the lists can end up with different lengths.
name_lst = [title.text for title in titles]
email_lst = []
phone_lst = []
address_lst = []

# Pair every span with its successor: the value of a label lives in the NEXT
# span. zip() stops before the last span, so a label in final position is
# skipped instead of raising IndexError the way spans[index+1] would.
for label_span, value_span in zip(spans, spans[1:]):
    label = label_span.text
    value = value_span.text

    if "Email" in label:
        email_lst.append(value.strip())

    if "Phone" in label:
        # Keep only the text before the first "(" (the parenthesised note).
        # partition() yields the whole string when there is no "(", whereas
        # slicing with find()'s -1 result silently dropped the last character.
        phone_lst.append(value.partition("(")[0].strip())

    if "Address" in label:
        # NOTE(review): the recorded output shows a trailing "\nMon- Fri" on the
        # second address - presumably opening-hours text that precedes the "(";
        # confirm against the page markup before trimming it further.
        address_lst.append(value.partition("(")[0].strip())

In [26]:
# Patch the missing data: pad phone_lst with "" up to the number of names.
# Unlike a bare append(""), padding is idempotent - re-running this cell does
# not keep growing the list (a negative count yields an empty list).
# NOTE(review): padding at the end assumes the missing phone belongs to the
# last contact block(s), as the original single append did - confirm.
phone_lst.extend([""] * (len(name_lst) - len(phone_lst)))

print(name_lst)
print(email_lst)
print(phone_lst)
print(address_lst)

['SIM Global Education', 'SIM Academy']
['study@sim.edu.sg', 'simacademy@sim.edu.sg']
['6248 9746', '']
['461 Clementi Road, Singapore 599491', '41 Namly Avenue, Singapore 267616\nMon- Fri']


#### Create the contact dictionary

In [27]:
# Map each output column name to its list of values.
contact_dict = dict(
    Name=name_lst,
    Email=email_lst,
    Phone=phone_lst,
    Address=address_lst,
)
contact_dict

{'Name': ['SIM Global Education', 'SIM Academy'],
 'Email': ['study@sim.edu.sg', 'simacademy@sim.edu.sg'],
 'Phone': ['6248 9746', ''],
 'Address': ['461 Clementi Road, Singapore 599491',
  '41 Namly Avenue, Singapore 267616\nMon- Fri']}

##### Create a data frame from the dictionary

In [28]:
# Build a table with one row per contact. Every list in contact_dict must have
# the same length, otherwise the DataFrame constructor raises ValueError.
# (pandas is imported as pd in the import cell at the top of the notebook.)
contact_df = pd.DataFrame(contact_dict)
contact_df

Unnamed: 0,Name,Email,Phone,Address
0,SIM Global Education,study@sim.edu.sg,6248 9746,"461 Clementi Road, Singapore 599491"
1,SIM Academy,simacademy@sim.edu.sg,,"41 Namly Avenue, Singapore 267616\nMon- Fri"


#### Export the data frame to a CSV file

In [29]:
# Write the table to contact.csv in the working directory, omitting the row index.
contact_df.to_csv(path_or_buf="contact.csv", index=False)