### Web Scraping 

In [1]:
# importing some required libraries useful to scrape the website 
import requests
import pandas as pd  

In [2]:
url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") # 

In [3]:
from bs4 import BeautifulSoup
import lxml.html as hl

In [4]:
doc = hl.fromstring(url.content) # parsing the html content as single  element

In [5]:
tr_elements = doc.xpath("//tr")  # evaluating /tr  element and  storing  it. 

In [6]:
[len(C) for C in tr_elements[:5]]

[3, 3, 3, 3, 3]

In [7]:
tr_elements = doc.xpath('//tr')

#Create empty list
col=[]
i=0

#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d : %s'%(i,name))
    col.append((name,[]))

1 : Postcode
2 : Borough
3 : Neighbourhood



### Storing all Table content

In [8]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

### Storing dictionary as pandas dataframe 

In [9]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict) 

In [10]:
[len(C) for (title, C) in col]

[288, 288, 288]

In [11]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [12]:
df.dtypes

Postcode           object
Borough            object
Neighbourhood\n    object
dtype: object

In [13]:
df.shape

(288, 3)

### Filtering out unnecessary content  out of  dataframe  

In [14]:
f1 = df['Borough'] == 'Not assigned' 

In [15]:
data = df[~f1] 

In [16]:
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n


#### Reset the index

In [17]:
data.set_index('Postcode', inplace = True)

In [18]:
data.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods\n
M4A,North York,Victoria Village\n
M5A,Downtown Toronto,Harbourfront\n
M5A,Downtown Toronto,Regent Park\n
M6A,North York,Lawrence Heights\n


In [19]:
data.index.name = 'Postcode'
data.reset_index(inplace = True)

In [20]:
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n


In [21]:
data.rename(columns={'Neighbourhood\n':'Neighbourhood'}, inplace = True)  #  removing some string part of Neighbourhood\n

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [22]:
data_crop = data["Neighbourhood"]

In [23]:
data.drop("Neighbourhood", axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [24]:
data_df = data_crop.to_frame()

In [25]:
data_2 = data_df.Neighbourhood.str.replace("\n","")

In [26]:
type(data_2)

pandas.core.series.Series

In [27]:
cleaned_data = data.merge(data_2.to_frame(), left_index=True, right_index=True)

In [28]:
cleaned_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [29]:
type(cleaned_data)

pandas.core.frame.DataFrame

In [30]:
cleaned_data.shape

(211, 3)

####  Combine  the rows who have same Postcode value

In [31]:
merge_data = cleaned_data.groupby(['Postcode','Borough'], as_index = False, sort= False).agg(','.join)

#### Assign the value at certain position 

In [32]:
merge_data.at[4, 'Neighbourhood'] = 'Queen\'s Park'

In [33]:
merge_data.shape # Filtered datasets shape 

(103, 3)

In [34]:
merge_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [35]:
! pip install geocoder



In [36]:
import geocoder

In [37]:
geocoder.google('Toronto')

<[REQUEST_DENIED] Google - Geocode [empty]>

In [38]:
import os 

In [39]:
pwd

'C:\\Users\\SAIRAM\\Desktop\\Python\\Learn_Git\\Coursera_Capstone-'

In [40]:
cd C:\Users\SAIRAM\Desktop\Python\Learn_Git

C:\Users\SAIRAM\Desktop\Python\Learn_Git


In [41]:
pwd

'C:\\Users\\SAIRAM\\Desktop\\Python\\Learn_Git'

In [42]:
cd C:\\Users\\SAIRAM\\Desktop\\Python\\Learn_Git\\Coursera_Capstone-

C:\Users\SAIRAM\Desktop\Python\Learn_Git\Coursera_Capstone-


In [43]:
df1 = pd.read_csv("Geospatial_Coordinates.csv")

In [44]:
df1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [45]:
df1.shape

(103, 3)

In [46]:
df1.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [47]:
df1.rename(columns = {'Postal Code':'Postcode'}, inplace = True)

In [48]:
df1.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
 merge_lat_lang = pd.merge(merge_data, df1, on = 'Postcode')

In [50]:
merge_lat_lang.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


### Identifying exactly how many unique boroughs are  there 

In [51]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(merge_lat_lang['Borough'].unique()),
        merge_lat_lang.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


### Importing Necessary libraries 

In [52]:
import folium # map rendering library

In [53]:
! pip install geopy



In [54]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [55]:
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Use geopy library to get the latitude and longitude values of Toronto .

In [59]:
address = 'Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [60]:
# create map of New York using latitude and longitude values
map_tr = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(merge_lat_lang['Latitude'], merge_lat_lang['Longitude'], merge_lat_lang['Borough'], merge_lat_lang['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_tr)  
    
map_tr

In [61]:
merge_lat_lang.Borough.str.contains('Toronto')

0      False
1      False
2       True
3      False
4      False
5      False
6      False
7      False
8      False
9       True
10     False
11     False
12     False
13     False
14     False
15      True
16     False
17     False
18     False
19      True
20      True
21     False
22     False
23     False
24      True
25      True
26     False
27     False
28     False
29     False
       ...  
73      True
74      True
75      True
76     False
77     False
78     False
79      True
80      True
81      True
82     False
83      True
84      True
85     False
86      True
87      True
88     False
89     False
90     False
91      True
92      True
93     False
94     False
95     False
96      True
97      True
98     False
99      True
100     True
101    False
102    False
Name: Borough, Length: 103, dtype: bool

#### Cluster the borough having Toronto name in it.

In [62]:
toronto_data = merge_lat_lang[merge_lat_lang.Borough.str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [63]:
toronto_data.shape

(38, 5)

In [64]:
address = 'Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.653963, -79.387207.


In [65]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto