## Importing Libraries 

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

## Using Data from the Internet 

In [2]:
# getting data from internet
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page= requests.get(wikipedia_link).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(raw_wikipedia_page,'xml')
#print(soup.prettify())

## Extraction of Data

In [3]:
# extracting the raw table inside that webpage
table = soup.find('table')

df_list= []
# extracting a clean form of the table
for tr_cell in table.find_all('tr'):
    for td_cell in tr_cell.find_all('td'):
        df_list.append(td_cell.text.strip())
        
df_list

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village',
 'M5A',
 'Downtown Toronto',
 'Harbourfront',
 'M6A',
 'North York',
 'Lawrence Heights',
 'M6A',
 'North York',
 'Lawrence Manor',
 'M7A',
 'Downtown Toronto',
 "Queen's Park",
 'M8A',
 'Not assigned',
 'Not assigned',
 'M9A',
 "Queen's Park",
 'Not assigned',
 'M1B',
 'Scarborough',
 'Rouge',
 'M1B',
 'Scarborough',
 'Malvern',
 'M2B',
 'Not assigned',
 'Not assigned',
 'M3B',
 'North York',
 'Don Mills North',
 'M4B',
 'East York',
 'Woodbine Gardens',
 'M4B',
 'East York',
 'Parkview Hill',
 'M5B',
 'Downtown Toronto',
 'Ryerson',
 'M5B',
 'Downtown Toronto',
 'Garden District',
 'M6B',
 'North York',
 'Glencairn',
 'M7B',
 'Not assigned',
 'Not assigned',
 'M8B',
 'Not assigned',
 'Not assigned',
 'M9B',
 'Etobicoke',
 'Cloverdale',
 'M9B',
 'Etobicoke',
 'Islington',
 'M9B',
 'Etobicoke',
 'Martin Grove',
 'M9B',
 'Et

In [4]:
postcode = df_list[0::3]
Borough = df_list[1::3]
Neighborhood = df_list[2::3]

In [5]:
df_toronto = pd.DataFrame()
df_toronto['postcode'] = postcode
df_toronto['Borough'] = Borough
df_toronto['Neighborhood'] = Neighborhood

df_toronto.head()
df_toronto[df_toronto['postcode']=='M9A']

Unnamed: 0,postcode,Borough,Neighborhood
9,M9A,Queen's Park,Not assigned


## Updating Not Assigned Neighborhoods 

In [6]:
df_toronto2 = df_toronto[df_toronto['Borough']!="Not assigned"]

In [7]:
df_toronto2[df_toronto2['postcode']=='M9A']

Unnamed: 0,postcode,Borough,Neighborhood
9,M9A,Queen's Park,Not assigned


In [8]:
n2 = []
count = 0
for i,j in zip(df_toronto2['Borough'],df_toronto2['Neighborhood']):
    if j == 'Not assigned':
        n2.append(i)
    else:
        n2.append(j)
    count+=1
count

210

In [9]:
len(n2)

210

In [10]:
df_toronto2['Neighborhood'] = n2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [11]:
df_toronto2[df_toronto2['postcode']=='M9A']

Unnamed: 0,postcode,Borough,Neighborhood
9,M9A,Queen's Park,Queen's Park


In [12]:
df_toronto2

Unnamed: 0,postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


## Updating Neighborhoods To List More Than One That Have A Common Postal Code

In [13]:
df_update=df_toronto2.groupby('postcode')['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
df_update=df_update.reset_index(drop=False)
df_update.rename(columns={'Neighborhood':'Neighborhood_joined'},inplace=True)

In [14]:
df_toronto3 = pd.merge(df_toronto2,df_update,on='postcode')

In [15]:
df_toronto3.drop(['Neighborhood'],axis=1,inplace=True)

In [16]:
df_toronto3.drop_duplicates(inplace=True)

In [17]:
df_toronto3.rename(columns={'Neighborhood_joined':'Neighborhood'},inplace=True)

In [18]:
df_toronto3.head(10)

Unnamed: 0,postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,"Rouge, Malvern"
9,M3B,North York,Don Mills North
10,M4B,East York,"Woodbine Gardens, Parkview Hill"
12,M5B,Downtown Toronto,"Ryerson, Garden District"


In [19]:
df_toronto3.shape

(103, 3)

In [37]:
def get_geocode(postal_code):
    # initialize your variable to None
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude,longitude

In [38]:
df_geo=pd.read_csv('http://cocl.us/Geospatial_data')

In [39]:
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [42]:
df_geo.rename(columns={'Postal Code':'postcode'},inplace=True)
geo_combined = pd.merge(df_geo, df_toronto3, on= 'postcode')

In [44]:
geo_combined.head(10)

Unnamed: 0,postcode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
5,M1J,43.744734,-79.239476,Scarborough,Scarborough Village
6,M1K,43.727929,-79.262029,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,43.711112,-79.284577,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,43.716316,-79.239476,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,43.692657,-79.264848,Scarborough,"Birch Cliff, Cliffside West"


In [47]:
geo_location=geo_combined[['postcode','Borough','Neighborhood','Latitude','Longitude']]
geo_location.head()

Unnamed: 0,postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [50]:
df_toronto4=geo_location[geo_location['Borough'].str.contains("Toronto")]
df_toronto4.head()

Unnamed: 0,postcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
