## Scrapping data with python 

In [1]:
from bs4 import BeautifulSoup
import requests

SCRAPPING DATA WITH BEAUTIFUL SOUP

In [2]:
url_to_scrap = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url_to_scrap)
plane_text= source.text
soup = BeautifulSoup(plane_text)

soup_table = soup.find('table',{'class':'wikitable sortable'})
soup_data = soup_table.find_all('tr')
data1=[]
for a in soup_data:
    cols = a. find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data1.append([ele for ele in cols if ele])
data1

data2 = data1[1:]

MAKING PANDAS DATAFRAME WITH DATA COLLECTED 

In [3]:
import numpy as np
import pandas as pd 
df = pd.DataFrame(columns=['Postcode', 'Borough','Neighbourhood'], data=data2)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### cells with  Not assigned borough 

In [4]:
df = df[df.Borough != 'Not assigned']
df.shape

(211, 3)

### combining two rows in order to remove duplicate values 

In [5]:
df= df.groupby('Postcode').agg({'Borough':'first','Neighbourhood':', '.join}).reset_index()
df.head(18)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


###  replacing  'Not assigned'  values from Neighborhood

In [6]:
# changing the value of Not assigned to borouth values
df.loc[df.Neighbourhood =='Not assigned'] = df.Borough

### shape of the dataframe

In [7]:
df.shape

(103, 3)

In [8]:
df1 = pd.read_csv("/media/bhaskar/New Volume/assignments/IBM/Geospatial_Coordinates.csv",header='infer')
#df1["Postal Code"].equals(df['Postcode']) '''for checking the values'''
df1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
df1["Postal Code"].equals(df['Postcode'])

False

In [10]:
df2 = pd.merge(df,df1,left_on='Postcode',right_on='Postal Code')
print(df2.shape)


(102, 6)


In [11]:
df2.drop(['Postal Code'],axis=1,inplace= True)
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


 ## Analysis of coordinates

In [12]:

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [18]:
# create a map of toronto using lat and long 
latitude = 43.700110
longitude = -79.416300
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map
for lat, lng, borough, neighborhood in zip(df2['Latitude'],df2['Longitude'],df2['Borough'], df2['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
map_toronto

In [20]:
CLIENT_ID = 'MUS2KI42YI13NBADIUFC0R3D0BSROJ' # your Foursquare ID
CLIENT_SECRET = 'WO551M42KGSP0451C2CTLS3OGNNF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [21]:
df2.loc[0,'Neighbourhood']

'Rouge, Malvern'

In [24]:
neighbourhood_longitude = df2.loc[0,'Longitude']
neighbourhood_latitude = df2.loc[0,'Latitude']
neighbourhood_name = df2.loc[0,'Neighbourhood'].split(",")[0]

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Rouge are 43.806686299999996, -79.19435340000001.


In [28]:
LIMIT = 100
radius = 900
my_url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
CLIENT_ID,CLIENT_SECRET,VERSION,neighbourhood_latitude,neighbourhood_longitude,radius,LIMIT)
#my_url

In [29]:
results = requests.get(my_url).json()
results

{'meta': {'code': 200, 'requestId': '5d7e15e8724750002c6c096f'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4d669cba83865481c948fa53-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/spa_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d1ed941735',
         'name': 'Spa',
         'pluralName': 'Spas',
         'primary': True,
         'shortName': 'Spa'}],
       'id': '4d669cba83865481c948fa53',
       'location': {'address': '8130 Sheppard Ave E',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Morningside Ave',
        'distance': 595,
        'formattedAddress': ['8130 Sheppard Ave E (Morningside Ave)',
         'Toronto ON M1B 3W3',
         'Canada'],
        'labeledLatLngs': [{'label': 'd

In [30]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [31]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Images Salon & Spa,Spa,43.802283,-79.198565
1,Wendy's,Fast Food Restaurant,43.802008,-79.19808
2,Staples Morningside,Paper / Office Supplies Store,43.800285,-79.196607
3,Wendy's,Fast Food Restaurant,43.807448,-79.199056
4,Harvey's,Fast Food Restaurant,43.800106,-79.198258


In [32]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

19 venues were returned by Foursquare.


In [38]:
nearby_venues.groupby('categories').count()

Unnamed: 0_level_0,name,lat,lng
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
African Restaurant,1,1,1
Bakery,1,1,1
Bus Station,1,1,1
Business Service,1,1,1
Chinese Restaurant,1,1,1
Coffee Shop,2,2,2
Fast Food Restaurant,3,3,3
Fruit & Vegetable Store,1,1,1
Greek Restaurant,1,1,1
Gym,1,1,1
