# Lets first install the required packages (for the part 1,2 and 3) and import the libraries

In [2]:
# - Installing packages

#!conda install -c conda-forge folium=0.5.0 --yes # download of folium

#!conda install -c conda-forge beautifulsoup4 --yes # download beautiful soup

#!conda install -c conda-forge geopy --yes # download geopy

#!pip install lxml
#!pip install et_xmlfile
#---------------------------------------------------------------------------------#

#- Importing the libraries
import pandas as pd
import numpy as np
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup as bs
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium #


#important resources 


 #1- https://medium.com/analytics-vidhya/web-scraping-wiki-tables-using-beautifulsoup-and-python-6b9ea26d8722 
 #2- https://stackoverflow.com/questions/41720896/beautifulsoup-parsing-html-get-part-of-href
 #3- http://beautiful-soup-4.readthedocs.io/en/latest/


#### To create the above dataframe:

1.The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

2.Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

3.More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

4.If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

5.Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.

6.In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.


## Getting and scrapping the content

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url)
soup = bs(source.text, "html.parser")
postal_table = soup.find(class_="wikitable sortable") 

### Into Panda dataframe

In [4]:
#question 1

table_rows = postal_table.find_all('tr')
row_values = []
for tr in table_rows:
    td = tr.find_all('td')
    row_text = [tr.text.strip() for tr in td if tr.text.strip()]
    if row_text:
        row_values.append(row_text)

toronto_data = pd.DataFrame(row_values, columns=["PostalCode", "Borough", "Neighborhood"]) #data frame consisting in PostalCode, Borough, and Neighborhood
toronto_data.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Working on the data frame 

In [5]:
#Question 2,3,4,5

borough = toronto_data[toronto_data.Borough != 'Not assigned'] # ignoring 'not assigned' 

borough['Neighborhood'].replace('Not assigned', "Borough", inplace=True) #replacing 'Not assigned' neighborhood value with the corresponding Borough value

combined_data = borough.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index(drop=False) # combining neighbours

combined_data.head(15)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
#question 6
combined_data.shape  #shape

(103, 3)

## Adding coordinates

In [7]:
url = 'http://cocl.us/Geospatial_data' #reading the csv file
coordinates = pd.read_csv(url)

In [8]:
combined_data2 = pd.DataFrame({'PostalCode':combined_data['PostalCode'],  #adding coordinates
                            'Borough':combined_data['Borough'], 
                            'Neighborhood':combined_data['Neighborhood'], 
                            'Latitude':coordinates['Latitude'], 
                            'Longitude':coordinates['Longitude']})

Toronto_df= combined_data2
Toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
