# This is the Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## Part 1: Create a dataframe using data from a Wikipedia page

First, import the libraries and packages used in this part.

In [1]:
# import libraries and packages
import pandas as pd
import numpy as np
import requests
import lxml.html as lh
from IPython.display import display, HTML

Next, download the Wikipedia page at the URL provided in the assignment.
The page content is stored and its HTML is parsed; all `<tr>` elements are collected in a list, since the rows of the table are contained in these elements.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# silently parsing an error page into an empty or wrong table.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw HTML into an lxml element tree
doc = lh.fromstring(page.content)

# Collect every <tr> element in the document; the table rows are among them
tr_elements = doc.xpath('//tr')
tr_elements[0:10]

[<Element tr at 0x7fb818355c28>,
 <Element tr at 0x7fb818355bd8>,
 <Element tr at 0x7fb818355c78>,
 <Element tr at 0x7fb818355cc8>,
 <Element tr at 0x7fb818355ef8>,
 <Element tr at 0x7fb818355f48>,
 <Element tr at 0x7fb818355f98>,
 <Element tr at 0x7fb818376048>,
 <Element tr at 0x7fb818376098>,
 <Element tr at 0x7fb8183760e8>]

Check that each of the first ten rows has 3 columns, as observed on the Wikipedia page

In [3]:
# Number of child cells in each of the first ten <tr> elements
row_length = list(map(len, tr_elements[:10]))
print(row_length)

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


Another check: print the header cells to confirm the column names

In [4]:
# Query the rows again so this cell starts from the full list of <tr> elements
tr_elements = doc.xpath('//tr')

# Each entry of `column` pairs a header name with the (initially empty) list
# that will receive that column's values
column = []
for i, header_cell in enumerate(tr_elements[0], start=1):
    name = header_cell.text_content()
    print('%d: "%s"' % (i, name))
    column.append((name, []))

1: "Postal Code
"
2: "Borough
"
3: "Neighbourhood
"


Since the first row is the header, the data is stored starting from the second row

In [5]:
# Start from empty value lists so that re-running this cell does not append
# the same rows a second time.
for _, values in column:
    values.clear()

# Row 0 is the header, so the data rows start at index 1
for row in tr_elements[1:]:
    # '//tr' matches rows from every table on the page; stop at the first row
    # whose cell count differs from the header row's, as it is not from our table
    if len(row) != row_length[0]:
        break

    # i is the index of the column the current cell belongs to
    for i, cell in enumerate(row.iterchildren()):
        data = cell.text_content()
        # The first column is always kept as text; in the other columns,
        # purely numeric cells are converted to integers
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the original text
                pass
        # Append the value to the list of the i'th column
        column[i][1].append(data)

Another check, now on the initial number of rows for each column

In [6]:
# Number of values collected for each column
[len(values) for _header, values in column]

[181, 181, 181]

The data is first stored in a dictionary, which is then turned into a DataFrame

In [34]:
# Turn the (header, values) pairs into a header -> values mapping,
# then build a DataFrame with one column per header
Dict = dict(column)
df = pd.DataFrame(Dict)

**These are the cleaning steps:**
* Get rid of the \n symbols
* Drop rows whose Borough column contains 'Not assigned'
* Replace elements in the Neighbourhood column containing 'Not assigned' 
  with the Borough value from the same row

In [35]:
# Clean the data
# Get rid of the \n symbols in both the cell values and the column names
df = df.replace('\n', '', regex=True)
df.columns = df.columns.str.replace('\n', '', regex=True)

# Drop rows with Borough == 'Not assigned', then any row that still has an
# empty cell. Each step produces a new frame instead of mutating in place,
# and the final copy makes sure the assignment below writes to data that
# this frame owns.
df = df[df['Borough'] != 'Not assigned']
df = df.replace('', np.nan).dropna(how='any').copy()

# Replace cell in Neighbourhood == 'Not assigned' with Borough values from the same row.
# A single .loc call is used because chained indexing
# (df.Neighbourhood[condition] = ...) may assign to a temporary copy and
# leave df unchanged.
condition = df['Neighbourhood'] == 'Not assigned'
df.loc[condition, 'Neighbourhood'] = df.loc[condition, 'Borough']
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Final assessment of the remaining number of rows in the dataframe

In [9]:
# Report how many rows are left in the dataframe
row_count = len(df.index)
print('The number of rows in the dataframe is: {}'.format(row_count))

The number of rows in the dataframe is: 103


## Part 2: Assign Latitudes and Longitudes
Download the Geospatial Data csv file and convert it into a dataframe

In [32]:
# Location of the geospatial CSV file
url = 'https://cocl.us/Geospatial_data'

# Read the CSV straight from the URL and preview the first five rows
df_geo = pd.read_csv(filepath_or_buffer=url)
df_geo.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Assign Latitude and Longitude values to df: for each Postal Code, find the matching row in df_geo and retrieve the coordinates from there

In [51]:
# Build a Postal Code -> coordinates lookup. Keeping only the first row per
# postal code gives the lookup a unique index (needed for Series.map) and
# corresponds to using the first matching row of df_geo.
geo_lookup = df_geo.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Fail with a clear message if a postal code has no entry in df_geo
missing = df.loc[~df['Postal Code'].isin(geo_lookup.index), 'Postal Code']
if not missing.empty:
    raise KeyError('No coordinates found for postal codes: {}'.format(missing.tolist()))

# Vectorised lookup. This avoids Series.iteritems() (removed in pandas 2.0)
# and chained df[col].loc[idx] = value assignments, which may write to a
# temporary copy. df keeps its index, and the new columns take their dtype
# from df_geo instead of starting out as empty strings.
df['Latitude'] = df['Postal Code'].map(geo_lookup['Latitude'])
df['Longitude'] = df['Postal Code'].map(geo_lookup['Longitude'])
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.7533,-79.3297
3,M4A,North York,Victoria Village,43.7259,-79.3156
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895


## Part 3: Explore and cluster the neighborhoods in Toronto