In [24]:
import pandas as pd
import numpy as np

## Download and Explore Dataset

In [25]:
# Scrape every HTML table found on the Wikipedia page of Canadian postal codes
# beginning with "M"; read_html hands them back as a list of DataFrames.
canada = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)

In [26]:
# Verify that the data loaded correctly. `canada` is a list, so displaying it
# prints the plain-text repr of every table it holds.
canada

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 ..          ...               ...   
 175         M5Z      Not assigned   
 176         M6Z      Not assigned   
 177         M7Z      Not assigned   
 178         M8Z         Etobicoke   
 179         M9Z      Not assigned   
 
                                          Neighbourhood  
 0                                         Not assigned  
 1                                         Not assigned  
 2                                            Parkwoods  
 3                                     Victoria Village  
 4                            Regent Park, Harbourfront  
 ..                                                 ...  
 175                                       Not assigned  
 176                                       Not assigned  
 177                

In [27]:
# The first element of the list is the table with the Postal Code, Borough and
# Neighbourhood columns.
canada[0]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [28]:
# Spot-check a single cell: the postal code stored in the row labelled 8.
canada[0].loc[8, 'Postal Code']

'M9A'

In [29]:
# Pull the three columns of interest out of the scraped table as separate
# pandas Series (note the source table uses the spelling "Neighbourhood").
PostalCode, Borough, Neighborhood = (
    canada[0][source_column]
    for source_column in ('Postal Code', 'Borough', 'Neighbourhood')
)

In [30]:
# Inspect the extracted neighbourhood Series.
Neighborhood

0                                           Not assigned
1                                           Not assigned
2                                              Parkwoods
3                                       Victoria Village
4                              Regent Park, Harbourfront
                             ...                        
175                                         Not assigned
176                                         Not assigned
177                                         Not assigned
178    Mimico NW, The Queensway West, South of Bloor,...
179                                         Not assigned
Name: Neighbourhood, Length: 180, dtype: object

### Create the DataFrame

In [31]:
# Start from an empty frame that only defines the target column headers.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
df_canada = pd.DataFrame(data=None, columns=column_names)
df_canada

Unnamed: 0,PostalCode,Borough,Neighborhood


In [32]:
# Fill the frame: each previously extracted Series becomes one named column.
df_canada = pd.DataFrame(
    dict(PostalCode=PostalCode, Borough=Borough, Neighborhood=Neighborhood)
)
df_canada.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [33]:
# Turn the "Not assigned" placeholder into a real missing value (NaN) so that
# pandas' missing-data tools can recognise it.
df_canada = df_canada.replace(to_replace="Not assigned", value=np.nan)
df_canada.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [34]:
# Discard every row whose "Borough" is NaN, then renumber the remaining rows
# from 0 so the index has no gaps left behind by the dropped rows.
df_canada = (
    df_canada
    .dropna(subset=["Borough"], axis=0)
    .reset_index(drop=True)
)

In [35]:
# Preview the frame after the rows with a missing borough were dropped.
df_canada.head(25)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Verify whether there are any missing values


In [36]:
# Boolean mask of the frame: True wherever a value is missing.
missing_data = df_canada.isnull()
# Report, column by column, how many entries are missing (True) or present (False).
for column in missing_data.columns:
    print(column, missing_data[column].value_counts(), "", sep="\n")

PostalCode
False    103
Name: PostalCode, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood
False    103
Name: Neighborhood, dtype: int64



### Number of rows (and columns) in the final DataFrame


In [39]:
# (number of rows, number of columns) of the cleaned frame.
df_canada.shape

(103, 3)