In [1]:
import pandas as panda
import numpy as np
from geopy.geocoders import Nominatim
from collections import defaultdict
from geopy.exc import GeocoderTimedOut
import time

In [2]:
culture_center_data_path = 'MSFC_44 Wards_Complete_Final.xlsx'
culture_data = panda.read_excel(culture_center_data_path)
culture_data.shape

(1397, 16)

In [3]:
culture_data.head()

Unnamed: 0,FACILITY NAME,Full Address,Street #,Street Name,Suite,City,Province,Postal Code,Ward,Performance,Exhibition / Visual Arts,Screen Based,Library,Multipurpose,Heritage,OWNERSHIP
0,Thistletown CC,"925 Albion Road, Toronto, ON, M9V 1A6",925,Albion Road,,Toronto,ON,M9V 1A6,1,1.0,,,,1.0,,City Operated
1,Albion Pool & Health Club,"1485 Albion Road, Toronto, ON, M9V 1B2",1485,Albion Road,,Toronto,ON,M9V 1B2,1,,,,,1.0,,City Owned
2,Albion Branch (TPL),"1515 Albion Road, Toronto, ON, M9V 1B2",1515,Albion Road,,Toronto,ON,M9V 1B2,1,1.0,,,1.0,1.0,,City Operated
3,Theatre Francais de Toronto - Centre for Creation,"21 College Street, Office 610, Toronto, ON, M5...",21,College Street,Office 610,Toronto,ON,M5G 2B3,1,1.0,,,,,1.0,Owned by others
4,Humber Arboretum Gardens,"203 Humber College Boulevard, Toronto, ON, M9W...",203,Humber College Boulevard,,Toronto,ON,M9W 5L7,1,,,,,1.0,,City Owned


#### Observations so far about the data given:

1. Total of 1397 culture centers data is provided with 16 different fields of data


2. There are no numeric values denoting either count /seating capacity / revenue etc for the centers


3. Obvious categorical value are Performance, Exhibition/Visual Arts, Screen based, Library, Multipurpose,Heritage and Ownership


4. Data cleaning is required for the categorical fields


5. Column street address is an amalgamation of columns street #, street name , province ,city and postal code



#### Next Steps :


1. Rename the columns to useful feature names


2. Check for null values across columns


3. Check if any community center detail has been duplicated


4. Correct NaN values for categorical fields


5. Change categorical field types to integer


6. Get geo latitude and longitude coordinates for each center


In [4]:
culture_data.columns = ['facility',\
                        'full_adress',\
                        'street',\
                        'street_name',\
                        'suite',\
                        'city',\
                        'province',\
                        'postal_code',\
                        'ward',\
                        'performance', \
                        'exhibition', \
                        'screen_based', \
                        'library', \
                        'multi_purpose', \
                        'heritage', \
                        'ownership'
                       ]

In [5]:
culture_data.isnull().any()

facility         False
full_adress      False
street           False
street_name      False
suite             True
city             False
province         False
postal_code      False
ward             False
performance       True
exhibition        True
screen_based      True
library           True
multi_purpose     True
heritage          True
ownership        False
dtype: bool

Suite: according to our data dictionary- Suite or room number of facility if part of a larger facility. 

Values are usually details like which particular wing/floor/unit. Seeing that this specific data is not really useful for the purposes of our analysis we are going to change the value to 1 for part of larger building or 0 as standalone building.

We are going to replace the missing values in this column with a value of 0

In [6]:
culture_data.suite.nunique()

107

In [7]:
culture_data.suite.value_counts().head()

A            9
B            4
2nd Floor    4
Suite 200    3
4th Floor    3
Name: suite, dtype: int64

In [8]:
##replace NaN with 0
culture_data.suite.isnull().sum()

1261

In [9]:
culture_data.suite.fillna(value=0,inplace=True)
culture_data.suite.isnull().sum()

0

In [10]:
culture_data.suite = culture_data.suite.apply(lambda x: 1 if x!=0 else 0)
culture_data.suite.value_counts()

0    1261
1     136
Name: suite, dtype: int64

Lets deal with our next simplest data field ownership. We are going to encode the same using the following rule:

1 for City Owned<br>
2 for City operated <br>
0 for owned by others

In [11]:
culture_data.ownership.value_counts()

Owned by others    1104
City Owned          218
City Operated        54
City operated        16
City owned            4
City owned            1
Name: ownership, dtype: int64

It is clear that there are some duplicate entries for data formatting issues. We will trim and replace all with lowercase

In [12]:
culture_data.ownership= culture_data.ownership.apply(lambda x: x.lower().strip())
culture_data.ownership.value_counts()

owned by others    1104
city owned          223
city operated        70
Name: ownership, dtype: int64

In [13]:
culture_data.ownership = culture_data.ownership.apply(lambda x: 1 if x=='city owned' else (2 if x=='city operated' else 0))
culture_data.ownership.value_counts()

0    1104
1     223
2      70
Name: ownership, dtype: int64

We will deal with our other categorical data which are : Performance based/screen based, etc.<br>
Most of  these values have 1 as True value and NaN as false value<br>
We will replace NaN with 0 and change type of cells to integer

In [14]:
culture_data.performance.fillna(value=0, inplace=True)
culture_data.exhibition.fillna(value=0,inplace =True)
culture_data.screen_based.fillna(value=0,inplace =True)
culture_data.library.fillna(value=0,inplace =True)
culture_data.heritage.fillna(value=0,inplace =True)
culture_data.multi_purpose.fillna(value=0,inplace =True)
culture_data[['performance','exhibition','screen_based','library','heritage','multi_purpose']].isnull().sum()

performance      0
exhibition       0
screen_based     0
library          0
heritage         0
multi_purpose    0
dtype: int64

In [15]:
culture_data.performance=culture_data.performance.astype(np.int64)
culture_data.exhibition=culture_data.exhibition.astype(np.int64)
culture_data.screen_based=culture_data.screen_based.astype(np.int64)
culture_data.library=culture_data.library.astype(np.int64)
culture_data.multi_purpose=culture_data.multi_purpose.astype(np.int64)
culture_data.heritage=culture_data.heritage.astype(np.int64)


culture_data.dtypes

facility         object
full_adress      object
street           object
street_name      object
suite             int64
city             object
province         object
postal_code      object
ward              int64
performance       int64
exhibition        int64
screen_based      int64
library           int64
multi_purpose     int64
heritage          int64
ownership         int64
dtype: object

In [16]:
culture_data[['performance','exhibition','screen_based','library','heritage','multi_purpose']].nunique()

performance      2
exhibition       2
screen_based     2
library          2
heritage         2
multi_purpose    2
dtype: int64

Check for duplicate community center names

In [18]:
culture_data.shape, culture_data.facility.nunique()

((1397, 16), 1394)

In [19]:
t = culture_data.groupby(['facility'])['street'].count().reset_index()

In [21]:
t[t.street>1]

Unnamed: 0,facility,street
523,Guildwood Branch (TPL),2
666,Lawrence Park Community Church,2
668,Learning Enrichment Foundation,2


In [22]:
bad_facilities = t[t.street>1].facility.values.tolist()

We found the offending duplicate facility names, lets look at them in further detail

In [23]:
culture_data[culture_data.facility.isin(bad_facilities)].sort_values(by='facility')

Unnamed: 0,facility,full_adress,street,street_name,suite,city,province,postal_code,ward,performance,exhibition,screen_based,library,multi_purpose,heritage,ownership
782,Guildwood Branch (TPL),"123 Guildwood Parkway, Toronto, ON, M1E 4V2",123,Guildwood Parkway,0,Toronto,ON,M1E 4V2,23,0,0,0,1,0,0,2
1374,Guildwood Branch (TPL),"123 Guildwood Parkway, Toronto, ON, M1E 5G5",123,Guildwood Parkway,0,Toronto,ON,M1E 5G5,43,0,0,0,1,0,0,2
810,Lawrence Park Community Church,"2180 Bayview Avenue, Toronto, ON, M4N 3K7",2180,Bayview Avenue,0,Toronto,ON,M4N 3K7,25,1,0,0,0,0,0,0
829,Lawrence Park Community Church,"2180 Bayview Avenue, Toronto, ON, M4N 3K7",2180,Bayview Avenue,0,Toronto,ON,M4N 3K7,25,1,0,0,0,1,0,0
141,Learning Enrichment Foundation,"1267 Weston Road, Toronto, ON, M6M 4P9",1267,Weston Road,0,Toronto,ON,M6M 4P9,11,0,0,0,0,1,0,0
157,Learning Enrichment Foundation,"116 Industry St., Toronto, ON, M6M 4L8",116,Industry St.,0,Toronto,ON,M6M 4L8,12,0,0,0,0,1,0,0


We found 3 venues which are repeated and this is how we are going to deal with the same:
    
    1. Guildwood branch public library is present at M1E 4V2. Hence the second location is incorrect, we will drop the row at index 1374
    2. Learning Enrichment Foundation and Lawrence Park Community Church are present in different locations. We will retain them as is
    

In [24]:
culture_data.drop(1374, inplace=True)

In [25]:
culture_data.shape

(1396, 16)

At this point , we have a cleaned dataset. We will proceed to add further details in order to analyze/visualize the data further

In [26]:
culture_data.street = culture_data.street.astype(np.object)

In [27]:
culture_data[['street', 'street_name', 'city']].dtypes

street         object
street_name    object
city           object
dtype: object

In [28]:
culture_data['address_for_geo_search']=culture_data.street.map(str) + ' '+ culture_data.street_name+' '+culture_data.city

In [29]:
culture_data.address_for_geo_search.head()

0                 925 Albion Road Toronto
1                1485 Albion Road Toronto
2                1515 Albion Road Toronto
3               21 College Street Toronto
4    203 Humber College Boulevard Toronto
Name: address_for_geo_search, dtype: object

In [72]:
culture_data.address_for_geo_search.isnull().sum()

0

In [31]:
geolocator = Nominatim(user_agent="toronto_explorer")

In [91]:
geo_coordinates = defaultdict(list)
for item in culture_data.address_for_geo_search.values.tolist():
    geolocator = Nominatim(user_agent="toronto_explorer")
    if item not in geo_coordinates.keys():
        
        try:
            location =  geolocator.geocode(item)
        except GeocoderTimedOut as e:
            
            location = geolocator.geocode(item)
        
#         location = geolocator.geocode(item)
        if location:
            latitude = location.latitude
            longitude = location.longitude
            print(item,latitude,longitude)
            geo_coordinates[item].append([latitude,longitude])
        
        time.sleep(2)

        
    
    

925 Albion Road Toronto 43.7354505 -79.5625273607685
1485 Albion Road Toronto 43.7396133 -79.580608
1515 Albion Road Toronto 43.7398714 -79.5848098635851
21 College Street Toronto 43.7343246 -79.5629976
203 Humber College Boulevard Toronto 43.7298449 -79.6037919
2534 Kipling Avenue Toronto 43.7446265 -79.5835749290698
2580 Kipling Avenue Toronto 43.74766815 -79.5861061331086
175 Mount Olive Drive Toronto 43.7497345 -79.5963590455025
21 Panorama Court Toronto 43.7463115 -79.5815470987295
10 Rampart Road Toronto 43.7347935 -79.5886817868421
34 Riverdale Drive Toronto 43.7312497454545 -79.5597202909091
2 Rowntree Road Toronto 43.75251855 -79.5847329273576
33 Carlson Court Toronto 43.69023905 -79.5836481602385
650 Dixon Road Toronto 43.68892305 -79.5778707247553
801 Dixon Road Toronto 43.6869439 -79.5879143
850 Humberwood Boulevard Toronto 43.729175 -79.6192749
2170 Kipling Avenue Toronto 43.7207556 -79.5724442778697
2243 Kipling Avenue Toronto 43.72612675 -79.5732276249261
2239 Lawrence A

2700 Eglinton Avenue West Toronto 43.6899158 -79.4782808028594
130 Industry Street Toronto 43.6947624 -79.4955347
91 Kersdale Avenue Toronto 43.6830912 -79.46834
1565 Lawrence Avenue West Toronto 43.7064334 -79.4857627443117
120 Trowell Avenue Toronto 43.6863048 -79.4723454278506
1651 Keele St. Toronto 43.6833984 -79.4719414
116 Industry St. Toronto 43.6938385 -79.4937089
2562 Eglinton Avenue West Toronto 43.6909673 -79.4721021333333
1700 Keele St. Toronto 43.6845254 -79.4737629
1652 Keele St. Toronto 43.6839931 -79.472602
116 Cornelius Parkway Toronto 43.7182651 -79.4750102
490 Queens Drive Toronto 43.71022235 -79.4961579651792
1507 Lawrence Ave. West Toronto 43.7073942 -79.4824294
2000 Keele St. Toronto 43.69737605 -79.47592735
2801 Eglinton Avenue West Toronto 43.68915 -79.4788949
145 Annette Street Toronto 43.6632845 -79.4663718470716
333 Annette Street Toronto 43.6612108 -79.4740297
16 Baby Point Road Toronto 43.6582056 -79.4896329
1873 Bloor Street West Toronto 43.6609452 -79.430

GeocoderTimedOut: Service timed out

In [93]:
len(geo_coordinates), culture_data.shape

(169, (1396, 17))

In [45]:
# geo_coordinates = defaultdict(list)
for item in culture_data.address_for_geo_search.values.tolist()[240:]:
    geolocator = Nominatim(user_agent="toronto_explorer")
    if item not in geo_coordinates.keys():
        
        try:
            location =  geolocator.geocode(item)
        except GeocoderTimedOut as e:
            
            location = geolocator.geocode(item)
        
#         location = geolocator.geocode(item)
        if location:
            latitude = location.latitude
            longitude = location.longitude
            print(item,latitude,longitude)
            geo_coordinates[item].append([latitude,longitude])
        
        time.sleep(2)

        
    
    

1567 Kingston Road Toronto 43.68985716 -79.26737434
1859 Kingston Road Toronto 43.6956001 -79.259173
3017 Kingston Road Toronto 43.7257716642857 -79.2311780357143
3474 Kingston Road Toronto 43.7384413189722 -79.2178242184327
3555 Kingston Road Toronto 43.7387915 -79.216648
3600 Kingston Road Toronto 43.740158 -79.2167184690647
21 Markanna Drive Toronto 43.7402566 -79.2190141
1 McCowan Road Toronto 43.7806502 -79.2549524
3800 St. Clair Avenue East Toronto 43.72349815 -79.2374134008801
3701 Danforth Avenue Toronto 43.6989997 -79.2570572
1500 Birchmount Road Toronto 43.7578067 -79.2892126
20 Canadian Road Toronto 43.7603726 -79.2962761
1880 Eglinton Avenue East Toronto 43.7262665 -79.3000047
2380 Eglinton Avenue East Toronto 43.7322688 -79.2704268
85 Ellesmere Road Toronto 43.7570385 -79.3122635719098
1299 Ellesmere Road Toronto 43.7692038 -79.2642882
10 Howarth Avenue Toronto 43.7386917 -79.3059839996315
929 Kennedy Road Toronto 43.7434136 -79.2723629
22 Landseer Road Toronto 43.7350504 

In [46]:
len(geo_coordinates), culture_data.shape

(1087, (1396, 17))

In [47]:
tt=panda.DataFrame(geo_coordinates).T.reset_index()
tt.columns=['location','ordinate']
tt

Unnamed: 0,location,ordinate
0,1 Blue Jays Way Toronto,"[43.6416641, -79.3891988236638]"
1,255 Bremner Boulevard Toronto,"[43.6409668, -79.3851702]"
2,292 Brunswick Avenue Toronto,"[43.6653176, -79.4073567]"
3,296 Brunswick Avenue Toronto,"[43.6655707, -79.4074586]"
4,20 Camden Street Toronto,"[43.6472861, -79.396972]"
5,25 Cecil Street Toronto,"[43.656713, -79.395893]"
6,58 Cecil Street Toronto,"[43.6570842, -79.3945442]"
7,19 Charlotte Street Toronto,"[33.5731845510204, -117.735511469388]"
8,83 Christie Street Toronto,"[43.665743475, -79.4190691888889]"
9,155 College Street Toronto,"[43.6591154, -79.3931845]"


In [48]:
tt.to_csv('ordinate_1.csv',index=False)

In [82]:
culture_data.address_for_geo_search.nunique()

1327