Importing libraries for classification and data cleaning


In [None]:
import os
import seaborn as sns 
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import matplotlib.pyplot as plt
from sklearn import ensemble, model_selection, metrics 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split , cross_val_score , RandomizedSearchCV
import xgboost as xgb

%pylab inline

Populating the interactive namespace from numpy and matplotlib


Adding and printing the dataframe with the skateparks

In [None]:
# Load the skatepark CSV export into a DataFrame and preview the first rows.
skatepark_csv = '/content/skatepark_NY_data - DPR_Skateparks_001.csv'
df = pd.read_csv(skatepark_csv)
df.head()

Unnamed: 0,the_geom,SYSTEM,GISPROPNUM,NAME,Google Rating,BMX_PERMIT,INLINE_SKA,SCOOTER_PE,FEATURES_1,FEATURES_2,FEATURES_3,PARK_SURFA,FEATURE_SU,BOROUGH,SHAPE_area,SHAPE_len
0,MULTIPOLYGON (((-73.91996794995724 40.58884837...,B057-06-SKATEPARK-01,B057,Marine Park Skatepark,4.2,,,,Funbox,Quarter Pipe,Bank Ramp,Concrete,Wood,Brooklyn,6121.923885,337.738978
1,MULTIPOLYGON (((-73.87144701429952 40.86716866...,X002-ZN03-SKATEPARK-01,X002,Bronx Park Skatepark,4.3,,,,Quarter Pipe,Funbox,Bank Ramp,Concrete,Metal,Bronx,6196.325183,319.055415
2,MULTIPOLYGON (((-73.82910897363057 40.82693224...,X183-SKATEPARK-01,X183,Bruckner Playground Skatepark,3.9,,,,Funbox,Quarter Pipe,Bank Ramp,Concrete,Concrete,Bronx,4935.574172,283.91946
3,MULTIPOLYGON (((-73.92460034554938 40.83095313...,X034-SKATEPARK-01,X034,Mullaly Skatepark,4.3,Y,,,Funbox,Half-Pipe,Quarter Pipe,Concrete,Wood,Bronx,16409.6613,540.252203
4,MULTIPOLYGON (((-73.92708768919569 40.82560277...,X348-SKATEPARK-01,X348,River Avenue Skatepark,4.4,Y,Y,Y,Funbox,Handrail,Ledge,Concrete,Concrete,Bronx,16451.68588,529.498909




As we can see, most of the features are categorical. To make them easier to work with, we first copy the categorical columns into a stand-alone dataframe.



In [None]:
# Work on an independent copy of the string-typed (object dtype) columns so
# that later edits to df_cat never touch the raw frame.
object_columns = df.select_dtypes(include=['object'])
df_cat = object_columns.copy()
df_cat.columns

Index(['the_geom', 'SYSTEM', 'GISPROPNUM', 'NAME', 'BMX_PERMIT', 'INLINE_SKA',
       'SCOOTER_PE', 'FEATURES_1', 'FEATURES_2', 'FEATURES_3', 'PARK_SURFA',
       'FEATURE_SU', 'BOROUGH'],
      dtype='object')

It's also clear that many of the categorical values are missing. We'll fill the missing permit flags with the most common and also the most logical value: 'N', standing for "No data".

In [None]:
# Replace missing permit flags (NaN) with 'N' in the categorical frame.
permit_columns = ['BMX_PERMIT', 'INLINE_SKA', 'SCOOTER_PE']
df_cat[permit_columns] = df[permit_columns].replace(np.nan, 'N')
df_cat.head()

Unnamed: 0,the_geom,SYSTEM,GISPROPNUM,NAME,BMX_PERMIT,INLINE_SKA,SCOOTER_PE,FEATURES_1,FEATURES_2,FEATURES_3,PARK_SURFA,FEATURE_SU,BOROUGH
0,MULTIPOLYGON (((-73.91996794995724 40.58884837...,B057-06-SKATEPARK-01,B057,Marine Park Skatepark,N,N,N,Funbox,Quarter Pipe,Bank Ramp,Concrete,Wood,Brooklyn
1,MULTIPOLYGON (((-73.87144701429952 40.86716866...,X002-ZN03-SKATEPARK-01,X002,Bronx Park Skatepark,N,N,N,Quarter Pipe,Funbox,Bank Ramp,Concrete,Metal,Bronx
2,MULTIPOLYGON (((-73.82910897363057 40.82693224...,X183-SKATEPARK-01,X183,Bruckner Playground Skatepark,N,N,N,Funbox,Quarter Pipe,Bank Ramp,Concrete,Concrete,Bronx
3,MULTIPOLYGON (((-73.92460034554938 40.83095313...,X034-SKATEPARK-01,X034,Mullaly Skatepark,Y,N,N,Funbox,Half-Pipe,Quarter Pipe,Concrete,Wood,Bronx
4,MULTIPOLYGON (((-73.92708768919569 40.82560277...,X348-SKATEPARK-01,X348,River Avenue Skatepark,Y,Y,Y,Funbox,Handrail,Ledge,Concrete,Concrete,Bronx


We also don't need all of the categorical columns, so we'll store the names of the columns we need in a list and keep only those columns.

In [None]:
# Categorical features kept for modelling; every other column of df_cat is dropped.
cat_columns = [
    'BMX_PERMIT', 'INLINE_SKA', 'SCOOTER_PE',
    'FEATURES_1', 'FEATURES_2', 'FEATURES_3',
    'PARK_SURFA', 'FEATURE_SU', 'BOROUGH',
]
df_cat = df_cat.loc[:, cat_columns]
df_cat.head()

Unnamed: 0,BMX_PERMIT,INLINE_SKA,SCOOTER_PE,FEATURES_1,FEATURES_2,FEATURES_3,PARK_SURFA,FEATURE_SU,BOROUGH
0,N,N,N,Funbox,Quarter Pipe,Bank Ramp,Concrete,Wood,Brooklyn
1,N,N,N,Quarter Pipe,Funbox,Bank Ramp,Concrete,Metal,Bronx
2,N,N,N,Funbox,Quarter Pipe,Bank Ramp,Concrete,Concrete,Bronx
3,Y,N,N,Funbox,Half-Pipe,Quarter Pipe,Concrete,Wood,Bronx
4,Y,Y,Y,Funbox,Handrail,Ledge,Concrete,Concrete,Bronx


So now we have a good set of categorical columns with no missing data. However, categorical values won't work in the model, so we need to transform them into numerical values. For that I'm going to use the LabelEncoder class from scikit-learn's preprocessing module.

In [None]:
# Integer-encode every categorical column. DataFrame.apply calls fit_transform
# once per column, so the single encoder is re-fitted for each column.
label_encoder = preprocessing.LabelEncoder()
df_encoded = df_cat.apply(label_encoder.fit_transform)

OK, so it seems that we now have a good, clean set of categorical columns. That's great! Now let's deal with the rest of the columns. Before we proceed, I noticed something interesting that I wanted to act on: the "the_geom" column contains latitude and longitude values that could be useful for feature engineering. We need to extract them and save them in stand-alone lists so that we can add them to the resulting dataframe later.

In [None]:
import re

# WKT writes every vertex as "<x> <y>", i.e. "<longitude> <latitude>".
# The pattern captures the first vertex of the first ring of a MULTIPOLYGON.
_first_vertex_pattern = re.compile(
    r'MULTIPOLYGON\s*\(\(\(\s*(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)'
)


def _truncate_coordinate(coordinate_text, decimals=3):
  """Convert a decimal string to float, truncated (not rounded) to `decimals` places.

  Truncating on the decimal point instead of slicing a fixed number of
  characters keeps the result correct for any number of integer digits.
  """
  whole, _, fraction = coordinate_text.partition('.')
  return float(f"{whole}.{fraction[:decimals] or '0'}")


# One entry per row of df, in row order, so both lists can be assigned
# directly as new columns.
lattitudes = []
longitudes = []
for geom in df['the_geom']:
  # A missing geometry is read as NaN (a float), which has no text to search.
  found = _first_vertex_pattern.search(geom) if isinstance(geom, str) else None
  if found:
    # First number is the longitude (x), second is the latitude (y).
    lon_text, lat_text = found.groups()
    longitudes.append(_truncate_coordinate(lon_text))
    lattitudes.append(_truncate_coordinate(lat_text))
  else:
    # Keep the lists aligned with df even when a geometry cannot be parsed;
    # skipping the row would make the later column assignment fail.
    longitudes.append(float('nan'))
    lattitudes.append(float('nan'))


Adding new geo location columns to the dataframe

In [None]:
# Attach the extracted coordinates to the raw frame as two new columns.
for column_name, column_values in (('lattitudes', lattitudes),
                                   ('longitudes', longitudes)):
  df[column_name] = column_values

All right, now let's have a look at the rest of the columns. I shortlisted the attributes we really need and put them in a sub-dataframe called df_non_cat.

In [None]:
# Columns taken from df as-is: the park name, the rating, the shape
# measurements and the extracted coordinates.
non_cat_columns = [
    'NAME', 'Google Rating',
    'SHAPE_area', 'SHAPE_len',
    'lattitudes', 'longitudes',
]
df_non_cat = df[non_cat_columns]

Now let's concatenate the categorical and non-categorical dataframes to get the resulting dataset.

In [None]:
# Place the two frames side by side; join='inner' keeps only the row labels
# present in both of them.
frames_to_join = [df_non_cat, df_encoded]
result_df = pd.concat(frames_to_join, axis=1, join='inner')


Voilà! Now we have a clean, ready-to-process dataframe that we can use for an ML experiment.


In [None]:
# Preview the first five rows of the combined dataset.
result_df.head(n=5)

Unnamed: 0,NAME,Google Rating,SHAPE_area,SHAPE_len,lattitudes,longitudes,BMX_PERMIT,INLINE_SKA,SCOOTER_PE,FEATURES_1,FEATURES_2,FEATURES_3,PARK_SURFA,FEATURE_SU,BOROUGH
0,Marine Park Skatepark,4.2,6121.923885,337.738978,-73.919,40.588,0,0,0,1,5,0,1,2,1
1,Bronx Park Skatepark,4.3,6196.325183,319.055415,-73.871,40.867,0,0,0,5,1,0,1,1,0
2,Bruckner Playground Skatepark,3.9,4935.574172,283.91946,-73.829,40.826,0,0,0,1,5,0,1,0,0
3,Mullaly Skatepark,4.3,16409.6613,540.252203,-73.924,40.83,1,0,0,1,2,5,1,2,0
4,River Avenue Skatepark,4.4,16451.68588,529.498909,-73.927,40.825,1,1,1,1,3,4,1,0,0
