In [1]:
import pandas as pd

streets_df_data = pd.read_csv("../data/cleaned_data.csv")

## Separating labels and the data

In [2]:
# creating labels for future ML

labels = streets_df_data[["shaded_length", "shaded_percent"]]
streets_df_data = streets_df_data.drop(["shaded_length", "shaded_percent"], axis=1)
labels.head()

Unnamed: 0,shaded_length,shaded_percent
0,0.0,0.0
1,0.0,0.0
2,0.000192,0.19462
3,0.0,0.0
4,0.0,0.0


In [4]:
streets_df_data

Unnamed: 0,highway,access,cycleway,bicycle,foot,surface,oneway,lit,maxspeed,hgv,lanes,sidewalk,service,trolley_wire,length,geometry,center,x,y
0,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,,basic,no,0.000851,"LINESTRING (-122.4164837 37.7990541, -122.4173...",POINT (-122.41690585 37.79900125),-122.416906,37.799001
1,motorway,yes,shared_lane,no,,concrete,yes,,55 mph,designated,5.0,,basic,no,0.002131,"LINESTRING (-122.3988926 37.7168917, -122.3989...",POINT (-122.3991750132498 37.71791905462991),-122.399175,37.717919
2,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,,basic,no,0.000986,"LINESTRING (-122.4001293 37.7742804, -122.4001...",POINT (-122.3998092949861 37.77393927964267),-122.399809,37.773939
3,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,,basic,no,0.005475,"LINESTRING (-122.4001293 37.7742804, -122.3994...",POINT (-122.3979785890334 37.77597399396392),-122.397979,37.775974
4,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,,basic,no,0.001665,"LINESTRING (-122.3911048 37.7696737, -122.3912...",POINT (-122.391935931702 37.76962989985856),-122.391936,37.769630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21587,secondary,yes,shared_lane,,,asphalt,yes,,25 mph,,1.0,single,basic,no,0.001157,"LINESTRING (-122.464675 37.732523, -122.465478...",POINT (-122.4651441675435 37.73286141542141),-122.465144,37.732861
21588,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,both,basic,no,0.000083,"LINESTRING (-122.464675 37.732523, -122.464737...",POINT (-122.46470635 37.73249595),-122.464706,37.732496
21589,residential,yes,shared_lane,,,asphalt,yes,,25 mph,,2.0,,basic,no,0.000506,"LINESTRING (-122.4443464 37.7586049, -122.4444...",POINT (-122.4445567814889 37.75873903452056),-122.444557,37.758739
21590,tertiary,yes,track,designated,,asphalt,no,yes,25 mph,,4.0,both,basic,yes,0.000394,"LINESTRING (-122.4059618 37.7823047, -122.4056...",POINT (-122.4058077 37.7821821),-122.405808,37.782182


In [5]:
# dropping geometry related columns which has no use in ML

streets_df_data.drop(["center", "geometry"], inplace=True, axis=1)
streets_df_data.head()

Unnamed: 0,highway,access,cycleway,bicycle,foot,surface,oneway,lit,maxspeed,hgv,lanes,sidewalk,service,trolley_wire,length,x,y
0,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,,basic,no,0.000851,-122.416906,37.799001
1,motorway,yes,shared_lane,no,,concrete,yes,,55 mph,designated,5.0,,basic,no,0.002131,-122.399175,37.717919
2,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,,basic,no,0.000986,-122.399809,37.773939
3,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,,basic,no,0.005475,-122.397979,37.775974
4,residential,yes,shared_lane,,,asphalt,no,,25 mph,,2.0,,basic,no,0.001665,-122.391936,37.76963


## Getting dummies (not getting dum)

In [6]:
# get dummies on every string column

categorized_data = []
for col in streets_df_data:
    # consider only string values
    if streets_df_data[col].dtype != object:
        categorized_data.append(streets_df_data[col].to_frame())
        continue
    categorized_data.append(pd.get_dummies(streets_df_data[col]))

learning_df = pd.concat(categorized_data, axis=1)

## Normalizing data

In [7]:
learning_df = (learning_df-learning_df.min())/(learning_df.max()-learning_df.min())
learning_df.head()

Unnamed: 0,cycleway,living_street,motorway,motorway_link,path,primary,primary_link,residential,secondary,secondary_link,...,busway,drive-through,driveway,emergency_access,parking_aisle,no,yes,length,x,y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.021978,0.527629,0.61232
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.055314,0.624064,0.118376
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0255,0.620614,0.459645
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.142379,0.630571,0.47204
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.043169,0.663437,0.433393


## Saving retrieved labels and data

In [8]:
learning_df.to_csv("../data/ready_to_ML_data.csv", index=False)
labels.to_csv("../data/labels.csv", index=False)