## Creat walkscore csv
1. get walkscores for each state in us_state_abbrev
2. create a dataframe
3. create csv for persistance
4. check which cities have missing data
5. clean city names - remove string within parenthesis, separate hyphenated cities
6. create a walk, bike, and transit score column based on walkscore.com's scale
7. apply functions and update csv

In [1]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


### Import libraries

In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 700)

### 1. Get walkscores for each city,state f

In [3]:
us_state_abbrev = ['AL','AK','AZ','AR','CA','CO','CT','DE','DC','FL','GA','HI','ID','IL','IN','IA','KS',
                   'KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC',
                   'ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']



### Create dataframe

In [4]:
# Empty DataFrame to add to in the loop below
df = pd.DataFrame()

for i in us_state_abbrev:
    df_i = pd.read_html('https://www.walkscore.com/' + i)[0]
    df_i['State'] = i
    df = pd.concat([df, df_i])

### 3. Create csv for persistance

In [5]:
df.to_csv('walk_score.csv')

In [6]:
df = pd.read_csv('walk_score.csv')
print(df.shape)
df.head()

(2500, 8)


Unnamed: 0.1,Unnamed: 0,City,Zip Code,Walk Score,Transit Score,Bike Score,Population,State
0,0,Birmingham (the largest city in Alabama),35211.0,35,25,31,212237,AL
1,1,Montgomery,36109.0,26,16,38,205764,AL
2,2,Mobile,36605.0,32,--,39,195111,AL
3,3,Huntsville,35810.0,24,13,40,180105,AL
4,4,Tuscaloosa,,33,--,37,90468,AL


### 4. Check which cities have missing data

In [7]:
df.isnull().sum()

Unnamed: 0          0
City                0
Zip Code         1489
Walk Score          0
Transit Score       0
Bike Score          0
Population          0
State               0
dtype: int64

In [8]:
df[['Walk Score', 'Transit Score', 'Bike Score']] = df[['Walk Score', 'Transit Score', 'Bike Score']].replace('--', 0)

In [9]:
df[['Walk Score', 'Transit Score', 'Bike Score']] = df[['Walk Score', 'Transit Score', 'Bike Score']].astype(int)

### 5. Clean city names - remove string within parentheis, separate hyphenated cities.

In [None]:
df['City'] = df['City'].str.replace(r"\(.*\)","")

In [None]:
df.loc[df['City'].str.contains('-'), 'City'] = df.loc[df['City'].str.contains('-'), 'City'].str.split('-')
df = df.explode('City') 
df.loc[:,'City'] = df.loc[:,'City'].str.strip()

### 6. Create a walk, bike, and transit score column based on walkscore.com's scale

[Walk Score]("Personal/labspt15-cityspire-g-ds/notebooks/datasets/data/walk_score/walk_breakdown.png")


In [None]:
def walk_score(row):
  if 0 <= row['Walk Score'] <= 49:
    val = 'Car Dependent'
  elif 50 <= row['Walk Score'] <= 69:
    val = 'Somewhat Walkable'
  elif 101 <= row['Walk Score'] <= 150:
    val = 'Very Walkable'
  else:
    val = "Walker's Paradise"

  return val

In [None]:
df['Walkability'] = df.apply(walk_score, axis=1)

In [None]:
def transit_score(row):
    if 1 <= row['Transit Score'] <= 24:
        val = 'Minimal'
    elif 25 <= row['Transit Score'] <= 49:
        val = 'Some Transit'
    elif 50 <= row['Transit Score'] <= 69:
        val = 'Good Transit'
    elif 101 <= row['Transit Score'] <= 150:
        val = 'Excellent Transit'
    else:
        val = "Rider's Paradise"

    return val

In [None]:
df['Public Transportation'] = df.apply(transit_score, axis=1)

In [None]:
def bike_score(row):
  if 0 <= row['Bike Score'] <= 49:
    val = 'Somewhat Bikeable'
  elif 50 <= row['Bike Score'] <= 69:
    val = 'Bikeable'
  elif 101 <= row['Bike Score'] <= 150:
    val = 'Very Bikeable'
  else:
    val = "Biker's Paradise"

  return val

In [None]:
df['Bikeability'] = df.apply(bike_score, axis=1)


### 7. Update csv

In [None]:
df.head()

In [None]:
df[df.duplicated(subset=['City', 'Zip', 'Walk Score', 'Transit Score', 'Bike Score' 'State'], keep=False)]

In [None]:
df.to_csv('walk_score.csv')