# Worksheet 07 - JSON

Your Name:  Hyeong-gi Hong              
Your Class:   INST 447  
Your Section: 0101 (MWF) or 0102 (TTh)                        
Your favorite color:  Blue

In [1]:
import pandas as pd
import numpy as np
import json

I have saved the JSON response for each geocode request to the Google API for each Capital Area Bikeshare bike station into a csv file called "cabi_with_geo.csv".

Load the CSV into a Pandas DataFrame called 'stations'. Then parse out the JSON response into two new columns - one for the latitude and one for the longitude.

Remove the column with the json response in it, so that you have a DataFrame that has the bikestation number, the bikestation address, the bikestation latitude, and the bikestation longitude.

The latitude and longitude are within geometry/location in the JSON.

In [2]:
# Load the station list; every row carries the raw JSON geocode response for that station.
stations = pd.read_csv(filepath_or_buffer='data/cabi_with_geo.csv')

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV -- confirm).
stations.drop(columns=['Unnamed: 0'], inplace=True)

In [4]:
# Count missing values per column before parsing the responses.
stations.isna().sum()

number      0
address     0
response    0
dtype: int64

In [5]:
# Store station numbers as 64-bit integers so they compare equal to the trip keys later on.
stations = stations.astype({'number': 'int64'})

In [6]:
# Scratch check (left disabled): parse only the first response to inspect its structure.
# jd = json.loads(stations['response'][0])

In [7]:
# Scratch check (left disabled): pull the coordinates out of a single parsed response.
# lat = jd['results'][0]['geometry']['location']['lat']
# lng = jd['results'][0]['geometry']['location']['lng']

In [8]:
# Extract latitude/longitude from each raw geocoding response, one dict per station row.
geo_list = []
for line in stations['response']:
    jd = json.loads(line)
    if jd['status'] == 'OK':
        # Coordinates of the first match live under results[0] -> geometry -> location.
        location = jd['results'][0]['geometry']['location']
        lat, lng = location['lat'], location['lng']
    else:
        # Failed lookups are recorded as NaN so those stations can be dropped later.
        # Use the lowercase `np.nan`: the `np.NaN` alias was removed in NumPy 2.0
        # and raises AttributeError there.
        lat, lng = np.nan, np.nan
    geo_list.append({'lat': lat, 'lng': lng})

In [9]:
# Turn the list of {'lat', 'lng'} dicts into a two-column frame, one row per station.
geos = pd.DataFrame(data=geo_list)

In [10]:
# Attach the coordinates side by side. concat aligns on the index, and both frames
# still carry the default 0..n-1 index, so each row lines up with its own station.
stations = pd.concat([stations, geos], axis='columns')

In [11]:
# Discard stations with any missing value (e.g. failed geocodes), then drop the raw
# JSON column now that lat/lng have been extracted from it.
stations = stations.dropna().drop(columns=['response'])

In [12]:
# Preview the first five cleaned station rows.
stations.head(n=5)

Unnamed: 0,number,address,lat,lng
0,31111,10th & U St NW,38.917007,-77.025986
1,31602,Park Rd & Holmead Pl NW,38.930927,-77.030867
2,31106,Calvert St & Woodley Pl NW,38.923468,-77.050365
3,31110,20th St & Florida Ave NW,38.915107,-77.044913
4,31107,Lamont & Mt Pleasant NW,38.931139,-77.038249


In [13]:
# Confirm the cleaned station table: number, address, lat and lng with no nulls remaining.
stations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 0 to 106
Data columns (total 4 columns):
number     106 non-null int64
address    106 non-null object
lat        106 non-null float64
lng        106 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 3.7+ KB


If we want to map the start and end of each bike trip, then we need to merge the stations' latitude and longitude with the trip data. Load the trips data from the "2010-q4-cabi-split-stations-columns.csv" into a DataFrame called trips. (This is the CABI data, but after I split the Start station and End station columns.)

Now add columns with the latitude and longitude for the start station.

In [14]:
# Load the trip records, which already have station address and number split into columns.
trips = pd.read_csv(filepath_or_buffer='data/2010-q4-cabi-split-stations-columns.csv')

In [15]:
# How many trips are missing a start station number?
trips['start_station_number'].isna().sum()

16

In [16]:
# How many trips are missing an end station number?
trips['end_station_number'].isna().sum()

16

In [17]:
# Drop every trip row that has a missing value in any column (not only the station numbers).
trips = trips.dropna(axis=0, how='any')

In [18]:
# With the nulls gone, store both station-number columns as 64-bit integers so they
# match the dtype of the station table's key.
trips = trips.astype({
    'start_station_number': 'int64',
    'end_station_number': 'int64',
})

In [19]:
# Verify the row count after dropping nulls and that both station-number columns are int64.
trips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117676 entries, 0 to 117691
Data columns (total 12 columns):
Unnamed: 0               117676 non-null int64
Duration                 117676 non-null object
Start date               117676 non-null object
End date                 117676 non-null object
Start station            117676 non-null object
End station              117676 non-null object
Bike#                    117676 non-null object
Member Type              117676 non-null object
start_station_address    117676 non-null object
start_station_number     117676 non-null int64
end_station_address      117676 non-null object
end_station_number       117676 non-null int64
dtypes: int64(3), object(9)
memory usage: 7.6+ MB


In [20]:
# Preview the first five trips before coordinates are attached.
trips.head(n=5)

Unnamed: 0.1,Unnamed: 0,Duration,Start date,End date,Start station,End station,Bike#,Member Type,start_station_address,start_station_number,end_station_address,end_station_number
0,0,14h 26min. 2sec.,12/31/2010 23:49,1/1/2011 14:15,10th & U St NW (31111),10th & U St NW (31111),W00771,Casual,10th & U St NW,31111,10th & U St NW,31111
1,1,0h 8min. 34sec.,12/31/2010 23:37,12/31/2010 23:46,10th & U St NW (31111),14th & R St NW (31202),W01119,Registered,10th & U St NW,31111,14th & R St NW,31202
2,2,0h 12min. 17sec.,12/31/2010 23:27,12/31/2010 23:39,Park Rd & Holmead Pl NW (31602),14th St & Spring Rd NW (31401),W00973,Registered,Park Rd & Holmead Pl NW,31602,14th St & Spring Rd NW,31401
3,3,0h 15min. 53sec.,12/31/2010 23:21,12/31/2010 23:37,Calvert St & Woodley Pl NW (31106),14th St & Spring Rd NW (31401),W00914,Registered,Calvert St & Woodley Pl NW,31106,14th St & Spring Rd NW,31401
4,4,0h 36min. 19sec.,12/31/2010 23:20,12/31/2010 23:56,20th St & Florida Ave NW (31110),Columbus Circle / Union Station (31623),W00859,Casual,20th St & Florida Ave NW,31110,Columbus Circle / Union Station,31623


In [21]:
# Copy of the station table with start_-prefixed column names, so it can be merged
# onto the trips by start station. index=str also converts the row labels to strings.
start_stations = stations.rename(
    index=str,
    columns={
        'number': 'start_station_number',
        'address': 'start_station_address',
        'lat': 'start_lat',
        'lng': 'start_lng',
    },
)

In [22]:
# Attach start_lat/start_lng. This is an inner join (the merge default), so trips whose
# start station number + address pair has no geocoded station row are dropped.
trips = trips.merge(
    start_stations,
    how='inner',
    on=['start_station_number', 'start_station_address'],
)

Now add columns for the latitude and longitude for the end stations.

In [23]:
# Copy of the station table with end_-prefixed column names, so it can be merged
# onto the trips by end station. index=str also converts the row labels to strings.
end_stations = stations.rename(
    index=str,
    columns={
        'number': 'end_station_number',
        'address': 'end_station_address',
        'lat': 'end_lat',
        'lng': 'end_lng',
    },
)

In [24]:
# Attach end_lat/end_lng. This is an inner join (the merge default), so trips whose
# end station number + address pair has no geocoded station row are dropped.
trips = trips.merge(
    end_stations,
    how='inner',
    on=['end_station_number', 'end_station_address'],
)

In [25]:
# Check that the four coordinate columns were added and how many trips survived both merges.
trips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117589 entries, 0 to 117588
Data columns (total 16 columns):
Unnamed: 0               117589 non-null int64
Duration                 117589 non-null object
Start date               117589 non-null object
End date                 117589 non-null object
Start station            117589 non-null object
End station              117589 non-null object
Bike#                    117589 non-null object
Member Type              117589 non-null object
start_station_address    117589 non-null object
start_station_number     117589 non-null int64
end_station_address      117589 non-null object
end_station_number       117589 non-null int64
start_lat                117589 non-null float64
start_lng                117589 non-null float64
end_lat                  117589 non-null float64
end_lng                  117589 non-null float64
dtypes: float64(4), int64(3), object(9)
memory usage: 11.2+ MB


In [26]:
# Order the trips by the 'Unnamed: 0' column (presumably the original row number from
# the CSV -- confirm), restoring the pre-merge ordering.
trips = trips.sort_values(by='Unnamed: 0', ascending=True)

In [28]:
# Renumber the rows 0..n-1 after sorting; the old index labels are discarded.
trips = trips.reset_index(drop=True)

In [29]:
# Preview the first five trips with start and end coordinates attached.
trips.head(n=5)

Unnamed: 0.1,Unnamed: 0,Duration,Start date,End date,Start station,End station,Bike#,Member Type,start_station_address,start_station_number,end_station_address,end_station_number,start_lat,start_lng,end_lat,end_lng
0,0,14h 26min. 2sec.,12/31/2010 23:49,1/1/2011 14:15,10th & U St NW (31111),10th & U St NW (31111),W00771,Casual,10th & U St NW,31111,10th & U St NW,31111,38.917007,-77.025986,38.917007,-77.025986
1,1,0h 8min. 34sec.,12/31/2010 23:37,12/31/2010 23:46,10th & U St NW (31111),14th & R St NW (31202),W01119,Registered,10th & U St NW,31111,14th & R St NW,31202,38.917007,-77.025986,38.912606,-77.031949
2,2,0h 12min. 17sec.,12/31/2010 23:27,12/31/2010 23:39,Park Rd & Holmead Pl NW (31602),14th St & Spring Rd NW (31401),W00973,Registered,Park Rd & Holmead Pl NW,31602,14th St & Spring Rd NW,31401,38.930927,-77.030867,38.937075,-77.032723
3,3,0h 15min. 53sec.,12/31/2010 23:21,12/31/2010 23:37,Calvert St & Woodley Pl NW (31106),14th St & Spring Rd NW (31401),W00914,Registered,Calvert St & Woodley Pl NW,31106,14th St & Spring Rd NW,31401,38.923468,-77.050365,38.937075,-77.032723
4,4,0h 36min. 19sec.,12/31/2010 23:20,12/31/2010 23:56,20th St & Florida Ave NW (31110),Columbus Circle / Union Station (31623),W00859,Casual,20th St & Florida Ave NW,31110,Columbus Circle / Union Station,31623,38.915107,-77.044913,38.896995,-77.006384


Find another API that responds with JSON and provide a link. What data from it do you think would be interesting? How would you use it?

https://www.kaggle.com/sohier/us-energy-statistics/data

This dataset contains energy consumption statistics. I am interested in all of the data in this dataset. I would like to use it to identify which energy source is used the most and which period has the highest energy usage. Building on those findings, additional research could explore how to reduce peak energy consumption if it contributes to environmental pollution.