In [17]:
import pandas as pd
import re
import datetime


# =========== Pre-process ==========
# Read the raw Citi Bike trip export (the file name suggests June 2016).
trip_csv_path = '/Users/linzeyang/Desktop/BIA-660/201606-citibike-tripdata.csv'
df = pd.read_csv(trip_csv_path)

# Report how many missing values each column contains.
missing_per_column = df.isna().sum()
print(missing_per_column)


# Snake_case the columns
def camel_to_snake(name):
    """Convert a CamelCase label to lowercase snake_case.

    An underscore is inserted before every capitalised word and between a
    lowercase letter or digit and a following capital letter. Spaces are then
    removed (not turned into underscores) and the result is lowercased, so
    'start station id' becomes 'startstationid'.
    """
    word_boundary = re.compile('(.)([A-Z][a-z]+)')
    case_boundary = re.compile('([a-z0-9])([A-Z])')
    with_word_breaks = word_boundary.sub(r'\1_\2', name)
    with_all_breaks = case_boundary.sub(r'\1_\2', with_word_breaks)
    without_spaces = with_all_breaks.replace(' ', '')
    return without_spaces.lower()
# Normalise every column label (e.g. 'start station id' -> 'startstationid').
df.columns = [camel_to_snake(col) for col in df.columns]

# Parse starttime into a real datetime column plus a calendar-day string.
# (stoptime is not parsed here.)
# pd.to_datetime with an explicit format parses the whole column in one
# vectorised call instead of running strptime once per row in Python.
df['start_datetime'] = pd.to_datetime(df['starttime'], format='%m/%d/%Y %H:%M:%S')
# Day key as an 'MM/DD/YYYY' string; used below as a group-by key.
df['start_day'] = df['start_datetime'].dt.strftime('%m/%d/%Y')

# =========== Group by date and station ===========
# Constant marker columns: counting them within a group gives the number of
# trips in that group.
df['bikein'] = 1
df['bikeout'] = 1

# Departures: number of trips leaving each start station on each start day.
df_bikeout = df.groupby(by=['startstationid', 'start_day'])['bikeout'].count()
df_bikeout = df_bikeout.to_frame()
# DataFrame.rename relabels the column. The previous rename_axis(dict, ...)
# call depended on a deprecated alias and does not rename column labels on
# current pandas, which left no 'station_id' column for the merge below.
df_out = df_bikeout.reset_index(drop=False).rename(columns={'startstationid': 'station_id'})
print(df_out.head())

# Arrivals: number of trips ending at each end station, per day.
# NOTE(review): arrivals are bucketed by the trip's start_day, not by the day
# the trip ended - confirm this is intended for trips that cross midnight.
df_bikein = df.groupby(by=['endstationid', 'start_day'])['bikein'].count()
df_bikein = df_bikein.to_frame()
df_in = df_bikein.reset_index(drop=False).rename(columns={'endstationid': 'station_id'})
print(df_in.head())


# =========== Bike loss per day for each station ===========
# Outer-join departures and arrivals on (station_id, start_day) so that a
# station/day present on only one side is still kept; the missing count is
# filled with 0 afterwards.
merge_keys = ['station_id', 'start_day']
df_dayloss = (
    df_out.merge(right=df_in, on=merge_keys, how='outer')
    .sort_values(merge_keys)
    .fillna(0)
)
# Positive bike_loss: more bikes left the station than arrived that day.
df_dayloss['bike_loss'] = df_dayloss['bikeout'] - df_dayloss['bikein']
print(df_dayloss)

tripduration                    0
starttime                       0
stoptime                        0
start station id                0
start station name              0
start station latitude          0
start station longitude         0
end station id                  0
end station name                0
end station latitude            0
end station longitude           0
bikeid                          0
usertype                        0
birth year                 195206
gender                          0
dtype: int64
   station_id   start_day  bikeout
0          72  06/01/2016      143
1          72  06/02/2016      135
2          72  06/03/2016       90
3          72  06/04/2016      123
4          72  06/05/2016       55
   station_id   start_day  bikein
0          72  06/01/2016     144
1          72  06/02/2016     131
2          72  06/03/2016      91
3          72  06/04/2016     126
4          72  06/05/2016      50
       station_id   start_day  bikeout  bikein  bike_loss
0    

In [69]:
# Derive the day-of-week number of every row's start_day
# (Monday=0, Sunday=6).
start_dates = pd.to_datetime(df_dayloss['start_day'])
wekday_index = start_dates.dt.dayofweek

In [76]:
# Append weekday column to df_dayloss.
# `wekday_index` is the Series computed from df_dayloss.start_day in the
# weekday cell; it shares df_dayloss's index, so the assignment aligns row by
# row. (The previous name `df_wekday_index` was never defined.)
df_dayloss['wek_index'] = wekday_index

In [77]:
# Show df_dayloss as the cell output to inspect the table.
df_dayloss

Unnamed: 0,station_id,start_day,bikeout,bikein,bike_loss,wek_index
0,72,06/01/2016,143.0,144.0,-1.0,2
1,72,06/02/2016,135.0,131.0,4.0,3
2,72,06/03/2016,90.0,91.0,-1.0,4
3,72,06/04/2016,123.0,126.0,-3.0,5
4,72,06/05/2016,55.0,50.0,5.0,6
5,72,06/06/2016,126.0,127.0,-1.0,0
6,72,06/07/2016,128.0,126.0,2.0,1
7,72,06/08/2016,83.0,94.0,-11.0,2
8,72,06/09/2016,122.0,119.0,3.0,3
9,72,06/10/2016,158.0,145.0,13.0,4


In [86]:
# count docks
from six.moves import urllib

import json
import collections
import itertools
# Fetch the live station-status feed and build {station_id: dock count}.
jsonurl = urllib.request.urlopen('https://gbfs.citibikenyc.com/gbfs/en/station_status.json')
try:
    # Decode explicitly so json.loads always receives text rather than bytes.
    text = json.loads(jsonurl.read().decode('utf-8'))
finally:
    # Always release the HTTP connection, even if reading or parsing fails.
    jsonurl.close()
data = text['data']
stations = data['stations']

# Station ids arrive as strings; convert them to int so they can be matched
# against the integer station_id column of the trip data.
total_stations_id = [int(station['station_id']) for station in stations]
total_num_bikes_available = [station['num_bikes_available'] for station in stations]
total_docks_available = [station['num_docks_available'] for station in stations]

# Dock count per station = bikes currently docked + docks currently free.
# NOTE(review): this is a snapshot taken at fetch time and ignores any
# disabled bikes/docks the feed may report separately - confirm whether a
# static capacity figure should be used instead.
total_docks = [
    bikes + docks
    for bikes, docks in zip(total_num_bikes_available, total_docks_available)
]

# Lookup table consumed by the cell that adds the 'docks' column.
new_dict = dict(zip(total_stations_id, total_docks))
print('zip station_id and total docks:'+str(new_dict))


zip station_id and total docks:{72: 38, 79: 33, 82: 0, 83: 60, 116: 38, 119: 19, 120: 19, 127: 27, 128: 25, 137: 0, 143: 22, 144: 19, 146: 39, 147: 0, 150: 29, 151: 29, 152: 27, 153: 53, 157: 22, 161: 33, 164: 47, 167: 44, 168: 47, 173: 49, 174: 30, 195: 44, 212: 26, 216: 23, 217: 38, 223: 33, 224: 0, 225: 37, 228: 52, 229: 21, 232: 22, 236: 39, 237: 37, 238: 28, 239: 29, 241: 20, 242: 20, 243: 29, 244: 31, 245: 23, 247: 20, 248: 23, 249: 27, 251: 27, 252: 32, 253: 54, 254: 30, 257: 38, 258: 23, 259: 38, 260: 35, 261: 26, 262: 23, 264: 27, 265: 32, 266: 24, 267: 55, 268: 24, 270: 22, 274: 29, 275: 19, 276: 25, 278: 19, 279: 36, 280: 31, 281: 58, 282: 26, 284: 38, 285: 0, 289: 19, 291: 18, 293: 51, 295: 24, 296: 35, 297: 27, 301: 34, 302: 22, 303: 30, 304: 33, 305: 30, 306: 36, 307: 28, 308: 26, 309: 40, 310: 36, 311: 29, 312: 30, 313: 23, 314: 39, 315: 28, 316: 43, 317: 27, 319: 28, 320: 37, 321: 26, 322: 3, 323: 38, 324: 49, 325: 32, 326: 23, 327: 36, 328: 21, 330: 39, 331: 24, 332: 2

In [95]:
# Look up each row's dock count by station_id; stations missing from
# new_dict get NaN.
docks_by_row = df_dayloss['station_id'].map(new_dict)
df_dayloss['docks'] = docks_by_row

In [96]:
# Show df_dayloss as the cell output to inspect the table.
df_dayloss

Unnamed: 0,station_id,start_day,bikeout,bikein,bike_loss,wek_index,docks
0,72,06/01/2016,143.0,144.0,-1.0,2,38.0
1,72,06/02/2016,135.0,131.0,4.0,3,38.0
2,72,06/03/2016,90.0,91.0,-1.0,4,38.0
3,72,06/04/2016,123.0,126.0,-3.0,5,38.0
4,72,06/05/2016,55.0,50.0,5.0,6,38.0
5,72,06/06/2016,126.0,127.0,-1.0,0,38.0
6,72,06/07/2016,128.0,126.0,2.0,1,38.0
7,72,06/08/2016,83.0,94.0,-11.0,2,38.0
8,72,06/09/2016,122.0,119.0,3.0,3,38.0
9,72,06/10/2016,158.0,145.0,13.0,4,38.0


In [124]:
# Work in progress (not finished yet): open the Weather Underground monthly
# history page for KNYC, 2016/6, and locate its inner content container.
from selenium import webdriver
driver = webdriver.Chrome('/Users/linzeyang/Desktop/BIA-660/chromedriver')
try:
    driver.get('https://www.wunderground.com/history/airport/KNYC/2016/6/1/MonthlyHistory.html?req_city=New+York&req_state=NY&req_statename=&reqdb.zip=10001&reqdb.magic=11&reqdb.wmo=99999')
    to_input = driver.find_elements_by_xpath("//div[@id='content-wrap']/div[@id='inner-wrap']")
finally:
    # quit() ends the whole WebDriver session and runs even when the page
    # load or the lookup raises, so the browser and chromedriver process are
    # not leaked. close() only closes the current window and was skipped
    # whenever an earlier line failed.
    # NOTE(review): the elements in to_input belong to the session that is
    # shut down here - extract whatever is needed from them before this point.
    driver.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=57.0.2987.133)
  (Driver info: chromedriver=2.27.440174 (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Mac OS X 10.11.6 x86_64)


In [122]:
# Number of elements matched by the XPath lookup stored in to_input.
len(to_input)

1