In [2]:
from sqlalchemy import create_engine
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
engine = create_engine("sqlite:///data/mta_turnstile.db")
all_tables = engine.table_names()
all_tables
some_tables = all_tables[0:3]
some_tables

['turnstile_2019_0105', 'turnstile_2019_0112', 'turnstile_2019_0119']

In [4]:
dfs = []
for table in some_tables:
    dfs.append(pd.read_sql(table,engine))
    mtadf = pd.concat(dfs)
mtadf.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,03:00:00,REGULAR,6889287,2335920
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,07:00:00,REGULAR,6889299,2335936
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,11:00:00,REGULAR,6889364,2336038
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,15:00:00,REGULAR,6889605,2336101
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,19:00:00,REGULAR,6889966,2336173


'The official count of stations is 472; however, this tabulation classifies some transfer stations as two or more stations, which are called "station complexes" within the nomenclature of the New York City Subway. 

If station complexes are counted as one station each, the number of stations is 424. 
Thirty-two such station complexes exist. 
The reason for the higher count generally lies in the history of the New York City Subway: IRT, BMT and IND stations are usually counted separately, particularly if their lines are not parallel and are adjacent to or on another level to each other.'

The New York City Subway has several types of transfer stations, among them station complexes (i.e. sets of two or more stations connected with a passageway inside fare control) and stations serving two or more lines (considered to be one station each)

**In my turnstile database, I have 377 unique stations**

Another Issue:
    
Many stations share the same name. These stations are disambiguated by the line each of them is on. 

In [5]:
# read in locations DF
locationsdf = pd.read_csv('./data/stations_info.csv')

# 379 unique stop names
locationsdf['Stop Name'].nunique()

379

In [6]:
locationsdf.columns

Index(['Station ID', 'Complex ID', 'GTFS Stop ID', 'Division', 'Line',
       'Stop Name', 'Borough', 'Daytime Routes', 'Structure', 'GTFS Latitude',
       'GTFS Longitude', 'North Direction Label', 'South Direction Label',
       'ADA', 'ADA Notes'],
      dtype='object')

In [7]:
locationsdf.sort_values(['Stop Name']).head(10)

# change column names
columns = {'Station ID': 'STATION_ID',
          'Complex ID': 'COMPLEX_ID',
           'Stop Name': 'STOP_NAME',
          'Daytime Routes':'LINE',
          'Borough':'BOROUGH'}
locationsdf.rename(mapper=columns,axis=1,inplace=True)

In [8]:
# drop columns we don't plan on using
locationsdf.drop(['Division','Line','Structure','North Direction Label','South Direction Label','ADA','ADA Notes'],axis=1,inplace=True)

In [9]:
# drop Staten Island
locationsdf = locationsdf[(locationsdf['BOROUGH'] != 'SI')]

In [10]:
# format STOP_NAME as all caps
locationsdf['STOP_NAME'] = locationsdf['STOP_NAME'].str.upper()
locationsdf.sort_values('STOP_NAME',inplace=True)
locationsdf

Unnamed: 0,STATION_ID,COMPLEX_ID,GTFS Stop ID,STOP_NAME,BOROUGH,LINE,GTFS Latitude,GTFS Longitude
118,119,119,L06,1 AV,M,L,40.730953,-73.981628
309,309,309,119,103 ST,M,1,40.799446,-73.968379
395,395,395,624,103 ST,M,6,40.790600,-73.947478
155,156,156,A18,103 ST,M,B C,40.796092,-73.961454
450,450,450,706,103 ST-CORONA PLAZA,Q,7,40.749865,-73.862700
...,...,...,...,...,...,...,...,...
456,456,456,712,WOODSIDE-61 ST,Q,7,40.745630,-73.902984
171,171,624,E01,WORLD TRADE CENTER,M,E,40.712582,-74.009781
328,328,328,138,WTC CORTLANDT,M,1,40.711835,-74.012188
235,235,235,F18,YORK ST,Bk,F,40.701397,-73.986751


In [11]:
# read in Complex DF
mtacomplexdf = pd.read_csv('./data/stations_complexes_mta.csv')

In [12]:
# drop columns we don't plan to use
mtacomplexdf.drop(['ADA','ADA Notes'],axis=1,inplace=True)

In [13]:
# rename columns
col_map = {'Complex ID':'COMPLEX_ID',
          'Complex Name':'COMPLEX_NAME'}
mtacomplexdf.rename(mapper=col_map,axis=1,inplace=True)

In [14]:
# read in Remote Booth DF
remoteboothdf = pd.read_csv('./data/remote_booth_station_mta.csv')
remoteboothdf.columns

Index(['Remote', 'Booth', 'Station', 'Line Name', 'Division'], dtype='object')

In [15]:
# rename columns
col_map2 = {'Remote':'REMOTE',
          'Booth':'BOOTH',
           'Station':'STATION',
           'Line Name': 'LINE_RBS',
           'Division':'DIVISION'}
remoteboothdf.rename(mapper=col_map2,axis=1,inplace=True)

In [16]:
#check
remoteboothdf.sort_values('STATION',inplace=True)

In [17]:
# can we merge from locations on STATION/STOP NAME


In [18]:
# merge complex DF with the locations DF -- we just want more details for the stations that comprise of a complex
# complexdf = mtacomplexdf.merge(locationsdf,how='left',on='COMPLEX_ID')
# complexdf.COMPLEX_NAME.fillna(complexdf.STOP_NAME,inplace=True)

In [19]:
# format Stop names and complex names
# complexdf['STOP_NAME'] = complexdf['STOP_NAME'].str.upper()
# complexdf['COMPLEX_NAME'] = complexdf['COMPLEX_NAME'].str.upper()

In [20]:
# check
# complexdf

In [21]:
# rearrange order of columns
col_names = ['STATION_ID','COMPLEX_ID','COMPLEX_NAME','STOP_NAME','BOROUGH','LINE','GTFS Stop ID', 'GTFS Latitude','GTFS Longitude']
complexdf = complexdf.reindex(columns=col_names)
complexdf.sort_values(['COMPLEX_ID'],inplace=True)
complexdf.shape

#80 rows

NameError: name 'complexdf' is not defined

In [22]:
wikicomplexdf = pd.read_csv('./data/stations_complexes_wiki.csv')
wikicomplexdf
# mine basically matches up with the mta data, so I will use what mta has provided

Unnamed: 0,COMPLEX_NAME,STATION_NAME,LINE,BOROUGH
0,14 St / 6 Av,14 St,123,Manhattan
1,14 St / 6 Av,14 St,FM,Manhattan
2,14 St / 6 Av,6 Av,L,Manhattan
3,14 St / 8 Av,14 St,ACE,Manhattan
4,14 St / 8 Av,8 Av,L,Manhattan
...,...,...,...,...
76,Times Sq - 42 St / Port Authority Bus Terminal,Times Sq,S,Manhattan
77,Times Sq - 42 St / Port Authority Bus Terminal,Times Sq,NQRW,Manhattan
78,Times Sq - 42 St / Port Authority Bus Terminal,Times Sq - 42 St,123,Manhattan
79,Times Sq - 42 St / Port Authority Bus Terminal,Times Sq,7,Manhattan


In [287]:
#50 unique stops
complexdf.STOP_NAME.nunique()

# 32 unique complex names
complexdf.COMPLEX_NAME.nunique()

complexdf['COMPLEX_NAME'] = complexdf.COMPLEX_NAME.apply(lambda x: x.replace(' - ','-'))
complexdf['COMPLEX_NAME'] = complexdf.COMPLEX_NAME.apply(lambda x: x.replace(' / ','/'))
complex_u = list(complexdf.COMPLEX_NAME.unique())
stop_u = list(complexdf.STOP_NAME.unique())

In [288]:
complexmta = mtadf[mtadf['STATION'].isin(complex_u)]
complexmta.STATION.value_counts()
stationmta = mtadf[mtadf['STATION'].isin(stop_u)]
stationmta.STATION.nunique()
# 27 of 32 complexes

27

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,03:00:00,REGULAR,6889287,2335920
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,07:00:00,REGULAR,6889299,2335936
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,11:00:00,REGULAR,6889364,2336038
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,15:00:00,REGULAR,6889605,2336101
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,19:00:00,REGULAR,6889966,2336173
...,...,...,...,...,...,...,...,...,...,...,...
197396,R622,R123,00-00-07,FRANKLIN AV,2345S,IRT,01/18/2019,07:00:00,REGULAR,10561407,21877674
197397,R622,R123,00-00-07,FRANKLIN AV,2345S,IRT,01/18/2019,11:00:00,REGULAR,10561852,21877934
197398,R622,R123,00-00-07,FRANKLIN AV,2345S,IRT,01/18/2019,15:00:00,REGULAR,10562162,21878207
197399,R622,R123,00-00-07,FRANKLIN AV,2345S,IRT,01/18/2019,19:00:00,REGULAR,10562504,21878988


In [289]:
#remoteboothdf = pd.read_csv('./data/remote_booth_station_mta.csv')
#remoteboothdf