## Data Cleaning for MTA DB

Notebook to clean project database.

In [10]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import mtafunctions

In [60]:
from mtafunctions import *

In [59]:
from importlib import reload
reload(mtafunctions)

<module 'mtafunctions' from '/Users/joycetagal/Github/metis-eda/mtafunctions.py'>

In [15]:
engine = create_engine("sqlite:///mta_data.db")

In [16]:
engine.table_names()

['mta_data']

## Data cleaning
### General data cleaning

In [17]:
turnstiles_df = pd.read_sql('SELECT * FROM mta_data', engine)

In [20]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/14/2021,00:00:00,REGULAR,7618705,2606187
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/14/2021,04:00:00,REGULAR,7618717,2606191
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/14/2021,08:00:00,REGULAR,7618732,2606216
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/14/2021,12:00:00,REGULAR,7618788,2606260
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/14/2021,16:00:00,REGULAR,7618905,2606291


In [21]:
# Parse datetime column
turnstiles_df["DATE_TIME"] = pd.to_datetime(turnstiles_df.DATE + " " + turnstiles_df.TIME, 
                                            format="%m/%d/%Y %H:%M:%S")

In [22]:
# Get rid of duplicate entries
turnstiles_df.sort_values(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], 
                          inplace=True, ascending=False)
turnstiles_df.drop_duplicates(subset=["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], inplace=True)

In [28]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
209414,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,21:00:00,REGULAR,5554,613,2021-08-20 21:00:00
209413,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,17:00:00,REGULAR,5554,613,2021-08-20 17:00:00
209412,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,13:00:00,REGULAR,5554,613,2021-08-20 13:00:00
209411,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,09:00:00,REGULAR,5554,613,2021-08-20 09:00:00
209410,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,05:00:00,REGULAR,5554,613,2021-08-20 05:00:00


In [29]:
# Sanity check for duplicates
(turnstiles_df
 .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
 .ENTRIES.count()
 .reset_index()
 .sort_values("ENTRIES", ascending=False)).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2016-08-27 00:00:00,1
5131309,R139,R031,04-00-02,34 ST-PENN STA,2021-05-28 04:00:00,1
5131321,R139,R031,04-00-02,34 ST-PENN STA,2021-05-30 04:00:00,1
5131320,R139,R031,04-00-02,34 ST-PENN STA,2021-05-30 00:00:00,1
5131319,R139,R031,04-00-02,34 ST-PENN STA,2021-05-29 20:00:00,1


In [30]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
209414,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,21:00:00,REGULAR,5554,613,2021-08-20 21:00:00
209413,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,17:00:00,REGULAR,5554,613,2021-08-20 17:00:00
209412,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,13:00:00,REGULAR,5554,613,2021-08-20 13:00:00
209411,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,09:00:00,REGULAR,5554,613,2021-08-20 09:00:00
209410,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,08/20/2021,05:00:00,REGULAR,5554,613,2021-08-20 05:00:00


### Daily exits and entries per turnstile

In [43]:
turnstiles_daily = (turnstiles_df
                    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"], as_index=False)
                    [['ENTRIES', 'EXITS']].first()
                   )

In [44]:
turnstiles_daily.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,01/01/2021,7511647,2558865
1,A002,R051,02-00-00,59 ST,01/02/2021,7511996,2559007
2,A002,R051,02-00-00,59 ST,01/03/2021,7512214,2559091
3,A002,R051,02-00-00,59 ST,01/04/2021,7512754,2559328
4,A002,R051,02-00-00,59 ST,01/05/2021,7513300,2559580


In [46]:
turnstiles_daily[["PREV_DATE", "PREV_ENTRIES", "PREV_EXITS"]] = (turnstiles_daily
                                                       .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATE", "ENTRIES", "EXITS"]]
                                                       .apply(lambda grp: grp.shift(1)))

In [49]:
turnstiles_daily.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

In [57]:
turnstiles_daily["DAILY_ENTRIES"] = turnstiles_daily.apply(get_daily_entries, axis=1, max_counter=1000000)

5800121 7622455.0
7508852 5827597.0
5330070 6743379.0
6657381 5352683.0
5326133 1638895.0
1530124 5334462.0
238 6641395.0
5033140 62362.0
6612907 5064767.0
4676092 6181446.0
6107562 4702591.0
5917002 7607992.0
7518750 5948375.0
9559835 12220030.0
12070031 9609313.0
7511935 9518226.0
9422266 7553777.0
6606886 187399.0
74937 6641367.0
2410319 4517331.0
4386794 2444534.0
3661348 5607712.0
5504869 3693768.0
2104972 3307986.0
3259087 2123840.0
6827478 96115.0
39794 6846174.0
8022594 99489.0
39474 8047934.0
1952535 757793.0
658599 1985829.0
13721773 1190792.0
1058557 13765445.0
2962458 757047.0
726874 2979594.0
118041253 119815431.0
119727833 118068132.0
3840294 5205321.0
5151329 3867594.0
5822652 145294.0
58912 5856448.0
1557234 5194264.0
5052951 1617108.0
13177753 15723158.0
15644778 13217398.0
9598405 11496210.0
11437505 9625331.0
2330372 3904735.0
3853931 2348373.0
2999078 4882420.0
4808265 3015594.0
51117 2370784.0
2254467 74354.0
15955362 20009116.0
19832695 15992892.0
26523659 4190983

127 1657032.0
8720148 44621.0
1650304 8728982.0
0 419446441.0
419446441 0.0
327680 756269765.0
2188419 61202.0
756204782 2217647.0
5969925 8141014.0
7986321 6000117.0
4805623 718058.0
590651 4832255.0
14369949 17476449.0
17213442 14410597.0
13172561 16079768.0
15861080 13212084.0
371419 2062710.0
1950187 394045.0
262152 553649999.0
553648404 194.0
262152 229572623.0
229572623 262152.0
1050706 2408883.0
2327018 1070146.0
2617710 4233137.0
4138940 2639861.0
8319445 1858234.0
1738605 8346435.0
1890808 4407432.0
4246820 1927265.0
3831270 6420772.0
6221194 3866702.0
353 8049630.0
5389107 24020.0
7883237 5425857.0
5118359 1990891.0
1859430 5152172.0
8489060 10450824.0
10329042 8515681.0
0 5309746.0
100663314 29.0
5309736 100663320.0
67114060 918888.0
807064 67138012.0
1156698 2258968.0
2188605 1170398.0
3436225 5394445.0
5266961 3458552.0
11435265 13834143.0
13673404 11468808.0
1902874 4353142.0
4192073 1938807.0
10016307 12274663.0
12136121 10049232.0
9677197 11663206.0
11548132 9708798.0
2

2147432315 8421808.0
6852836 2147387957.0
8387916 6875918.0
4298914 6083390.0
5994469 4324413.0
116175 1372240.0
1301195 132952.0
1490744 2734732.0
2646483 1507334.0
3066062 291458.0
269279 3089649.0
14391816 17442959.0
17245963 14443058.0
10089876 13346772.0
13157235 10138607.0
2767913 6431314.0
6183776 2826432.0
150994944 458752.0
458752 150994944.0
12556840 16680496.0
16450999 12609476.0
6551721 3406429.0
3206988 6597299.0
11437906 15284828.0
15078283 11487598.0
7667999 11482871.0
11325039 7729610.0
2102260 4921808.0
4757004 2140117.0
6277588 1841609.0
1745785 6303007.0
6526415 9151464.0
9014629 6556055.0
2461557 6365265.0
6114748 2511320.0
91277 1146348.0
515611 2315572.0
2223334 543695.0
1457708 142315.0
24524 1487115.0
14841618 19185414.0
18942775 14900578.0
4373325 959262.0
857034 4399657.0
16712028 6294957.0
177 16712669.0
4865427 63549.0
6268526 4885002.0
2815414 683576.0
616143 2828986.0
4291167 5994562.0
5874298 4317045.0
2228468 108404.0
30164 2243243.0
4623132 5701118.0
59

4228899 1820514.0
1730082 4267570.0
7885472 10596743.0
10480401 7923470.0
6026334 9001702.0
8854938 6068177.0
5946 4298704.0
2075861 25073.0
4227282 2104999.0
6383764 809779.0
787943 6399065.0
4790873 6463397.0
6385571 4812935.0
5993447 8434167.0
8330086 6024844.0
6700873 9351185.0
9219347 6737545.0
5373632 640209.0
531214 5408084.0
1563145 3316406.0
3232892 1589892.0
8182940 9617463.0
9554875 8205565.0
9154760 1283850.0
1217756 9178652.0
12926161 508192.0
370529 12961683.0
50332331 524672.0
524398 50332400.0
0 117440512.0
117440512 0.0
4966214 6383340.0
6312999 4985431.0
2394588 48106.0
10863 2408001.0
4182978 5573693.0
5522664 4200880.0
7466260 1947074.0
1836256 7501141.0
4060777 227269.0
215596 4066866.0
6200495 7257140.0
4740020 6170827.0
6097093 4764379.0
7302394 703296.0
631992 7328603.0
1141762 4855260.0
4718506 1197144.0
2309876 1217693844.0
1217558465 2367239.0
3644576 7438586.0
7298762 3704115.0
18075600 21633224.0
21504062 18128589.0
16083937 19524379.0
19410358 16133430.0
9

1534984195 0.0
4 1534984197.0
67109632 14.0
327682 67109640.0
67109632 1.0
15 67109632.0
4481537 751893.0
666080 4502436.0
552394 2576468.0
2564222 554742.0
1911072 3121643.0
3040091 1929035.0
4598216 5622977.0
2364410 192891.0
123564 2382011.0
12517611 82.0
4500714 12530087.0
726281 4520328.0
2396210 68305.0
9054 2403876.0
36 1198448.0
1894825 118785.0
83939 1901714.0
1843530 392823.0
331401 1857840.0
3582710 41157.0
3823 3591690.0
4635498 481165.0
382880 4656200.0
1537828 2811421.0
2713255 1556228.0
1202629 6722.0
7918034 9433059.0
9352813 7939612.0
5725155 91462.0
21048 5742776.0
5029102 293040.0
122349 5059192.0
23157312 1627480738.0
4960571 23218049.0
1627454017 4985415.0
1401148031 1398753752.0
1398901115 1401112520.0
2334764 5272479.0
5085133 2379065.0
554526016 557878359.0
557662659 554576543.0
8982057 12137266.0
11905708 9026692.0
2438756 721569521.0
721537951 2447987.0
2718944 4311487.0
4244277 2740172.0
46 5405351.0
2281729 24137.0
5249843 2325516.0
67108864 0.0
7886110 1827

662379 1734141.0
6282248 8341544.0
8190177 6309950.0
9692265 11245560.0
11148764 9714474.0
1238513 2346286.0
2281858 1254185.0
10342541 11731882.0
11635762 10362357.0
327 2524852.0
2368665 632959.0
1529388 4757713.0
4569463 1574799.0
1807181 5606797.0
5392484 1859753.0
497680 1800855.0
1724225 515172.0
3722228 845357.0
762490 3736416.0
2044591 84435714.0
84393912 2055242.0
4885840 5898217.0
2581261 51675.0
91732 2595156.0
3276409 266407.0
200583 3285910.0
819416 1910774.0
1843812 834359.0
8636773 10450994.0
10338344 8662330.0
1379094 254514.0
221593 1385163.0
3163323 5109669.0
4959352 3192547.0
808010 2083521.0
2001224 826579.0
6366425 7619164.0
7555408 6383296.0
2229274 3385092.0
3319388 2250895.0
3303311 6251111.0
6058674 3347980.0
7543065 9938038.0
9774180 7576892.0
1806095 131401.0
40744 1826553.0
6122256 7267833.0
7188517 6137874.0
3700357 320571.0
294439 3708189.0
8355828 9924939.0
9837948 8377261.0
2734095 5055923.0
4918864 2766025.0
2164175 5157989.0
4931206 2208149.0
18945724 

2821897 3865298.0
2064469 3340584.0
3254623 2081122.0
6322991 7967852.0
7876157 6346187.0
1084260 2289460.0
6395102 7661108.0
7567845 6411630.0
6432204 7564168.0
7498886 6447759.0
11176627 12616977.0
12538084 11198124.0
2493756 3840687.0
3768022 2513291.0
100745471 31921.0
6142 100752076.0
9671774 118385602.0
118343632 9708132.0
3744634 5608731.0
5501862 3771026.0
67118203 429968.0
409132 67121704.0
262144 537268042.0
537268042 713185.0
1689147 2867903.0
2819776 1704578.0
611076 3229721.0
3082363 651566.0
118033043 119148576.0
119088992 118047558.0
3839348 4885586.0
6262787 7971475.0
7850883 6288890.0
4397279 36997.0
23475 4404076.0
4272022 411621.0
362627 4278280.0
5181640 6258357.0
7717889 9422989.0
9309832 7745192.0
11694593 14710061.0
14494833 11741844.0
262185 67617764.0
1289578 289390.0
67615434 1300018.0
67145668 645665.0
613734 67150924.0
67112097 599240.0
579579 67113355.0
2494441 4288908.0
4166103 2525630.0
9246672 10838625.0
10720576 9283352.0
5244819 2023152.0
1975922 52603

2293760 458752.0
458752 2293760.0
1036427240 1038082015.0
1038018336 1036452632.0
986229 2246436.0
1443 1014193.0
2134151 3392.0
4081900 297145.0
244928 4098183.0
1805046 2976725.0
2927027 1821842.0
4808937 5994949.0
5948234 4826282.0
7100772 8536288.0
8479284 7121823.0
126 3102523.0
1133665 8300.0
3021415 1162230.0
10083349 11440194.0
11404670 10102147.0
13045743 15072583.0
15008500 13073740.0
2795842 5844877.0
5726127 2836997.0
15439231 6237114.0
338 15440422.0
3139529 75103.0
6228084 3182129.0
9449706 12649196.0
12488037 9493555.0
3431 1360129.0
1360035 3450.0
7414680 681245.0
636190 7447837.0
34507440 1997699.0
1943241 34533859.0
101276298 21106.0
19 101292247.0
2699201 262558.0
262160 2701722.0
4098669 14139.0
232928 4125093.0
4049396 385337.0
262501 4085621.0
5298014 348208.0
230376 5338198.0
1511128904 365603.0
293690 1511183620.0
8177874 318787.0
150065 8248205.0
1226549062 879867.0
687682 1226593322.0
9078590 11290943.0
11168764 9109512.0
10349106 12723684.0
12592986 10382866.

4518224 227766.0
102973 4552599.0
2309555 4414798.0
4315415 2338771.0
1899974 9474.0
11705 1907411.0
2484600 3536924.0
7023467 8325131.0
8266568 7043622.0
36 1380657.0
1711628 2839462.0
2794460 1727708.0
2425551 3789023.0
3738858 2445739.0
3370911 5905864.0
5790925 3408290.0
393362 104110933.0
101628878 409155.0
104046992 101659639.0
117552482 120119602.0
120001120 117583878.0
447309 2488422.0
2440612 476188.0
9288969 10772507.0
10752226 9309618.0
4081044 959146.0
947488 4093771.0
2657417 4859792.0
4851030 2683499.0
529381 591941635.0
591954143 566141.0
4486580 7703012.0
7681579 4539607.0
4433868 9028001.0
8989753 4515272.0
5328675 18867.0
12 5333964.0
6457244 23764.0
5 6465652.0
4805185 6247896.0
6219146 4817832.0
4853537 30800.0
33 4874061.0
8044997 1254619187.0
1254645560 8080476.0
1945768 287545.0
162249 1965501.0
5979425 174396.0
40963 6000927.0
315 2844675.0
2834681 915337.0
1639903 219603.0
186655 1672453.0
2633949 686045.0
660231 2702583.0
12865172 305545.0
271774 12909372.0
17

5059515 73275498.0
1560526265 5078311.0
117550546 235714.0
183234 117559063.0
10151212 13913707.0
13681593 10204539.0
9623604 12202703.0
12049087 9663194.0
2951746 67286086.0
67217309 2970170.0
1695868 75800.0
820 1705758.0
4177532 511510.0
458219 4192869.0
6890632 8969498.0
8806980 6919152.0
947187 285348610.0
285327798 951604.0
4420902 5422168.0
805969 2264971.0
2148830 826994.0
117440532 458752.0
0 117440532.0
35 10737302.0
8793633 18010.0
10639079 8822950.0
12192558 99015438.0
98793051 12241617.0
11863528 15098557.0
14903222 11913640.0
268587154 336482.0
295280 268594117.0
5852896 8040065.0
7895521 5885362.0
168410328 169836578.0
169752808 168430920.0
326385 1420234.0
1353912 342967.0
10721239 1174661.0
960587 10768786.0
8649194 208428.0
88679 8683291.0
689826876 688183085.0
688287177 689803771.0
4729063 1377850.0
1274167 4748559.0
6167560 7647129.0
7557089 6188846.0
2197337 3655165.0
3574441 2221541.0
1935663 406889.0
294934 1961067.0
420369197 555493239.0
555406806 420393889.0
34

1706625 42455.0
40156 1713772.0
7692872 10167647.0
9922292 7730250.0
7464216 9462601.0
9284981 7492344.0
5882344 712762.0
600941 5901467.0
1128308 2157993.0
3678085 1092145.0
1019662 3695923.0
8879420 164331.0
47434 8908067.0
14922808 18203274.0
18004984 14970803.0
632 4266134.0
3478355 237979.0
11507800 17712420.0
17237544 11607008.0
2328939 784197.0
461342 2387582.0
370083 3708148.0
3480458 418991.0
12501255 348496.0
141385 12545164.0
12559066 16022234.0
15767851 12608487.0
16115856 605208.0
251425 16180022.0
1057818 2802007.0
2380820 1135656.0
4844094 6290726.0
7040786 8956391.0
8844474 7069324.0
9677618 12106930.0
12000634 9715192.0
12647198 264835.0
137225 12694796.0
19855764 23416735.0
23343891 19912701.0
17128835 20802391.0
20714967 17187934.0
4863390 7566428.0
7497833 4906660.0
184934 1357607.0
1267809 195449.0
278467 1955684.0
1833658 294935.0
346766 2285177.0
2149878 367269.0
354204 2355293.0
2232117 377208.0
303314 2045803.0
1937908 322571.0
2257130 359198.0
275268 2271895.0

In [61]:
turnstiles_daily["DAILY_EXITS"] = turnstiles_daily.apply(get_daily_exits, axis=1, max_counter=1000000)

2857060 5475192.0
5305553 2900775.0
8425423 2563927.0
2380419 8463812.0
88 8869456.0
6968307 86951.0
8828255 7003528.0
5604612 7044455.0
6946538 5632482.0
4186397 7785044.0
7552296 4236994.0
2351890 3441126.0
3379453 2368378.0
3304121 16611620.0
16581243 3313083.0
3013995 37763.0
14583 3020969.0
3593326 52970.0
21952 3602208.0
14275947 1161352.0
953691 14313161.0
6393508 572446.0
488027 6414776.0
993921 3413230.0
3294905 1036721.0
2307425 45043.0
16142 2317676.0
661466 3129648.0
2921787 683149.0
4883362 6281066.0
6190195 4895876.0
4566586 5568475.0
5415473 683597.0
557466 5432725.0
493419121 1706860.0
1607487 493367005.0
16992789 1195246.0
1104661 17016345.0
1292483753 1375718.0
1254607 1292504527.0
1257498309 1264722.0
1118370 1257472027.0
1777253508 3184953.0
153 1777224891.0
2985544 7356.0
45524 18816668.0
18808947 134386.0
33166853 39091136.0
38821593 33254920.0
4083484 5980618.0
5950976 4112485.0
1706181 2748662.0
2729906 1725132.0
1448464 3265645.0
3140222 1481702.0
128253 516793

8365801 9587024.0
9471890 8382246.0
4328099 5553639.0
5356438 4336053.0
7672025 1140713.0
899542 7698465.0
9353672 11493693.0
11312008 9387479.0
9848326 11805845.0
11673873 9881880.0
6102245 7418614.0
7314431 6119676.0
463975 1477151.0
2709683 744079.0
574433 2747495.0
5517377 4085621.0
3939266 5593851.0
79293 5328493.0
5207023 99096.0
2562195 8891858.0
8792925 2574247.0
5733322 2921271.0
2802220 5739287.0
1431106 42696.0
31700 1439122.0
9670977 10810886.0
10781129 9718110.0
6347446 318578.0
255854 6347534.0
4665016 2007380.0
1962398 4665660.0
3641871 966441.0
835660 3689857.0
4030969 105735.0
5364 4059833.0
8172558 1762743.0
1641963 8212547.0
5265988 261622.0
197095 5286693.0
3690380 4795258.0
4941563 5986776.0
4311064 615159.0
498076 4314578.0
483734 1547400.0
7455789 9006482.0
8889559 7475747.0
137906 2024905.0
1875235 162816.0
3485973 6175893.0
5929905 3520550.0
16777216 0.0
0 16777216.0
67208614 143978.0
115882 67212928.0
4555783 5903850.0
5770863 4574149.0
98888 1627567.0
1483644

9307342 10539453.0
10481234 9324070.0
2821203 5099660.0
4970408 2852396.0
18764350 21791320.0
21620143 18807473.0
975959 2384916.0
2262294 993022.0
4094227 5473457.0
5386941 4112474.0
4561682 6080151.0
5984874 4581811.0
7681970 9305491.0
9185001 7704530.0
1959602 20302939.0
20116577 2010211.0
17289027 5553908.0
5283621 17332070.0
3472793 7129220.0
6910373 3531358.0
2583415 756839.0
592113 2623231.0
15674568 1145.0
156620 15713358.0
2347780 3481838.0
3401189 2363614.0
3767048 15210.0
528197 3776855.0
8848856 10334721.0
10182554 8865947.0
55866073 9799105.0
8910024 55987781.0
170746875 179360019.0
178646027 170834848.0
219883799 225250954.0
224791778 219938240.0
108327196 2703851.0
2453363 108357082.0
805653596 806675304.0
923295769 924706500.0
924595483 923311503.0
1829238600 1830724438.0
1830564732 1829254009.0
9478773 10527136.0
133440 1268423.0
1201399 151942.0
10662451 526410.0
418535 10682931.0
3265094 8527122.0
8434214 3279059.0
11641615 3950979.0
3382284 11750174.0
18770512 35658

1276470 604625854.0
604464499 1297911.0
465 705468481.0
2288390 5079.0
705344826 2305413.0
3890 16992139.0
16944574 11153.0
440073 2336167.0
2182905 462536.0
3644231 4862626.0
4769808 3659209.0
1914595 110219.0
29854 1926808.0
4175744 5858472.0
5689946 4199117.0
158 11465591.0
9212870 159956.0
11393977 9243197.0
801515 202458695.0
201391476 803021.0
1517268 57473.0
5080 1526041.0
1126518 115894.0
33773 1155069.0
9519317 12529686.0
12385905 9558422.0
7935068 10484262.0
10354953 7967530.0
772349829 378825.0
307501 772364753.0
5361427 8329774.0
8182140 5397381.0
3582225 6772908.0
6613097 3625149.0
1124269 4180201.0
4032254 1164042.0
6304258 146792.0
46163 6335344.0
501275 1963599.0
1894633 521278.0
26804895 32016306.0
31671585 26804895.0
9832995 11500895.0
11424116 9866242.0
1386871 2924180.0
2858661 1406917.0
7681576 9438768.0
9364359 7705425.0
2632235 5761279.0
5611996 2673341.0
1679282 2910652.0
2872280 1694625.0
2393850 102112.0
98078 2395417.0
15866725 18086176.0
17992311 15898030.0


1179 3095180.0
1586109 681.0
3033949 1612713.0
1769164 56082.0
176874 1795196.0
1923934 3267254.0
3201356 1943817.0
3789099 6701433.0
6503614 3824123.0
2363157 4771312.0
4581756 2392086.0
16777827 65676.0
65573 16777859.0
0 1544057837.0
1544057837 0.0
17666723 22919709.0
22437322 17736720.0
5295202 2086393.0
1817378 5340049.0
359 1406320670.0
1408450717 166069.0
1406329319 1408419976.0
1270292 3303101.0
3156197 1299362.0
13744807 15819942.0
15669918 13774460.0
6359657 8692353.0
8504260 6393230.0
4358011 7072840.0
6823990 4398053.0
2990222 1062991.0
713931 3037611.0
8449139 11074559.0
10882380 8482572.0
3104317 5056154.0
4928116 3131138.0
4108373 5663824.0
5547957 4130226.0
5082022 92487.0
22347 5110952.0
22840254 27406166.0
27058729 22901692.0
5570519 9020502.0
8786097 5613400.0
11607126 14948054.0
14722071 11647001.0
1386590 3695876.0
3480669 1418675.0
3976027 665952.0
511500 3995767.0
4643574 323846.0
203746 4663973.0
5971311 7889371.0
7748387 5997555.0
2318118 4523563.0
4364378 2348

359 17305251.0
1740558 4851496.0
4754629 1784849.0
319398498 320611417.0
320564415 319421567.0
5673816 9152111.0
8985932 5723242.0
6357157 729415.0
487564 6424859.0
21607505 671828.0
419052 21678373.0
85923976 457638.0
211833 85989081.0
7135923 179869.0
375984 7189629.0
12846469 602991.0
507696 12877676.0
1531694 3049766.0
2948245 1553668.0
2376375 6821999.0
6518151 2425104.0
6775161 150507.0
36754 6813965.0
74 4035691.0
2603022 35686.0
4003462 2635585.0
3675746 55671.0
14758 3696157.0
748655 1772227.0
2920935 129312.0
92130 2943402.0
1901145 54350.0
15030 1917730.0
2642442 23520.0
23041 2662543.0
5466827 97711.0
38959 5483862.0
6291236 7627350.0
7589316 6306596.0
7766567 9860868.0
9791351 7793178.0
654378 3722924.0
3605333 696009.0
23677321 27789821.0
27604440 23735444.0
4045768 6896564.0
6715271 4086901.0
294054 2201113.0
2071354 322181.0
1738417 2923847.0
2858027 1756385.0
1630944 359213.0
283445 1647466.0
2916538 149896.0
126159 2926341.0
750385 1816779.0
1702016 2521.0
10424781 91

1150903 2716756.0
2615498 1175020.0
7360730 8603468.0
8538273 7381849.0
27818712 2295875.0
1929475 27910524.0
2366389 4379921.0
4290353 2402505.0
386191505 38232.0
30967 386195842.0
50377384 40936.0
36423 50378240.0
1296099 2542103.0
2473883 1314287.0
4580446 1085674.0
176 4593453.0
1023547 2272.0
2759580 31223672.0
31176916 2774335.0
14448157 16660166.0
16488645 14477041.0
3346911 494512.0
477659 3355674.0
21623919 25693242.0
25498611 21680846.0
184955 2459375.0
2369441 216553.0
386300298 569521976.0
569483004 386314009.0
1875955 232819.0
213391 1883231.0
3673535 171398.0
142246 3682773.0
58943 1215338.0
1170770 74155.0
10363811 12212086.0
12132917 10388252.0
12659505 14812336.0
14692281 12689021.0
14268150 2200871.0
1928961 14324718.0
8093204 9381797.0
9302714 8113795.0
3577009 5054126.0
4955090 3598888.0
2046252 4753212.0
4544693 2086865.0
4250184 6811598.0
6588691 4289622.0
702276 1826364.0
1738385 717617.0
651441 1743628.0
14706630 20137389.0
19829262 14785948.0
2593871 4136021.0


653802 3165730.0
2920895 683849.0
1342348 2417892.0
2899871 41375.0
69830 2908573.0
8901848 10433180.0
10282701 8926960.0
33554621 222.0
88 33554624.0
3031603 4940028.0
4608081 3110909.0
1370399 4087188.0
3937697 1405866.0
1898278 164027.0
106035 1914730.0
269742273 260351.0
227859 269748755.0
3437707 133300.0
100555 3449888.0
16783545 140877.0
122856 16788652.0
4533837 7699719.0
7424130 4578910.0
1866465 4241805.0
4078694 1898394.0
10009533 13717658.0
13471724 10061526.0
2652014 4233872.0
4062882 2673803.0
2800759 4114019.0
4032321 2822154.0
3894971 4920650.0
3695182 95675.0
42411 3705235.0
3495108 184247.0
68039 3515913.0
4569230 6297649.0
6181722 4593687.0
4961931 6203640.0
6130985 4979459.0
625431 1716642.0
1644961 640386.0
7140021 1865995.0
1666840 7169811.0
4721793 6124031.0
6015200 4739123.0
464277 1527267.0
6644811 9249858.0
8903061 6656492.0
5494611 6783362.0
6663080 5510064.0
2113548 3283366.0
3210522 2140050.0
10361045 13251570.0
12837178 10391098.0
3784073 6597387.0
6383663

341 1078346884.0
1078346876 341.0
2410679 2.0
2 2410679.0
3980632 0.0
0 3980632.0


In [62]:
turnstiles_daily.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,EXITS,PREV_DATE,PREV_ENTRIES,PREV_EXITS,DAILY_ENTRIES,DAILY_EXITS
1,A002,R051,02-00-00,59 ST,01/02/2021,7511996,2559007,01/01/2021,7511647.0,2558865.0,349.0,142.0
2,A002,R051,02-00-00,59 ST,01/03/2021,7512214,2559091,01/02/2021,7511996.0,2559007.0,218.0,84.0
3,A002,R051,02-00-00,59 ST,01/04/2021,7512754,2559328,01/03/2021,7512214.0,2559091.0,540.0,237.0
4,A002,R051,02-00-00,59 ST,01/05/2021,7513300,2559580,01/04/2021,7512754.0,2559328.0,546.0,252.0
5,A002,R051,02-00-00,59 ST,01/06/2021,7513790,2559817,01/05/2021,7513300.0,2559580.0,490.0,237.0


In [63]:
station_daily = turnstiles_daily.groupby(["STATION", "DATE"])[['DAILY_ENTRIES', 'DAILY_EXITS']].sum().reset_index()

In [65]:
station_daily.head()

Unnamed: 0,STATION,DATE,DAILY_ENTRIES,DAILY_EXITS
0,1 AV,01/02/2021,3673.0,6054.0
1,1 AV,01/03/2021,2547.0,4537.0
2,1 AV,01/04/2021,5303.0,8145.0
3,1 AV,01/05/2021,5468.0,8332.0
4,1 AV,01/06/2021,5603.0,8532.0


In [66]:
station_daily.shape

(97466, 4)

In [87]:
# Get station names from df

station_names = pd.Series(station_daily['STATION'].unique())

In [143]:
station_strkeep = ['1 AV', '34 ST-PENN STA', '34 ST-HERALD SQ', '33 ST', '28 ST', '23 ST', '18 ST', '14 ST', 
                   '14 ST-UNION SQ', '8 ST-NYU', 'ASTOR PL', '3 AV', 'W 4 ST-WASH SQ', 'BLEECKER ST', "B'WAY-LAFAYETTE",
                   '2 AV', '34 ST-HUDSON YD', 'DELANCEY/ESSEX', 'PRINCE ST', 'CHRISTOPHER ST', 'HOUSTON ST', 'SPRING ST',
                   'CANAL ST', 'BOWERY', 'GRAND ST', 'EAST BROADWAY', 'CITY HALL', 'CORTLANDT ST', 'RECTOR ST', 'FRANKLIN ST',
                   'CHAMBERS ST', 'BROOKLYN BRIDGE', 'FULTON ST', 'WALL ST', 'PARK PLACE', 'WORLD TRADE CTR', 'BOWLING GREEN',
                   'SOUTH FERRY', 'WHITEHALL S-FRY'                  
                  ]

In [206]:
stationpoints = pd.read_csv('/Users/joycetagal/Github/metis-eda/DOITT_SUBWAY_STATION_01_13SEPT2010.csv')
stationpoints['NAME'] = stationpoints['NAME'].str.upper()

In [214]:
stationpoints.size

2838

In [215]:
## Take the first location for each station (to simplify)
stationpoints = stationpoints.groupby('NAME', as_index=False).NAME.first()

In [218]:
stationpoints.size

355

In [285]:
stationpoints.replace('8 ST - NYU', "8 ST-NYU", inplace=True)

In [287]:
station_csv_names = stationpoints[stationpoints['NAME'].isin(station_strkeep)]['NAME']

In [1]:
stations_lower = stationpoints[stationpoints['STATION'].isin(station_strkeep)]

NameError: name 'stationpoints' is not defined

In [298]:
stations_lower.to_csv('lowermanstationlocs.csv')