In [1]:
import s3fs
import os
import pandas as pd
from zipfile import ZipFile
from collections import OrderedDict

In [2]:
fs = s3fs.S3FileSystem(anon=True)
fs.ls('tripdata')


['tripdata/201306-citibike-tripdata.zip',
 'tripdata/201307-201402-citibike-tripdata.zip',
 'tripdata/201307-citibike-tripdata.zip',
 'tripdata/201308-citibike-tripdata.zip',
 'tripdata/201309-citibike-tripdata.zip',
 'tripdata/201310-citibike-tripdata.zip',
 'tripdata/201311-citibike-tripdata.zip',
 'tripdata/201312-citibike-tripdata.zip',
 'tripdata/201401-citibike-tripdata.zip',
 'tripdata/201402-citibike-tripdata.zip',
 'tripdata/201403-citibike-tripdata.zip',
 'tripdata/201404-citibike-tripdata.zip',
 'tripdata/201405-citibike-tripdata.zip',
 'tripdata/201406-citibike-tripdata.zip',
 'tripdata/201407-citibike-tripdata.zip',
 'tripdata/201408-citibike-tripdata.zip',
 'tripdata/201409-citibike-tripdata.zip',
 'tripdata/201410-citibike-tripdata.zip',
 'tripdata/201411-citibike-tripdata.zip',
 'tripdata/201412-citibike-tripdata.zip',
 'tripdata/201501-citibike-tripdata.zip',
 'tripdata/201502-citibike-tripdata.zip',
 'tripdata/201503-citibike-tripdata.zip',
 'tripdata/201504-citibike-

In [3]:
yearly_data = OrderedDict()
for _file in fs.ls('tripdata'):
    if not _file.endswith('.zip'):
        continue
        
    year = _file.replace('tripdata/','').replace('JC-','')[:4]
    if not year in yearly_data:
        yearly_data[year] = [_file]
    else:
        yearly_data[year].append(_file)
yearly_data

OrderedDict([('2013',
              ['tripdata/201306-citibike-tripdata.zip',
               'tripdata/201307-201402-citibike-tripdata.zip',
               'tripdata/201307-citibike-tripdata.zip',
               'tripdata/201308-citibike-tripdata.zip',
               'tripdata/201309-citibike-tripdata.zip',
               'tripdata/201310-citibike-tripdata.zip',
               'tripdata/201311-citibike-tripdata.zip',
               'tripdata/201312-citibike-tripdata.zip']),
             ('2014',
              ['tripdata/201401-citibike-tripdata.zip',
               'tripdata/201402-citibike-tripdata.zip',
               'tripdata/201403-citibike-tripdata.zip',
               'tripdata/201404-citibike-tripdata.zip',
               'tripdata/201405-citibike-tripdata.zip',
               'tripdata/201406-citibike-tripdata.zip',
               'tripdata/201407-citibike-tripdata.zip',
               'tripdata/201408-citibike-tripdata.zip',
               'tripdata/201409-citibike-tripdata.z

In [4]:
renamed_columns = {'tripduration':'Trip Duration', 'starttime':'Start Time', 'stoptime':'Stop Time', 
                   'start station id':'Start Station ID', 'start station name':'Start Station Name', 
                   'start station latitude':'Start Station Latitude', 'start station longitude':'Start Station Longitude',
                   'end station id':'End Station ID', 'end station name':'End Station Name',
                   'end station latitude':'End Station Latitude', 'end station longitude':'End Station Longitude',
                   'bikeid':'Bike ID', 'usertype':'User Type', 'birth year':'Birth Year', 'gender':'Gender'}
for year, files in yearly_data.items():
    final_df = pd.DataFrame()
    for _file in files:
        fp = fs.open(_file)
        with ZipFile(fp) as zip:
            for index, member in enumerate(zip.namelist()):
                if "__MAC" in member:
                    continue
                with zip.open(member) as csv:
                   member_df = pd.read_csv(csv)
                   member_df = member_df.rename(columns=renamed_columns)
                   print(f"{member} {member_df.shape}")
                   final_df = pd.concat([final_df, member_df], sort=False)
                   print(f"Updated Set - {final_df.shape}")
    output_csv = os.path.join('..', 'data_output', f'{year}.csv')
    final_df.to_csv(output_csv, index=False)

201306-citibike-tripdata.csv (577703, 15)
Updated Set - (577703, 15)
2014-02 - Citi Bike trip data.csv (224736, 15)
Updated Set - (802439, 15)
2014-01 - Citi Bike trip data.csv (300400, 15)
Updated Set - (1102839, 15)
2013-12 - Citi Bike trip data.csv (443966, 15)
Updated Set - (1546805, 15)
2013-11 - Citi Bike trip data.csv (675774, 15)
Updated Set - (2222579, 15)
2013-10 - Citi Bike trip data.csv (1037712, 15)
Updated Set - (3260291, 15)
2013-09 - Citi Bike trip data.csv (1034359, 15)
Updated Set - (4294650, 15)
2013-08 - Citi Bike trip data.csv (1001958, 15)
Updated Set - (5296608, 15)
2013-07 - Citi Bike trip data.csv (843416, 15)
Updated Set - (6140024, 15)
2013-07 - Citi Bike trip data.csv (843416, 15)
Updated Set - (6983440, 15)
2013-08 - Citi Bike trip data.csv (1001958, 15)
Updated Set - (7985398, 15)
2013-09 - Citi Bike trip data.csv (1034359, 15)
Updated Set - (9019757, 15)
2013-10 - Citi Bike trip data.csv (1037712, 15)
Updated Set - (10057469, 15)
2013-11 - Citi Bike trip 

Updated Set - (17882026, 15)
JC-201812-citibike-tripdata.csv (20205, 15)
Updated Set - (17902231, 15)
201901-citibike-tripdata.csv (967287, 15)
Updated Set - (967287, 15)
201902-citibike-tripdata.csv (943744, 15)
Updated Set - (1911031, 15)
JC-201901-citibike-tripdata.csv (19676, 15)
Updated Set - (1930707, 15)
JC-201902-citibike-tripdata.csv (18565, 15)
Updated Set - (1949272, 15)
