In [26]:
import numpy as np
import s3fs
import requests
import pandas as pd
import zipfile
import io


In [67]:
def print_csv_names(bucket_name, file_name):
    path_to_file = f'../data/{bucket_name}/{file_name}'
    zip_file = zipfile.ZipFile(path_to_file)
    fs = {month: {} for month in month_list}
    zip_file.printdir()
    for text_file in zip_file.infolist():
        if text_file.filename.endswith('.csv'):
            print(text_file.filename)

In [68]:
print_csv_names('tripdata', '2013-citibike-tripdata.zip')

File Name                                             Modified             Size
2013-citibike-tripdata/                        2024-02-22 00:06:56            0
2013-citibike-tripdata/4_April/                2024-02-14 10:09:16            0
2013-citibike-tripdata/12_December/            2024-02-22 00:03:04            0
2013-citibike-tripdata/.DS_Store               2024-02-22 00:07:16        12292
__MACOSX/2013-citibike-tripdata/._.DS_Store    2024-02-22 00:07:16          120
2013-citibike-tripdata/201309-citibike-tripdata.csv 2014-03-27 23:40:34    201965642
__MACOSX/2013-citibike-tripdata/._201309-citibike-tripdata.csv 2014-03-27 23:40:34          276
2013-citibike-tripdata/11_November/            2024-02-22 00:03:00            0
2013-citibike-tripdata/7_July/                 2024-02-22 00:02:38            0
2013-citibike-tripdata/201311-citibike-tripdata.csv 2014-03-27 23:43:44    131891356
__MACOSX/2013-citibike-tripdata/._201311-citibike-tripdata.csv 2014-03-27 23:43:44          27

In [86]:
month_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

def load_zip_csv_offline(bucket_name, file_name):
    
    path_to_file = f'../data/{bucket_name}/{file_name}.zip'
    zip_file = zipfile.ZipFile(path_to_file, 'r')
    print(zip_file.namelist())
    fs = {month: [] for month in month_list}
    print(zip_file)
    for text_file in zip_file.infolist():
        cur_filename = text_file.filename
        if cur_filename.endswith('.csv') and not cur_filename.startswith('__MACOSX'):
            dir_names = text_file.filename.split('/')
            print(dir_names)
            if len(dir_names) == 3:
                    year = dir_names[0].split('-')[0]
                    month = dir_names[1].split('_')[1]
                    if month in month_list:
                        # Per month there can be more than one CSV file
                        fs[month].append(pd.read_csv(zip_file.open(text_file.filename), encoding='latin-1'))     
    return fs

def load_zip_csv_url(bucket_name, file_name):
    response = requests.get(f'https://s3.amazonaws.com/{bucket_name}/{path_to_file}')
    if response.status_code == 200:
        print("Successfully downloaded the zip file.")
        
        # Load the zip file into memory
        buffer = io.BytesIO(response.content)
        
        # Open the zip file
        with zipfile.ZipFile(buffer) as z:
            # List files in the archive
            csv_files = [f for f in z.namelist() if f.endswith('.csv')]
            print(csv_files)
            print(f"Found {len(csv_files)} CSV files in the zip file.")

def load_zip_csv(bucket_name, file_name):
    try:
        print('Trying to load file from disk.')
        load_zip_csv_offline(bucket_name, file_name)
        print('Loaded file from disk.')

    except FileNotFoundError:
        print('Could not load from disk, loading from URL.')
        load_zip_csv_url(bucket_name, file_name)
        print('Loaded file from URL.')
    except:
        raise FileNotFoundError("Could not load file from disk or URL.")


In [87]:
data = load_zip_csv_offline('tripdata', '2013-citibike-tripdata')

['2013-citibike-tripdata/', '2013-citibike-tripdata/4_April/', '2013-citibike-tripdata/12_December/', '2013-citibike-tripdata/.DS_Store', '__MACOSX/2013-citibike-tripdata/._.DS_Store', '2013-citibike-tripdata/201309-citibike-tripdata.csv', '__MACOSX/2013-citibike-tripdata/._201309-citibike-tripdata.csv', '2013-citibike-tripdata/11_November/', '2013-citibike-tripdata/7_July/', '2013-citibike-tripdata/201311-citibike-tripdata.csv', '__MACOSX/2013-citibike-tripdata/._201311-citibike-tripdata.csv', '2013-citibike-tripdata/201307-citibike-tripdata.csv', '__MACOSX/2013-citibike-tripdata/._201307-citibike-tripdata.csv', '2013-citibike-tripdata/10_October/', '2013-citibike-tripdata/9_September/', '2013-citibike-tripdata/8_August/', '2013-citibike-tripdata/6_June/', '2013-citibike-tripdata/3_March/', '2013-citibike-tripdata/201308-citibike-tripdata.csv', '__MACOSX/2013-citibike-tripdata/._201308-citibike-tripdata.csv', '2013-citibike-tripdata/1_January/', '2013-citibike-tripdata/201306-citibike

In [88]:
print(data)


{'January': [], 'February': [], 'March': [], 'April': [], 'May': [], 'June': [        tripduration            starttime             stoptime  \
0                695  2013-06-01 00:00:01  2013-06-01 00:11:36   
1                693  2013-06-01 00:00:08  2013-06-01 00:11:41   
2               2059  2013-06-01 00:00:44  2013-06-01 00:35:03   
3                123  2013-06-01 00:01:04  2013-06-01 00:03:07   
4               1521  2013-06-01 00:01:22  2013-06-01 00:26:43   
...              ...                  ...                  ...   
577698           925  2013-06-30 23:59:27  2013-07-01 00:14:52   
577699           279  2013-06-30 23:59:36  2013-07-01 00:04:15   
577700           161  2013-06-30 23:59:33  2013-07-01 00:02:14   
577701           909  2013-06-30 23:59:47  2013-07-01 00:14:56   
577702           634  2013-07-01 00:00:00  2013-07-01 00:10:34   

        start station id      start station name  start station latitude  \
0                    444      Broadway & W 24 St     

In [44]:
#citibike_data_path = './data/citibike/'
#path_to_file = '2013-citibike-tripdata.zip'
#"C:\Users\Joris\Projects\coding-challenge-AXA\data\tripdata\2013-citibike-tripdata.zip"
load_zip_csv_offline('tripdata', '2013-citibike-tripdata.zip')
load_zip_csv_url('tripdata', '2013-citibike-tripdata.zip')

2013-citibike-tripdata/201309-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201309-citibike-tripdata.csv
2013-citibike-tripdata/201311-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201311-citibike-tripdata.csv
2013-citibike-tripdata/201307-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201307-citibike-tripdata.csv
2013-citibike-tripdata/201308-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201308-citibike-tripdata.csv
2013-citibike-tripdata/201306-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201306-citibike-tripdata.csv
2013-citibike-tripdata/201310-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201310-citibike-tripdata.csv
2013-citibike-tripdata/201312-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201312-citibike-tripdata.csv
2013-citibike-tripdata/12_December/201312-citibike-tripdata_1.csv
2013-citibike-tripdata/11_November/201311-citibike-tripdata_1.csv
2013-citibike-tripdata/7_July/201307-citibike-tripdata_1.csv
20

In [47]:
for i in range(0, 12):
    load_zip_csv_offline('tripdata', f'{2013 + i}-citibike-tripdata.zip')

2013-citibike-tripdata/201309-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201309-citibike-tripdata.csv
2013-citibike-tripdata/201311-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201311-citibike-tripdata.csv
2013-citibike-tripdata/201307-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201307-citibike-tripdata.csv
2013-citibike-tripdata/201308-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201308-citibike-tripdata.csv
2013-citibike-tripdata/201306-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201306-citibike-tripdata.csv
2013-citibike-tripdata/201310-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201310-citibike-tripdata.csv
2013-citibike-tripdata/201312-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201312-citibike-tripdata.csv
2013-citibike-tripdata/12_December/201312-citibike-tripdata_1.csv
2013-citibike-tripdata/11_November/201311-citibike-tripdata_1.csv
2013-citibike-tripdata/7_July/201307-citibike-tripdata_1.csv
20

FileNotFoundError: [Errno 2] No such file or directory: '../data/tripdata/2024-citibike-tripdata.zip'