In [3]:
import numpy as np
import s3fs
import requests
import pandas as pd
import zipfile
import io


In [4]:
def print_csv_names(bucket_name, file_name):
    path_to_file = f'../data/{bucket_name}/{file_name}'
    zip_file = zipfile.ZipFile(path_to_file)
    fs = {month: {} for month in month_list}
    #zip_file.printdir()
    for text_file in zip_file.infolist():
        if text_file.filename.endswith('.csv'):
            print(text_file.filename)

In [5]:
print_csv_names('tripdata', '2013-citibike-tripdata.zip')

2013-citibike-tripdata/201309-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201309-citibike-tripdata.csv
2013-citibike-tripdata/201311-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201311-citibike-tripdata.csv
2013-citibike-tripdata/201307-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201307-citibike-tripdata.csv
2013-citibike-tripdata/201308-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201308-citibike-tripdata.csv
2013-citibike-tripdata/201306-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201306-citibike-tripdata.csv
2013-citibike-tripdata/201310-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201310-citibike-tripdata.csv
2013-citibike-tripdata/201312-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201312-citibike-tripdata.csv
2013-citibike-tripdata/12_December/201312-citibike-tripdata_1.csv
2013-citibike-tripdata/11_November/201311-citibike-tripdata_1.csv
2013-citibike-tripdata/7_July/201307-citibike-tripdata_1.csv
20

# I only extract the monthly csv data, e.g. 
### 2013-citibike-tripdata/10_October/201310-citibike-tripdata_1.csv and 
### 2013-citibike-tripdata/10_October/201310-citibike-tripdata_2.csv
### for October

In [16]:
month_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Input: path or buffer
# Output: dict with month as key and list of pandas dataframes as values
def process_zip_file(input_data) -> dict:
    fs = {month: [] for month in month_list}
    
    with zipfile.ZipFile(input_data, 'r') as zip_file:
        for text_file in zip_file.infolist():
            cur_filename = text_file.filename
            if cur_filename.endswith('.csv') and not cur_filename.startswith('__MACOSX'):
                dir_names = text_file.filename.split('/')
                if len(dir_names) == 3:
                        year = dir_names[0].split('-')[0]
                        month = dir_names[1].split('_')[1]
                        if month in month_list:
                            print(f'Processing {dir_names}')

                            # encoding utf-8 results in UnicodeDecodeError, see:
                            # https://stackoverflow.com/questions/5552555/unicodedecodeerror-invalid-continuation-byte
                            month_df = pd.read_csv(zip_file.open(text_file.filename), encoding='latin-1')
                            # Per month there can be one or two CSV files
                            fs[month].append(month_df)
    return fs

# Creates zipfile.Zipfile from csv.zip saved on local disk
# Tries to get csv from ../data/BUCKET_NAME/FILE_NAME.zip
def load_zip_csv_offline(bucket_name: str, file_name: str) -> dict:
    path_to_file = f'../data/{bucket_name}/{file_name}.zip'
    
    return process_zip_file(path_to_file)


# Creates zipfile.Zipfile from public access s3 URL
def load_zip_csv_url(bucket_name, file_name):
    response = requests.get(f'https://s3.amazonaws.com/{bucket_name}/{file_name}.zip')
    if response.status_code == 200:
        print("Successfully downloaded the zip file.")
        # Load the zip file into memory
        buffer = io.BytesIO(response.content)
        
        return process_zip_file(buffer)
    else:
        print(f'Get Error {}'response.status_code)
        
        # Open the zip file
        #with zipfile.ZipFile(buffer) as z:
            # List files in the archive
        #    csv_files = [f for f in z.namelist() if f.endswith('.csv')]
        #    print(csv_files)
        #    print(f"Found {len(csv_files)} CSV files in the zip file.")

def load_zip_csv(bucket_name, file_name):
    try:
        print('Trying to load file from disk.')
        load_zip_csv_offline(bucket_name, file_name)
        print('Loaded file from disk.')

    except FileNotFoundError:
        print('Could not load from disk, loading from URL.')
        load_zip_csv_url(bucket_name, file_name)
        print('Loaded file from URL.')
    except:
        raise FileNotFoundError("Could not load file from disk or URL.")


In [9]:
data = load_zip_csv_offline('tripdata', '2013-citibike-tripdata')

Processing ['2013-citibike-tripdata', '12_December', '201312-citibike-tripdata_1.csv']
Processing ['2013-citibike-tripdata', '11_November', '201311-citibike-tripdata_1.csv']
Processing ['2013-citibike-tripdata', '7_July', '201307-citibike-tripdata_1.csv']
Processing ['2013-citibike-tripdata', '10_October', '201310-citibike-tripdata_2.csv']
Processing ['2013-citibike-tripdata', '10_October', '201310-citibike-tripdata_1.csv']
Processing ['2013-citibike-tripdata', '9_September', '201309-citibike-tripdata_2.csv']
Processing ['2013-citibike-tripdata', '9_September', '201309-citibike-tripdata_1.csv']
Processing ['2013-citibike-tripdata', '8_August', '201308-citibike-tripdata_1.csv']
Processing ['2013-citibike-tripdata', '8_August', '201308-citibike-tripdata_2.csv']
Processing ['2013-citibike-tripdata', '6_June', '201306-citibike-tripdata_1.csv']


In [None]:
data_from_s3 = load_zip_csv_url('tripdata', '2013-citibike-tripdata')

In [10]:
print(data['October'][1].head())

   tripduration            starttime             stoptime  start station id  \
0           326  2013-10-01 00:01:08  2013-10-01 00:06:34               239   
1           729  2013-10-01 00:01:21  2013-10-01 00:13:30               322   
2           520  2013-10-01 00:01:24  2013-10-01 00:10:04               174   
3           281  2013-10-01 00:01:25  2013-10-01 00:06:06               430   
4           196  2013-10-01 00:01:27  2013-10-01 00:04:43               403   

         start station name  start station latitude  start station longitude  \
0  Willoughby St & Fleet St               40.691966               -73.981302   
1   Clinton St & Tillary St               40.696192               -73.991218   
2           E 25 St & 1 Ave               40.738177               -73.977387   
3          York St & Jay St               40.701485               -73.986569   
4            E 2 St & 2 Ave               40.725029               -73.990697   

   end station id             end station na

In [44]:
#citibike_data_path = './data/citibike/'
#path_to_file = '2013-citibike-tripdata.zip'
#"C:\Users\Joris\Projects\coding-challenge-AXA\data\tripdata\2013-citibike-tripdata.zip"
load_zip_csv_offline('tripdata', '2013-citibike-tripdata.zip')
load_zip_csv_url('tripdata', '2013-citibike-tripdata.zip')

2013-citibike-tripdata/201309-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201309-citibike-tripdata.csv
2013-citibike-tripdata/201311-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201311-citibike-tripdata.csv
2013-citibike-tripdata/201307-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201307-citibike-tripdata.csv
2013-citibike-tripdata/201308-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201308-citibike-tripdata.csv
2013-citibike-tripdata/201306-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201306-citibike-tripdata.csv
2013-citibike-tripdata/201310-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201310-citibike-tripdata.csv
2013-citibike-tripdata/201312-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201312-citibike-tripdata.csv
2013-citibike-tripdata/12_December/201312-citibike-tripdata_1.csv
2013-citibike-tripdata/11_November/201311-citibike-tripdata_1.csv
2013-citibike-tripdata/7_July/201307-citibike-tripdata_1.csv
20

In [47]:
for i in range(0, 12):
    load_zip_csv_offline('tripdata', f'{2013 + i}-citibike-tripdata.zip')

2013-citibike-tripdata/201309-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201309-citibike-tripdata.csv
2013-citibike-tripdata/201311-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201311-citibike-tripdata.csv
2013-citibike-tripdata/201307-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201307-citibike-tripdata.csv
2013-citibike-tripdata/201308-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201308-citibike-tripdata.csv
2013-citibike-tripdata/201306-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201306-citibike-tripdata.csv
2013-citibike-tripdata/201310-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201310-citibike-tripdata.csv
2013-citibike-tripdata/201312-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201312-citibike-tripdata.csv
2013-citibike-tripdata/12_December/201312-citibike-tripdata_1.csv
2013-citibike-tripdata/11_November/201311-citibike-tripdata_1.csv
2013-citibike-tripdata/7_July/201307-citibike-tripdata_1.csv
20

FileNotFoundError: [Errno 2] No such file or directory: '../data/tripdata/2024-citibike-tripdata.zip'