In [1]:
# Create a Python Script to Loop through all of the CSVs, add Month, Year and combine into one CSV for Tableau Upload
# Import Dependancies
import os
import pandas as pd
import glob
import re

In [2]:
# Path where your CSV files are located
folder_path = '2022-citibike-tripdata'
folder_path

'2022-citibike-tripdata'

In [3]:
# Pattern to extract month and year from the filename.
filename_pattern = r'^(\d{6})-citibike-tripdata_\d+\.csv$'

In [4]:
# List to hold all DataFrames
all_dataframes = []

In [5]:
# Debug: Ensure the folder path is correct
print(f"Folder path being used: {folder_path}")

# Fetch all CSV files from the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Check if any files were found
if not csv_files:
    print(f"No CSV files found in {folder_path}.")
else:
    print(f"Found {len(csv_files)} CSV files.")

# List to hold all DataFrames
all_dataframes = []

for csv_file in csv_files:
    # Extract filename
    filename = os.path.basename(csv_file)
    print(f"Processing file: {filename}")

    # Use regex to extract the month and year from the filename
    match = re.match(filename_pattern, filename)
    
    if match:
        year_month = match.group(1)  # e.g., "202201"
        year = year_month[:4]  # First 4 digits (2022)
        month = year_month[4:]  # Last 2 digits (01)
        
        print(f"Extracted Year: {year}, Month: {month}")
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_file)

        # Debug: Check the DataFrame's shape
        print(f"DataFrame shape after loading: {df.shape}")
        
        if df.empty:
            print(f"Warning: {filename} is empty, skipping.")
        else:
            # Add new columns for Month and Year
            df['Month'] = month
            df['Year'] = year
            all_dataframes.append(df)
            print(f"Added DataFrame for {filename}, shape: {df.shape}")
    else:
        print(f"Filename doesn't match regex: {filename}")

# Check the number of DataFrames to concatenate
print(f"Number of DataFrames to concatenate: {len(all_dataframes)}")

# Concatenate all DataFrames into one
if len(all_dataframes) > 0:
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    print("DataFrames successfully concatenated.")
else:
    print("No DataFrames to concatenate.")


Folder path being used: 2022-citibike-tripdata
Found 36 CSV files.
Processing file: 202201-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 01


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202201-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202201-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 01
DataFrame shape after loading: (24555, 13)
Added DataFrame for 202201-citibike-tripdata_2.csv, shape: (24555, 15)
Processing file: 202202-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 02


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202202-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202202-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 02


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (197312, 13)
Added DataFrame for 202202-citibike-tripdata_2.csv, shape: (197312, 15)
Processing file: 202203-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 03


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202203-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202203-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 03


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (845965, 13)
Added DataFrame for 202203-citibike-tripdata_2.csv, shape: (845965, 15)
Processing file: 202204-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 04


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202204-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202204-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 04


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202204-citibike-tripdata_2.csv, shape: (1000000, 15)
Processing file: 202204-citibike-tripdata_3.csv
Extracted Year: 2022, Month: 04


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (260790, 13)
Added DataFrame for 202204-citibike-tripdata_3.csv, shape: (260790, 15)
Processing file: 202205-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 05


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202205-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202205-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 05


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202205-citibike-tripdata_2.csv, shape: (1000000, 15)
Processing file: 202205-citibike-tripdata_3.csv
Extracted Year: 2022, Month: 05


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (865425, 13)
Added DataFrame for 202205-citibike-tripdata_3.csv, shape: (865425, 15)
Processing file: 202206-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 06


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202206-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202206-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 06


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202206-citibike-tripdata_2.csv, shape: (1000000, 15)
Processing file: 202206-citibike-tripdata_3.csv
Extracted Year: 2022, Month: 06


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202206-citibike-tripdata_3.csv, shape: (1000000, 15)
Processing file: 202206-citibike-tripdata_4.csv
Extracted Year: 2022, Month: 06


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (343914, 13)
Added DataFrame for 202206-citibike-tripdata_4.csv, shape: (343914, 15)
Processing file: 202207-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 07


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202207-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202207-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 07


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202207-citibike-tripdata_2.csv, shape: (1000000, 15)
Processing file: 202207-citibike-tripdata_3.csv
Extracted Year: 2022, Month: 07


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202207-citibike-tripdata_3.csv, shape: (1000000, 15)
Processing file: 202207-citibike-tripdata_4.csv
Extracted Year: 2022, Month: 07


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (397932, 13)
Added DataFrame for 202207-citibike-tripdata_4.csv, shape: (397932, 15)
Processing file: 202208-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 08


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202208-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202208-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 08


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202208-citibike-tripdata_2.csv, shape: (1000000, 15)
Processing file: 202208-citibike-tripdata_3.csv
Extracted Year: 2022, Month: 08


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202208-citibike-tripdata_3.csv, shape: (1000000, 15)
Processing file: 202208-citibike-tripdata_4.csv
Extracted Year: 2022, Month: 08


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (576020, 13)
Added DataFrame for 202208-citibike-tripdata_4.csv, shape: (576020, 15)
Processing file: 202209-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 09


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202209-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202209-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 09


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202209-citibike-tripdata_2.csv, shape: (1000000, 15)
Processing file: 202209-citibike-tripdata_3.csv
Extracted Year: 2022, Month: 09


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202209-citibike-tripdata_3.csv, shape: (1000000, 15)
Processing file: 202209-citibike-tripdata_4.csv
Extracted Year: 2022, Month: 09


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (411866, 13)
Added DataFrame for 202209-citibike-tripdata_4.csv, shape: (411866, 15)
Processing file: 202210-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 10


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202210-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202210-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 10


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202210-citibike-tripdata_2.csv, shape: (1000000, 15)
Processing file: 202210-citibike-tripdata_3.csv
Extracted Year: 2022, Month: 10


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (936584, 13)
Added DataFrame for 202210-citibike-tripdata_3.csv, shape: (936584, 15)
Processing file: 202211-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 11


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202211-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202211-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 11


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202211-citibike-tripdata_2.csv, shape: (1000000, 15)
Processing file: 202211-citibike-tripdata_3.csv
Extracted Year: 2022, Month: 11


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (386394, 13)
Added DataFrame for 202211-citibike-tripdata_3.csv, shape: (386394, 15)
Processing file: 202212-citibike-tripdata_1.csv
Extracted Year: 2022, Month: 12


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (1000000, 13)
Added DataFrame for 202212-citibike-tripdata_1.csv, shape: (1000000, 15)
Processing file: 202212-citibike-tripdata_2.csv
Extracted Year: 2022, Month: 12


  df = pd.read_csv(csv_file)


DataFrame shape after loading: (592049, 13)
Added DataFrame for 202212-citibike-tripdata_2.csv, shape: (592049, 15)
Number of DataFrames to concatenate: 36
DataFrames successfully concatenated.


In [6]:
# Concatenate all DataFrames into one
combined_df = pd.concat(all_dataframes, ignore_index=True)

In [7]:
#view combined df
combined_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,Month,Year
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member,1,2022
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member,1,2022
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831,member,1,2022
3,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783964,-73.947167,40.745168,-73.986831,member,1,2022
4,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,6364.1,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745168,-73.986831,member,1,2022


In [8]:
# Save the combined DataFrame to a new CSV
combined_df.to_csv('data/2022_combined_file.csv', index=False)

print("CSV files have been successfully combined and saved.")

CSV files have been successfully combined and saved.


In [9]:
print(combined_df['Month'].dtype)

object


In [10]:
# Save a CSV File by Quarter for manigable data size
# Step 1: Convert 'Month' column to integers (removes leading zeros if any)
combined_df['Month'] = combined_df['Month'].astype(str).str.lstrip('0').astype(int)



In [11]:
# Filter months to match the desired quarter
Q1_2022 = combined_df[combined_df['Month'].isin([1, 2, 3])]

Q1_2022.head()

# Save the filtered DataFrame to a new CSV
Q1_2022.to_csv('data/Q1_2022_data.csv', index=False)

print("Q1 months data saved to 'Q1_2022_data.csv'.")

Q1 months data saved to 'Q1_2022_data.csv'.


In [12]:
# Filter months to match the desired quarter
Q2_2022 = combined_df[combined_df['Month'].isin([4, 5, 6])]

Q2_2022.head()

# Save the filtered DataFrame to a new CSV
Q2_2022.to_csv('data/Q2_2022_data.csv', index=False)

print("Q2 months data saved to 'Q2_2022_data.csv'.")

Q2 months data saved to 'Q2_2022_data.csv'.


In [13]:
# Filter months to match the desired quarter
Q3_2022 = combined_df[combined_df['Month'].isin([7, 8, 9])]

Q3_2022.head()

# Save the filtered DataFrame to a new CSV
Q3_2022.to_csv('data/Q3_2022_data.csv', index=False)

print("Q3 months data saved to 'Q3_2022_data.csv'.")

Q3 months data saved to 'Q3_2022_data.csv'.


In [14]:
# Filter months to match the desired quarter
Q4_2022 = combined_df[combined_df['Month'].isin([10, 11, 12])]

Q4_2022.head()

# Save the filtered DataFrame to a new CSV
Q4_2022.to_csv('data/Q4_2022_data.csv', index=False)

print("Q4 months data saved to 'Q4_2022_data.csv'.")

Q4 months data saved to 'Q4_2022_data.csv'.
