## Import libraries

In [1]:
import pandas as pd 
import numpy as np
import os 
import requests
import json
from datetime import datetime

## Join Files 

In [2]:
# Create a list with all files in the folder using a list comprehension

folderpath = r"C:\Users\Windows\NYC_bike_sharing\Original Data" 
filepaths  = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]

In [3]:
# verifying the filepaths
print("First three file paths:")
for filepath in filepaths[:3]:
    print(filepath)

First three file paths:
C:\Users\Windows\NYC_bike_sharing\Original Data\202201-citibike-tripdata_1.csv
C:\Users\Windows\NYC_bike_sharing\Original Data\202201-citibike-tripdata_2.csv
C:\Users\Windows\NYC_bike_sharing\Original Data\202202-citibike-tripdata_1.csv


## Concatenate bike data for the first 6 months of 2022

In [4]:
# Function to read and clean CSV files
def read_and_clean_csv(filepath):
    try:
        # Read the CSV file
        df = pd.read_csv(filepath)
        # Replace \n characters with spaces in all string columns
        df = df.replace({r'\n': ''}, regex=True)
        return df
    except pd.errors.ParserError as e:
        print(f"Error reading {filepath}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on error

# Concatenate the first 16 files into one DataFrame
dfs = [read_and_clean_csv(f) for f in filepaths[:16]]
df = pd.concat(dfs, ignore_index=True)

df.head(3)



  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,55262E4365A955A2,classic_bike,2022-01-18 08:23:52,2022-01-18 08:28:18,Boerum Pl\t& Pacific St,4488.09,Clinton St & Joralemon St,4605.04,40.688489,-73.99116,40.692395,-73.993379,member
1,D272F1B15D841EC0,classic_bike,2022-01-21 09:03:22,2022-01-21 09:05:44,E 12 St & Ave C,5616.08,E 10 St & Avenue A,5659.05,40.727243,-73.976831,40.727408,-73.98142,member
2,D1FCEF55EB4A807F,classic_bike,2022-01-22 14:28:32,2022-01-22 14:53:18,W 21 St & 6 Ave,6140.05,W 44 St & 11 Ave,6756.05,40.74174,-73.994156,40.762009,-73.996975,member


# Explanation of the Code for Joining the File Paths and Concatenating the First 6 months of NYC bike sharing datasets

## How Joining Files Works
#### To read all the files at once in JupyterLab, list comprehension was used to make a list of files in the folder path. The logic is as follows:

#### folderpath = r"C:\Users\Windows\NYC_bike_sharing\Original Data" (This is the file path where the files of the bike data are stored for the first 6 months of 2022.

#### filepaths  = [os.path.join(folderpath, name) for name in os.listdir(folderpath)] (This code uses list comprehension to to create the list of file paths containing the paths of all the files  in the folder "C:\Users\Windows\NYC_bike_sharing\Original Data"

#### The string os.listdir(folderpath) is a method that returns a list of all files and directories in the folderpath directory. 
#### This returns the list of files (all 16 csv files of the bike data). The code "for name in os.listdir(folderpath)" loops through each name in the list returned by os.listdir(folderpath). 

#### The code "os.path.join (folderpath, name)" is used to join the folderpath and name together to create a full file path.
h.

## How Concatenating the Datasets Works
####  def read_and_clean_csv(filepath) (This function is used to clean the dataset by using a regular expression to find newline characters in all string columns and replacing them with spaces, effectively removing them. The 'try-except' block is used to handle exceptions that might occur while reading the csv file. It helps in catching and managing errors.) 
#### dfs = [read_and_clean_csv(f) for f in filepaths[:16]] (This line uses a list comprehension to apply the read_and_clean_csv function to each of the first 16 file paths for 16 csv files for the 6 months of data in 2022 in the filepaths list. It creates a list of DataFrames (dfs), each corresponding to one of the cleaned CSV files.)
#### df = pd.concat(dfs, ignore_index=True)  (This is also known as a generator. This line concatenates all the DataFrames in the dfs list into a single DataFrame named df. The ignore_index=True parameter ensures that the index of the resulting DataFrame is reset, creating a new continuous index for the combined DataFrame.)

In [5]:
# Exporting bike data as csv
path = r'C:\Users\Windows\NYC_bike_sharing\Project Files'

In [6]:
df.to_csv(os.path.join(path, 'bike_data_2022_half.csv'))

## Getting Weather Data from NOAA's API

In [7]:
# Define token -

Token = 'TKmtmOYRCjDxWomRfIwtehlfnhRoeWJk'

In [8]:
# Get the API 

r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00014732&startdate=2022-01-01&enddate=2022-06-30', headers={'token':Token})

In [9]:
# Load the api response as a json

d = json.loads(r.text)

In [10]:
# Check if there is any data
if d:
    print("Data is present.")
else:
    print("Data is empty or None.")

Data is present.


In [11]:
# Secure all items in the response that correspond to TAVG

avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']

In [12]:
# Get only the date field from all average temperature readings

dates_temp = [item['date'] for item in avg_temps]

In [13]:
# Get the temperature from all average temperature readings

temps = [item['value'] for item in avg_temps]

In [14]:
temps[:3]

[116, 114, 14]

In [15]:
# Put the results in a dataframe

df_temps = pd.DataFrame()

In [16]:
# Get only date and cast it to date time; convert temperature from tenths of Celsius to normal Celsius

df_temps['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in dates_temp]
df_temps['average_temp'] = [float(v)/10.0 for v in temps]

In [17]:
df_temps.head(3)

Unnamed: 0,date,average_temp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4


In [18]:
# export temperature data as csv
df_temps.to_csv(os.path.join(path, 'laguardia_temps_2022_first_half.csv'))

In [19]:
# checking data types in the temperate dataset
df_temps.dtypes

date            datetime64[ns]
average_temp           float64
dtype: object

## Cleaning the Bike Sharing Datasets 

In [20]:
# checking data types in the bike dataset
df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [22]:
# convert the variable "started_at" to the datetime data type. 
df['started_at'] = pd.to_datetime(df['started_at'], dayfirst = True)

  df['started_at'] = pd.to_datetime(df['started_at'], dayfirst = True)


In [23]:
# convert the variable "ended_at" to the datetime data type. 
df['ended_at'] = pd.to_datetime(df['ended_at'], dayfirst = True)

  df['ended_at'] = pd.to_datetime(df['ended_at'], dayfirst = True)


In [24]:
# re-verify data types 
df.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object

In [26]:
# Extract the date from the column "started_at" to create a new "date" column, which will be the common key between the bike data set and the weather data
df['date'] = pd.to_datetime(df['started_at'], format='%Y-%m-%d').dt.date

In [28]:
# verify that the "date'column has been added.
df.head(3)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date
0,55262E4365A955A2,classic_bike,2022-01-18 08:23:52,2022-01-18 08:28:18,Boerum Pl\t& Pacific St,4488.09,Clinton St & Joralemon St,4605.04,40.688489,-73.99116,40.692395,-73.993379,member,2022-01-18
1,D272F1B15D841EC0,classic_bike,2022-01-21 09:03:22,2022-01-21 09:05:44,E 12 St & Ave C,5616.08,E 10 St & Avenue A,5659.05,40.727243,-73.976831,40.727408,-73.98142,member,2022-01-21
2,D1FCEF55EB4A807F,classic_bike,2022-01-22 14:28:32,2022-01-22 14:53:18,W 21 St & 6 Ave,6140.05,W 44 St & 11 Ave,6756.05,40.74174,-73.994156,40.762009,-73.996975,member,2022-01-22


In [31]:
# Ensure that the data type of "date" column matches that of the "date" column in df_temps
df['date'] = pd.to_datetime(df['date'])

In [32]:
df.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
date                  datetime64[ns]
dtype: object

## Merging the Bike Sharing Dataset with the Temperature Dataset 

In [33]:
# Merging the bike dataset with weather data (df_temps)
%%time
df_merged = df.merge(df_temps, how = 'left', on = 'date', indicator = True) 

CPU times: total: 48.2 s
Wall time: 1min 26s


In [34]:
# checking to see if there is 100% match between the datasets
df_merged['_merge'].value_counts(dropna = False)

_merge
both          12872672
left_only            0
right_only           0
Name: count, dtype: int64

In [36]:
# check the merged dataset
df_merged.head(3)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,average_temp,_merge
0,55262E4365A955A2,classic_bike,2022-01-18 08:23:52,2022-01-18 08:28:18,Boerum Pl\t& Pacific St,4488.09,Clinton St & Joralemon St,4605.04,40.688489,-73.99116,40.692395,-73.993379,member,2022-01-18,1.8,both
1,D272F1B15D841EC0,classic_bike,2022-01-21 09:03:22,2022-01-21 09:05:44,E 12 St & Ave C,5616.08,E 10 St & Avenue A,5659.05,40.727243,-73.976831,40.727408,-73.98142,member,2022-01-21,-6.0,both
2,D1FCEF55EB4A807F,classic_bike,2022-01-22 14:28:32,2022-01-22 14:53:18,W 21 St & 6 Ave,6140.05,W 44 St & 11 Ave,6756.05,40.74174,-73.994156,40.762009,-73.996975,member,2022-01-22,-5.9,both


In [37]:
# Export df_merged as a csv 
df_merged.to_csv(os.path.join(path, 'merged_bike_temps_2022_half.csv'))