#### Import necessary libraries

In [116]:
import requests
import json
import pandas as pd

#### Add the URL we want to make the request to

In [117]:
url = 'https://data.cityofnewyork.us/resource/m6nq-qud6.json'

#### Set the limit to retrieve the first record

In [118]:
limit = 1
offset = 0
params = {
    '$limit': limit,
    '$offset': offset
}

 #### Receive the status code for our request

In [119]:
response = requests.get(url, params=params)
response.status_code  

200

#### Read the data we received back from the API

In [120]:
response.json()

[{'vendorid': '1',
  'tpep_pickup_datetime': '2021-01-01T00:30:10.000',
  'tpep_dropoff_datetime': '2021-01-01T00:36:12.000',
  'passenger_count': '1',
  'trip_distance': '2.10',
  'ratecodeid': '1',
  'store_and_fwd_flag': 'N',
  'pulocationid': '142',
  'dolocationid': '43',
  'payment_type': '2',
  'fare_amount': '8',
  'extra': '3',
  'mta_tax': '0.5',
  'tip_amount': '0',
  'tolls_amount': '0',
  'improvement_surcharge': '0.3',
  'total_amount': '11.8',
  'congestion_surcharge': '2.5'}]

#### Set query parameters for pagination and field selection. Фt the end, print it out the DataFrame

In [121]:
limit = 100000  # Number of records per page. Adjust the limit as needed
offset = 0
data = []

while True:    
    params = {
        '$limit': limit,
        '$offset': offset,
        '$select': 'tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,payment_type,total_amount',
        '$where': 'total_amount >= 50'
    }    
   
    response = requests.get(url, params=params)    
    
    if response.status_code == 200:
        
        # Load the data as a JSON object
        batch_data = response.json()

        # If batch_data is empty, break the loop
        if not batch_data:
            print('All data retrieved')
            break

        # Extend the data list with the batch_data
        data.extend(batch_data)

        # Update the offset for the next batch
        offset += limit

    else:
        print(f"Request failed with status code {response.status_code}")
        break

# Create a pandas DataFrame from the JSON data
df = pd.DataFrame(data)

# Save the DataFrame as a CSV file
df.to_csv('ny_taxi.csv', index=False)

print("CSV file created successfully")

df

All data retrieved
CSV file created successfully


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,payment_type,total_amount
0,2021-01-01T01:48:35.000,2021-01-01T02:26:28.000,1,0.00,1,50
1,2021-01-01T19:26:18.000,2021-01-01T19:50:57.000,4,15.80,1,50
2,2021-01-01T21:18:36.000,2021-01-01T21:42:55.000,2,16.09,1,50
3,2021-01-02T11:25:11.000,2021-01-02T11:43:54.000,1,11.16,1,50
4,2021-01-02T15:52:17.000,2021-01-02T16:19:35.000,1,10.50,1,50
...,...,...,...,...,...,...
1673894,2021-02-04T17:25:03.000,2021-02-04T17:47:01.000,1,2.62,2,6969.3
1673895,2021-01-04T16:04:51.000,2021-01-04T16:15:01.000,1,2.05,1,7661.28
1673896,2021-04-10T13:14:49.000,2021-04-10T13:50:53.000,1,5.70,3,395854.74
1673897,2021-03-18T12:10:41.000,2021-03-18T12:20:03.000,1,0.00,3,398469.2


In [122]:
# Save the batch_data to a file
with open('batch_data.json', 'w') as outfile:
    json.dump(batch_data, outfile)

print('Batch data saved to file.')

Batch data saved to file.


#### Count the number of rows

In [123]:
len(df)

1673899

#### Convert the timestamp string to a datetime object

In [124]:
# Convert the timestamp string to a datetime object
df['pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

#### Extract the date and time components and create new columns

In [125]:
df['pickup_date'] = df['pickup_datetime'].dt.date
df['pickup_time'] = df['pickup_datetime'].dt.time

df['dropoff_date'] = df['dropoff_datetime'].dt.date
df['dropoff_time'] = df['dropoff_datetime'].dt.time

df

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,payment_type,total_amount,pickup_datetime,dropoff_datetime,pickup_date,pickup_time,dropoff_date,dropoff_time
0,2021-01-01T01:48:35.000,2021-01-01T02:26:28.000,1,0.00,1,50,2021-01-01 01:48:35,2021-01-01 02:26:28,2021-01-01,01:48:35,2021-01-01,02:26:28
1,2021-01-01T19:26:18.000,2021-01-01T19:50:57.000,4,15.80,1,50,2021-01-01 19:26:18,2021-01-01 19:50:57,2021-01-01,19:26:18,2021-01-01,19:50:57
2,2021-01-01T21:18:36.000,2021-01-01T21:42:55.000,2,16.09,1,50,2021-01-01 21:18:36,2021-01-01 21:42:55,2021-01-01,21:18:36,2021-01-01,21:42:55
3,2021-01-02T11:25:11.000,2021-01-02T11:43:54.000,1,11.16,1,50,2021-01-02 11:25:11,2021-01-02 11:43:54,2021-01-02,11:25:11,2021-01-02,11:43:54
4,2021-01-02T15:52:17.000,2021-01-02T16:19:35.000,1,10.50,1,50,2021-01-02 15:52:17,2021-01-02 16:19:35,2021-01-02,15:52:17,2021-01-02,16:19:35
...,...,...,...,...,...,...,...,...,...,...,...,...
1673894,2021-02-04T17:25:03.000,2021-02-04T17:47:01.000,1,2.62,2,6969.3,2021-02-04 17:25:03,2021-02-04 17:47:01,2021-02-04,17:25:03,2021-02-04,17:47:01
1673895,2021-01-04T16:04:51.000,2021-01-04T16:15:01.000,1,2.05,1,7661.28,2021-01-04 16:04:51,2021-01-04 16:15:01,2021-01-04,16:04:51,2021-01-04,16:15:01
1673896,2021-04-10T13:14:49.000,2021-04-10T13:50:53.000,1,5.70,3,395854.74,2021-04-10 13:14:49,2021-04-10 13:50:53,2021-04-10,13:14:49,2021-04-10,13:50:53
1673897,2021-03-18T12:10:41.000,2021-03-18T12:20:03.000,1,0.00,3,398469.2,2021-03-18 12:10:41,2021-03-18 12:20:03,2021-03-18,12:10:41,2021-03-18,12:20:03


#### Delete unnecessary columns

In [126]:
df = df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'pickup_datetime', 'dropoff_datetime'], axis=1)
df

Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount,pickup_date,pickup_time,dropoff_date,dropoff_time
0,1,0.00,1,50,2021-01-01,01:48:35,2021-01-01,02:26:28
1,4,15.80,1,50,2021-01-01,19:26:18,2021-01-01,19:50:57
2,2,16.09,1,50,2021-01-01,21:18:36,2021-01-01,21:42:55
3,1,11.16,1,50,2021-01-02,11:25:11,2021-01-02,11:43:54
4,1,10.50,1,50,2021-01-02,15:52:17,2021-01-02,16:19:35
...,...,...,...,...,...,...,...,...
1673894,1,2.62,2,6969.3,2021-02-04,17:25:03,2021-02-04,17:47:01
1673895,1,2.05,1,7661.28,2021-01-04,16:04:51,2021-01-04,16:15:01
1673896,1,5.70,3,395854.74,2021-04-10,13:14:49,2021-04-10,13:50:53
1673897,1,0.00,3,398469.2,2021-03-18,12:10:41,2021-03-18,12:20:03


#### Find and count rows with null values

In [127]:
num_rows_with_null = df.isnull().any(axis=1).sum()

print(f'Rows with null values: {num_rows_with_null}')

Rows with null values: 221318


#### Delete rows with null values

In [128]:
df.dropna(inplace=True)
num_rows_with_null = df.isnull().any(axis=1).sum()

print(f'Rows with null values: {num_rows_with_null}')

Rows with null values: 0


#### Create 'trip_id' column with unique key

In [129]:
import string
import secrets

def generate_trip_id():
    alphabet = string.ascii_letters + string.digits
    return ''.join(secrets.choice(alphabet) for i in range(6))

# Add a new column with unique random keys for each row
df['trip_id'] = [generate_trip_id() for _ in range(len(df))]
df

Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount,pickup_date,pickup_time,dropoff_date,dropoff_time,trip_id
0,1,0.00,1,50,2021-01-01,01:48:35,2021-01-01,02:26:28,hDUNIP
1,4,15.80,1,50,2021-01-01,19:26:18,2021-01-01,19:50:57,uNA2yV
2,2,16.09,1,50,2021-01-01,21:18:36,2021-01-01,21:42:55,8VarYE
3,1,11.16,1,50,2021-01-02,11:25:11,2021-01-02,11:43:54,aicc8P
4,1,10.50,1,50,2021-01-02,15:52:17,2021-01-02,16:19:35,5gYznc
...,...,...,...,...,...,...,...,...,...
1673894,1,2.62,2,6969.3,2021-02-04,17:25:03,2021-02-04,17:47:01,fK0m3W
1673895,1,2.05,1,7661.28,2021-01-04,16:04:51,2021-01-04,16:15:01,2hVpBB
1673896,1,5.70,3,395854.74,2021-04-10,13:14:49,2021-04-10,13:50:53,AMxo6X
1673897,1,0.00,3,398469.2,2021-03-18,12:10:41,2021-03-18,12:20:03,yrkJTN


#### Rearrange columns

In [131]:
df = df[['trip_id', 'pickup_date', 'pickup_time', 'dropoff_date', 'dropoff_time', 'passenger_count', 'trip_distance', 'payment_type', 'total_amount']]
df

Unnamed: 0,trip_id,pickup_date,pickup_time,dropoff_date,dropoff_time,passenger_count,trip_distance,payment_type,total_amount
0,hDUNIP,2021-01-01,01:48:35,2021-01-01,02:26:28,1,0.00,1,50
1,uNA2yV,2021-01-01,19:26:18,2021-01-01,19:50:57,4,15.80,1,50
2,8VarYE,2021-01-01,21:18:36,2021-01-01,21:42:55,2,16.09,1,50
3,aicc8P,2021-01-02,11:25:11,2021-01-02,11:43:54,1,11.16,1,50
4,5gYznc,2021-01-02,15:52:17,2021-01-02,16:19:35,1,10.50,1,50
...,...,...,...,...,...,...,...,...,...
1673894,fK0m3W,2021-02-04,17:25:03,2021-02-04,17:47:01,1,2.62,2,6969.3
1673895,2hVpBB,2021-01-04,16:04:51,2021-01-04,16:15:01,1,2.05,1,7661.28
1673896,AMxo6X,2021-04-10,13:14:49,2021-04-10,13:50:53,1,5.70,3,395854.74
1673897,yrkJTN,2021-03-18,12:10:41,2021-03-18,12:20:03,1,0.00,3,398469.2


#### Save clean data to the final CSV file

In [136]:
df.to_csv('ny_taxi_clean_data.csv', index=False)
print('ny_taxi_clean_data.csv is saved')

ny_taxi_clean_data.csv is saved
