In [1]:
# import
import pandas as pd
import numpy as np

In [2]:
# load data
start = "../"
# start = "/mnt/c/Users/dalli/source/acme_senior/projectV3/"
train = pd.read_csv(start + 'data/train.csv')
test = pd.read_csv(start + 'data/test.csv')

# show the first and last times in the test set pickup_datetime
# concert the pickup_datetime column to a datetime object
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train["dropoff_datetime"] = pd.to_datetime(train["dropoff_datetime"])
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

print("Date Range:")
print(train['pickup_datetime'].min())
print(train['pickup_datetime'].max())

print("\nGrid Range:")
min_lat = min(train["pickup_latitude"].min(), train['dropoff_latitude'].min())
min_long = min(train["pickup_longitude"].min(), train['dropoff_longitude'].min())
max_lat = max(train["pickup_latitude"].max(), train['dropoff_latitude'].max())
max_long = max(train["pickup_longitude"].max(), train['dropoff_longitude'].max())
print(min_lat, min_long)
print(max_lat, max_long)

# drop extra columns
train = train.drop(['id', 'vendor_id', "store_and_fwd_flag", 'dropoff_datetime'], axis=1)
test = test.drop(['id', 'vendor_id', "store_and_fwd_flag"], axis=1)

# apply the floor("H") operation to every row in test["hour"]
# train['hour'] = train['pickup_datetime'].apply(lambda date: date.floor('H'))
train['hour'] = train['pickup_datetime'].dt.floor('H')
test['hour'] = test['pickup_datetime'].dt.floor('H')

Date Range:
2016-01-01 00:00:17
2016-06-30 23:59:39

Grid Range:
32.1811408996582 -121.93334197998048
51.88108444213867 -61.33552932739258


In [3]:
train.head()

Unnamed: 0,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,hour
0,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.96463,40.765602,455,2016-03-14 17:00:00
1,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,663,2016-06-12 00:00:00
2,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,2124,2016-01-19 11:00:00
3,2016-04-06 19:32:31,1,-74.01004,40.719971,-74.012268,40.706718,429,2016-04-06 19:00:00
4,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.78252,435,2016-03-26 13:00:00


In [4]:
# load weather data
# https://www.kaggle.com/datasets/aadimator/nyc-weather-2016-to-2022
weather = pd.read_csv(start + 'data/weather.csv')
weather = weather.dropna()

# drop all rows where the time is after 2016-07-01
weather['time'] = pd.to_datetime(weather['time'])
weather = weather[weather['time'] <= '2016-07-01']

# drop extraneous columns and show
weather = weather.drop(columns=['rain (mm)', 'cloudcover_low (%)', 'cloudcover_mid (%)', 'cloudcover_high (%)', 'windspeed_10m (km/h)', 'winddirection_10m (°)'])
print(weather['time'].min())
print(weather['time'].max())

2016-01-01 00:00:00
2016-07-01 00:00:00


In [5]:
# left join test and weather on hour == time
taxis = train.merge(weather, left_on='hour', right_on='time', how='left')
taxis = taxis.drop(columns=['time', 'hour'])
validation = test.merge(weather, left_on='hour', right_on='time', how='left')
validation = validation.drop(columns=['time', 'hour'])

In [6]:
taxis.head()

Unnamed: 0,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,temperature_2m (°C),precipitation (mm),cloudcover (%)
0,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.96463,40.765602,455,6.4,0.2,100.0
1,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,663,28.8,0.0,5.0
2,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,2124,-7.6,0.0,0.0
3,2016-04-06 19:32:31,1,-74.01004,40.719971,-74.012268,40.706718,429,7.9,0.0,2.0
4,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.78252,435,4.7,0.0,5.0


In [7]:
taxis.describe()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,temperature_2m (°C),precipitation (mm),cloudcover (%)
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923,10.2968,0.09552324,44.17891
std,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432,9.031819,0.4173544,37.70505
min,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0,-16.8,0.0,0.0
25%,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0,3.2,0.0,7.0
50%,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0,10.3,0.0,33.0
75%,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0,17.1,0.0,85.0
max,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0,30.7,7.9,100.0


In [8]:
taxis.dtypes

pickup_datetime        datetime64[ns]
passenger_count                 int64
pickup_longitude              float64
pickup_latitude               float64
dropoff_longitude             float64
dropoff_latitude              float64
trip_duration                   int64
temperature_2m (°C)           float64
precipitation (mm)            float64
cloudcover (%)                float64
dtype: object

In [9]:
# save the data as a csv
# taxis.to_csv(start + 'data/taxis.csv', index=False)
taxis['pickup_float'] = pd.to_datetime(taxis['pickup_datetime']).astype('int64') // 10**10
taxis.head()

  taxis['pickup_float'] = pd.to_datetime(taxis['pickup_datetime']).astype('int64') // 10**10


Unnamed: 0,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,temperature_2m (°C),precipitation (mm),cloudcover (%),pickup_float
0,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.96463,40.765602,455,6.4,0.2,100.0,145797629
1,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,663,28.8,0.0,5.0,146569221
2,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,2124,-7.6,0.0,0.0,145320332
3,2016-04-06 19:32:31,1,-74.01004,40.719971,-74.012268,40.706718,429,7.9,0.0,2.0,145997115
4,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.78252,435,4.7,0.0,5.0,145899905


In [10]:
taxis['pickup_float'] = (taxis['pickup_datetime'] - taxis['pickup_datetime'].min()) / (taxis['pickup_datetime'].max() - taxis['pickup_datetime'].min())
taxis.head()

Unnamed: 0,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,temperature_2m (°C),precipitation (mm),cloudcover (%),pickup_float
0,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.96463,40.765602,455,6.4,0.2,100.0,0.405086
1,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,663,28.8,0.0,5.0,0.895772
2,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,2124,-7.6,0.0,0.0,0.101554
3,2016-04-06 19:32:31,1,-74.01004,40.719971,-74.012268,40.706718,429,7.9,0.0,2.0,0.531947
4,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.78252,435,4.7,0.0,5.0,0.470127


In [11]:
# convert datetimes to ints and normalize
# taxis['pickup_float'] = pd.to_datetime(taxis['pickup_datetime']).astype('int64') // 10**9
# taxis['pickup_float'] = (taxis['pickup_datetime'] - taxis['pickup_datetime'].min()) / (taxis['pickup_datetime'].max() - taxis['pickup_datetime'].min())

# make train and label data
train_data = taxis.drop(columns=['trip_duration', 'pickup_datetime'])
train_labels = taxis['trip_duration']

taxis.head()

Unnamed: 0,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,temperature_2m (°C),precipitation (mm),cloudcover (%),pickup_float
0,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.96463,40.765602,455,6.4,0.2,100.0,0.405086
1,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,663,28.8,0.0,5.0,0.895772
2,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,2124,-7.6,0.0,0.0,0.101554
3,2016-04-06 19:32:31,1,-74.01004,40.719971,-74.012268,40.706718,429,7.9,0.0,2.0,0.531947
4,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.78252,435,4.7,0.0,5.0,0.470127


In [12]:
# make train and test splits
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# make a random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=40, 
                            max_depth=5, 
                            max_leaf_nodes=5, 
                            random_state=0, 
                            verbose=4, 
                            max_samples=0.1, 
                            n_jobs=-2)
rf.fit(train_data, train_labels)

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.


building tree 1 of 40building tree 2 of 40

building tree 3 of 40
building tree 4 of 40
building tree 5 of 40
building tree 6 of 40
building tree 7 of 40
building tree 8 of 40
building tree 9 of 40
building tree 10 of 40
building tree 11 of 40
building tree 12 of 40
building tree 13 of 40
building tree 14 of 40
building tree 15 of 40
building tree 16 of 40
building tree 17 of 40
building tree 18 of 40


[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:   30.0s


building tree 19 of 40
building tree 20 of 40
building tree 21 of 40
building tree 22 of 40
building tree 23 of 40
building tree 24 of 40
building tree 25 of 40
building tree 26 of 40
building tree 27 of 40
building tree 28 of 40
building tree 29 of 40
building tree 30 of 40
building tree 31 of 40
building tree 32 of 40
building tree 33 of 40
building tree 34 of 40
building tree 35 of 40
building tree 36 of 40
building tree 37 of 40
building tree 38 of 40
building tree 39 of 40
building tree 40 of 40


[Parallel(n_jobs=-2)]: Done  38 out of  40 | elapsed:  1.6min remaining:    4.9s
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:  1.6min finished


TypeError: predict() takes 2 positional arguments but 3 were given

In [18]:
# perform validation
# predicted_labels = rf.predict(test_data)
# print results
# get 10% of test data and test labels
small_test_data = test_data[:int(len(test_data) * 0.1)]
small_test_labels = test_labels[:int(len(test_labels) * 0.1)]
print("Accuracy:", rf.score(small_test_data, small_test_labels))

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
