In [241]:
# Load the trip records from the local parquet file straight into a pandas DataFrame.
# Data source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

import pyarrow.parquet as pq
trips = pq.read_table("2011_new.parquet").to_pandas()

In [242]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
from collections import Counter
import time
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [243]:
# Imports used for the train/validation/test split that is performed in a later cell
# (60:20:20 split for train : validation : test). This cell itself only previews the data.

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report

# Preview the first rows to check which columns are available in the dataset
trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_date,pickup_time,dropoff_date,dropoff_time
0,142,116,2011-07-01,00:00:00,2011-07-01,00:12:00
1,246,41,2011-07-01,00:00:00,2011-07-01,00:16:00
2,148,243,2011-07-01,00:00:00,2011-07-01,00:50:00
3,107,170,2011-07-01,00:00:00,2011-07-01,00:05:00
4,162,87,2011-07-01,00:00:00,2011-07-01,00:21:00


In [244]:
# X = trips[['PULocationID', 'DOLocationID', 'pickup_date', 'dropoff_date']].values

In [245]:
# Check which unique pickup dates are available in the dataset.
# Expected result: the dataset covers 2011-07-01 through 2011-07-07.
# Note: despite its name, `unique_dates` holds the full pickup_date column;
# the distinct values are only computed by the .unique() call below.
unique_dates = trips['pickup_date']
unique_dates.unique()

array([datetime.date(2011, 7, 1), datetime.date(2011, 7, 2),
       datetime.date(2011, 7, 3), datetime.date(2011, 7, 4),
       datetime.date(2011, 7, 5), datetime.date(2011, 7, 6),
       datetime.date(2011, 7, 7)], dtype=object)

In [246]:
# Work on a copy so the original `trips` dataframe is never modified.
trips_copy = trips.copy()

# The date that is re-coded as day number 1 (2011-07-01 -> 1, 2011-07-02 -> 2, ...).
FIRST_DAY = pd.Timestamp('2011-07-01')


def date_to_day_number(dates, first_day=FIRST_DAY):
    """Convert calendar dates to integer day numbers counted from `first_day`.

    Parameters
    ----------
    dates : pandas.Series
        Calendar dates (datetime.date objects, timestamps or ISO date strings).
    first_day : pandas.Timestamp
        The date that maps to day number 1.

    Returns
    -------
    pandas.Series
        Day numbers as integers: `first_day` -> 1, the next day -> 2, and so on.
        Dates outside the first week are still converted to a number
        (e.g. 2011-07-08 -> 8) instead of being left behind as a date string.
    """
    # Subtracting the reference date yields timedeltas; .dt.days is the whole-day offset.
    return (pd.to_datetime(dates) - first_day).dt.days + 1


# Re-code both date columns as real integers (not the strings '1'..'7'), so the
# feature matrix built from this dataframe is purely numeric.
trips_copy['pickup_date'] = date_to_day_number(trips_copy['pickup_date'])
trips_copy['dropoff_date'] = date_to_day_number(trips_copy['dropoff_date'])


In [247]:
# Sanity check of the date re-coding: every row keeps its PULocationID and DOLocationID,
# only the representation of the pickup/dropoff date changes, so the number of
# pickups per day must be identical before and after the re-coding.

# Original data set
pickup_counts_original = trips['pickup_date'].value_counts()
print('Number of pick up counts on each day in original data set:')
print(pickup_counts_original)

# Modified data set
pickup_counts_modified_copy = trips_copy['pickup_date'].value_counts()
print('Number of pick up counts on each day in modified data set:')
print(pickup_counts_modified_copy)

# Enforce the check instead of comparing the two printouts by eye.
assert sorted(pickup_counts_original) == sorted(pickup_counts_modified_copy), \
    'Re-coding the dates changed the number of pickups per day'

Number of pick up counts on each day in original data set pickup_date
2011-07-07    427809
2011-07-06    402064
2011-07-01    393675
2011-07-05    350593
2011-07-02    309025
2011-07-03    267093
2011-07-04    245278
Name: count, dtype: int64: 
Number of pick up counts on each day in original data set pickup_date
7    427809
6    402064
1    393675
5    350593
2    309025
3    267093
4    245278
Name: count, dtype: int64: 


In [253]:
# Build the feature matrix X for the NN model from the re-coded dataframe (numbers instead of dates).
# pickup_time and dropoff_time are not used yet in this example; if we want to use them, include them here.
# NOTE(review): 'pickup_date' is a feature here and is also used as the target y in a later cell,
# so the model can read the label directly from its input -- confirm that this is intended.
X = trips_copy[['PULocationID', 'DOLocationID', 'pickup_date', 'dropoff_date']].values

In [254]:
# I do not think we have to convert anything here, because we only have numeric values: the LabelEncoder does not work
# together with the Scaler, so we already converted the dates to numbers in an earlier cell. So I do not think we should use this part of the original code.
# However, I am in doubt, because this part of the original code introduces 'y', which is used in a later part of the code.

# y = trips['pickup_date']
# #label encoding is done as model accepts only numeric values
# # so strings need to be converted into labels
# LE = preprocessing.LabelEncoder()
# LE.fit(y)
# y = LE.transform(y)
# y

In [None]:
# For now y is only pickup_date; it is not yet clear whether dropoff_date is also necessary.
# This is done because the original (commented-out) code above also introduces a variable named y.
# trips_copy is used because in that dataframe the pickup date has already been changed from a real date to the single number of the date.

y = trips_copy['pickup_date']

In [None]:
# Train / validation / test split and feature scaling, taken over unchanged in behaviour
# from the example code; y is used here.

# The original data set consists of features X and labels y.
# Step 1: hold out 20% of all rows as the test set (80% remains for training + validation).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Step 2: take 25% of the remaining 80% (= 20% of all rows) as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=0
)

# Fit the scaler on the training features only, then apply it to all three sets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = [
    scaler.transform(features) for features in (X_train, X_val, X_test)
]

# Report the number of data points in the training, validation and test sets.
print(
    *(
        f"{caption} {len(features)}"
        for caption, features in (
            ("Datapoints in Training set:", X_train),
            ("Datapoints in validation set:", X_val),
            ("Datapoints in Test set:", X_test),
        )
    ),
    sep="\n",
)

Datapoints in Training set: 1437321
Datapoints in validation set: 479108
Datapoints in Test set: 479108


!!! I think everything is fine up to this point. I had some trouble with the one-hot encoding. In the example code it is all done in one cell, but I split it into several cells. If y is defined correctly two cells above, I think this is correct, but I am not sure about that. !!!


In [256]:
# One-hot encoding of the labels (the step just above "Part A" in the original example code):
# for every row the column of the true class is 1 and all other columns are 0.


def one_hot(labels, classes):
    """Return a one-hot matrix for `labels`, with one column per entry of `classes`.

    Parameters
    ----------
    labels : array-like
        The class label of every row (any hashable values, e.g. day numbers).
    classes : array-like
        All distinct class values; column i of the result belongs to classes[i].

    Returns
    -------
    numpy.ndarray
        Array of shape (len(labels), len(classes)) holding 1.0 in the column of the
        row's class and 0.0 everywhere else.

    Raises
    ------
    ValueError
        If `labels` contains a value that does not occur in `classes`.
    """
    # Categorical codes are the position of each label within `classes` (-1 if unknown),
    # so the labels do not have to be zero-based integers as np.eye(n)[y] would require.
    codes = pd.Categorical(labels, categories=classes).codes
    if (codes < 0).any():
        raise ValueError("labels contain values that are not listed in classes")
    return np.eye(len(classes))[codes]


# Take the classes from the full label vector so that the training, validation and
# test encodings all get the same columns in the same order.
label_classes = np.unique(y)

y_train_oh = one_hot(y_train, label_classes)
y_val_oh = one_hot(y_val, label_classes)
y_test_oh = one_hot(y_test, label_classes)

# One-hot frame of the full, unsplit data set. Its rows do NOT line up with the
# train/validation/test split above; use y_train_oh / y_val_oh / y_test_oh for the model.
trips_copy_pickup_date_encoded = pd.get_dummies(trips_copy['pickup_date'], prefix='pickup_date', dtype=int)
print(trips_copy_pickup_date_encoded)

         pickup_date_1  pickup_date_2  pickup_date_3  pickup_date_4  \
0                    1              0              0              0   
1                    1              0              0              0   
2                    1              0              0              0   
3                    1              0              0              0   
4                    1              0              0              0   
...                ...            ...            ...            ...   
2395532              0              0              0              0   
2395533              0              0              0              0   
2395534              0              0              0              0   
2395535              0              0              0              0   
2395536              0              0              0              0   

         pickup_date_5  pickup_date_6  pickup_date_7  
0                    0              0              0  
1                    0              0