In [31]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nyc-trips-jan-feb-2023/yellow_tripdata_2023-01.parquet
/kaggle/input/nyc-trips-jan-feb-2023/yellow_tripdata_2023-02.parquet


The goal of this homework is to train a simple model for predicting the duration of a ride.

We'll use the NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records".

We 'll download the data for January and February 2023.

In [32]:
import pyarrow.parquet as pq
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_extraction import DictVectorizer

In [33]:
trips_jan = pq.read_table('/kaggle/input/nyc-trips-jan-feb-2023/yellow_tripdata_2023-01.parquet')
trips_jan = trips_jan.to_pandas()
trips_jan.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [34]:
trips_jan.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

# # Q1. Downloading the data

In [35]:
len(trips_jan.columns)

19

# # Q2. Computing duration

In [36]:
# Compute duration of each ride in minutes
trips_jan['duration'] = (trips_jan['tpep_dropoff_datetime'] - trips_jan['tpep_pickup_datetime']).dt.total_seconds() / 60

# Calculate the standard deviation of trip durations
std_dev_duration = trips_jan['duration'].std()

print("Standard deviation of trip durations:", round(std_dev_duration,2), "minutes")

Standard deviation of trip durations: 42.59 minutes


# # Q3. Dropping outliers

In [37]:
# Filter out records with duration between 1 and 60 minutes
filtered_trips = trips_jan[(trips_jan['duration'] >= 1) & (trips_jan['duration'] <= 60)]

# Calculate the fraction of records remaining
fraction_remaining = len(filtered_trips) / len(trips_jan)

print("Fraction of records remaining after dropping outliers:", round(fraction_remaining*100,0), "%")

Fraction of records remaining after dropping outliers: 98.0 %


# # Q4. One-hot encoding

In [None]:
categorical = ['PULocationID', 'DOLocationID']

# Convert columns to strings
filtered_trips.loc[:, categorical] = filtered_trips[categorical].astype(str)

# Convert dataframe to a list of dictionaries
data_dicts = filtered_trips[['PULocationID', 'DOLocationID']].to_dict(orient='records')

In [39]:
data_dicts[:5]

[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'}]

In [40]:
# Fit a dictionary vectorizer
vectorizer = DictVectorizer(sparse=False)
vectorizer.fit(data_dicts)

# Get feature matrix
feature_matrix = vectorizer.transform(data_dicts)

# Dimensionality of the feature matrix (number of columns)
num_columns = feature_matrix.shape[1]

print("Dimensionality of the feature matrix (number of columns):", num_columns)

Dimensionality of the feature matrix (number of columns): 515


# # Q5. Training a model

In [41]:
# Assign 'duration' as target variable
target = filtered_trips['duration']

# Initialize a linear regression model
model = LinearRegression()

In [42]:
# Train the model on the feature matrix and target variable
model.fit(feature_matrix, target)

In [43]:
# Predict on the training data
predictions_train = model.predict(feature_matrix)

In [44]:
# Calculate RMSE on the training data
rmse_train = root_mean_squared_error(target, predictions_train)

print("RMSE on train:", round(rmse_train,2))

RMSE on train: 7.65


# # Q6. Evaluating the model

In [46]:
# Load the validation dataset
df_val = pd.read_parquet('/kaggle/input/nyc-trips-jan-feb-2023/yellow_tripdata_2023-02.parquet')

In [None]:
# Compute duration of each ride in minutes
df_val['duration'] = (df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']).dt.total_seconds() / 60

# Filter out records with duration between 1 and 60 minutes
filtered_val = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)]

# Convert columns to object type
filtered_val.loc[:, 'PULocationID'] = filtered_val['PULocationID'].astype(object)
filtered_val.loc[:, 'DOLocationID'] = filtered_val['DOLocationID'].astype(object)

# Then convert to strings
filtered_val.loc[:, 'PULocationID'] = filtered_val['PULocationID'].astype(str)
filtered_val.loc[:, 'DOLocationID'] = filtered_val['DOLocationID'].astype(str)

# Convert dataframe to a list of dictionaries
data_dicts_val = filtered_val[['PULocationID', 'DOLocationID']].to_dict(orient='records')

In [48]:
# Get feature matrices for validation data
feature_matrix_val = vectorizer.transform(data_dicts_val)

In [50]:
# Assign 'duration' as target variable
target_val = filtered_val['duration']

# Train the model on the validation data
model.fit(feature_matrix_val, target_val)

# Predict on the validation data
predictions_val = model.predict(feature_matrix_val)

# Calculate RMSE on the validation data
rmse_val = root_mean_squared_error(target_val, predictions_val)
print("RMSE on validation:", round(rmse_val,2))

RMSE on validation: 7.78
