In [1]:
!pip install pyarrow # read parquet files



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

In [4]:
#read in datafile

df_train = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
df_val = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

## Q1. Read the data for January. How many columns are there?

In [5]:
df_train.shape

(3066766, 19)

## Q2. What's the standard deviation of the trips duration in January?

In [6]:
# Create variable duration for total time of trip for Jan 2023 trips (df_train)

df_train['duration'] = df_train.tpep_dropoff_datetime - df_train.tpep_pickup_datetime
df_train.duration = df_train.duration.apply(lambda td: td.total_seconds() / 60)

In [7]:
# Create variable duration for total time of trip for Feb 2023 trips (df_val)

df_val['duration'] = df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
df_val.duration = df_val.duration.apply(lambda td: td.total_seconds() / 60)

In [8]:
# View distribution of Jan 2023 duration of a ride in minutes
df_train.duration.describe().round(2)

count    3066766.00
mean          15.67
std           42.59
min          -29.20
25%            7.12
50%           11.52
75%           18.30
max        10029.18
Name: duration, dtype: float64

## Q3. Keep only records where duration was between 1 and 60 minutes. What fraction of the records left after you dropped the outliers?

In [9]:
#Percent of Jan 2023 trips between 1 and 60 minutes
((df_train.duration >=1) & (df_train.duration <= 60)).mean()

0.9812202822125979

In [10]:
# Subset trips between 1 and 60 minutes
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)]
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]

## Q4. Apply one-hot encoding. What's the dimensionality of this matrix (number of columns)?

In [11]:
# Define categorical variables and make values strings
categorical = ["PULocationID", "DOLocationID"]
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

In [12]:
dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [13]:
# What's the dimensionality of this matrix (number of columns)?
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

## Q5. Use the feature matrix from the previous step to train a model. What's the RMSE on train?

In [14]:
#Train a plain linear regression model with default parameters

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [15]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

In [16]:
# Calculate the RMSE of the model on the training data
mse = mean_squared_error(y_train, y_pred, squared=False)
print('MSE: %f' % mse)

MSE: 7.649261


## Q6. Apply this model to the validation dataset (February 2023). What's the RMSE on validation?

In [17]:
lr = LinearRegression()
lr.fit(X_val, y_val)
y_pred = lr.predict(X_val)

In [18]:
# Calculate the RMSE of the model on the validation data
mse = mean_squared_error(y_val, y_pred, squared=False)
print('MSE: %f' % mse)

MSE: 7.779036
