In [1]:
import pandas as pd
import numpy as np
import datetime
import sklearn

In [2]:
sklearn.__version__

'1.6.0'

In [3]:
df = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet")

In [4]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


### Q1: number of columns?

In [5]:
len(df.columns)

19

In [6]:
type(df["tpep_pickup_datetime"])

pandas.core.series.Series

### Q2: std of duration?

In [7]:
df["duration"] = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]

In [8]:
df["duration"] = df["duration"].dt.total_seconds() / 60

In [9]:
df["duration"].std()

np.float64(34.851053592212814)

### Q3: fraction of records left after filtering outliers?

In [10]:
filter_df = df[df["duration"] < 60]

In [11]:
filter_df.shape[0] * 100 / df.shape[0]

98.96597342529778

### Q4: dimensionality of one-hot encoding matrix?

In [12]:
filter_df.loc[:, "PULocationID"] = filter_df["PULocationID"].astype(str)
filter_df.loc[:, "DOLocationID"] = filter_df["DOLocationID"].astype(str)

  filter_df.loc[:, "PULocationID"] = filter_df["PULocationID"].astype(str)
  filter_df.loc[:, "DOLocationID"] = filter_df["DOLocationID"].astype(str)


In [13]:
feature_df = filter_df[["PULocationID", "DOLocationID"]].drop_duplicates().reset_index(drop=True)

In [14]:
feature_df

Unnamed: 0,PULocationID,DOLocationID
0,186,79
1,140,236
2,236,79
3,79,211
4,211,148
...,...,...
24123,62,180
24124,115,115
24125,79,101
24126,106,13


In [15]:
feature_dictls = feature_df.to_dict('records')

In [16]:
feature_dictls

[{'PULocationID': '186', 'DOLocationID': '79'},
 {'PULocationID': '140', 'DOLocationID': '236'},
 {'PULocationID': '236', 'DOLocationID': '79'},
 {'PULocationID': '79', 'DOLocationID': '211'},
 {'PULocationID': '211', 'DOLocationID': '148'},
 {'PULocationID': '148', 'DOLocationID': '141'},
 {'PULocationID': '138', 'DOLocationID': '181'},
 {'PULocationID': '246', 'DOLocationID': '231'},
 {'PULocationID': '161', 'DOLocationID': '261'},
 {'PULocationID': '113', 'DOLocationID': '113'},
 {'PULocationID': '107', 'DOLocationID': '137'},
 {'PULocationID': '158', 'DOLocationID': '246'},
 {'PULocationID': '246', 'DOLocationID': '190'},
 {'PULocationID': '68', 'DOLocationID': '90'},
 {'PULocationID': '90', 'DOLocationID': '68'},
 {'PULocationID': '132', 'DOLocationID': '216'},
 {'PULocationID': '164', 'DOLocationID': '79'},
 {'PULocationID': '237', 'DOLocationID': '237'},
 {'PULocationID': '141', 'DOLocationID': '263'},
 {'PULocationID': '161', 'DOLocationID': '263'},
 {'PULocationID': '263', 'DO

In [17]:
from sklearn.feature_extraction import DictVectorizer

In [18]:
vec = DictVectorizer()

In [19]:
vec.fit_transform(feature_dictls).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(24128, 520))

In [42]:
vec.get_feature_names_out()

array(['DOLocationID=1', 'DOLocationID=10', 'DOLocationID=100',
       'DOLocationID=101', 'DOLocationID=102', 'DOLocationID=105',
       'DOLocationID=106', 'DOLocationID=107', 'DOLocationID=108',
       'DOLocationID=109', 'DOLocationID=11', 'DOLocationID=111',
       'DOLocationID=112', 'DOLocationID=113', 'DOLocationID=114',
       'DOLocationID=115', 'DOLocationID=116', 'DOLocationID=117',
       'DOLocationID=118', 'DOLocationID=119', 'DOLocationID=12',
       'DOLocationID=120', 'DOLocationID=121', 'DOLocationID=122',
       'DOLocationID=123', 'DOLocationID=124', 'DOLocationID=125',
       'DOLocationID=126', 'DOLocationID=127', 'DOLocationID=128',
       'DOLocationID=129', 'DOLocationID=13', 'DOLocationID=130',
       'DOLocationID=131', 'DOLocationID=132', 'DOLocationID=133',
       'DOLocationID=134', 'DOLocationID=135', 'DOLocationID=136',
       'DOLocationID=137', 'DOLocationID=138', 'DOLocationID=139',
       'DOLocationID=14', 'DOLocationID=140', 'DOLocationID=141',
  

520

Get feature matrix

In [21]:
full_data_dicts = filter_df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

In [22]:
feature_matrix = vec.transform(full_data_dicts)

### Q5: train a model and get RMSE

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [24]:
duration = filter_df["duration"].to_numpy()

In [25]:
reg = LinearRegression().fit(feature_matrix, duration)

In [26]:
predictions = reg.predict(feature_matrix)

In [27]:
mse = mean_squared_error(duration, predictions)

In [28]:
rmse = np.sqrt(mse)

In [29]:
rmse

np.float64(8.24137308767574)

### Q6: apply model to validation set and get score

In [30]:
val_df = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet")

In [31]:
val_df["duration"] = val_df["tpep_dropoff_datetime"] - val_df["tpep_pickup_datetime"]

In [32]:
val_df["duration"] = val_df["duration"].dt.total_seconds() / 60

In [33]:
filter_val_df = val_df[val_df["duration"] < 60]

In [34]:
filter_val_df.loc[:, "PULocationID"] = filter_val_df["PULocationID"].astype(str)
filter_val_df.loc[:, "DOLocationID"] = filter_val_df["DOLocationID"].astype(str)

  filter_val_df.loc[:, "PULocationID"] = filter_val_df["PULocationID"].astype(str)
  filter_val_df.loc[:, "DOLocationID"] = filter_val_df["DOLocationID"].astype(str)


In [35]:
val_data_dicts = filter_val_df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

In [36]:
val_feature_matrix = vec.transform(val_data_dicts)

In [37]:
val_duration = filter_val_df["duration"].to_numpy()

In [38]:
val_predictions = reg.predict(val_feature_matrix)

In [39]:
mse = mean_squared_error(val_duration, val_predictions)

In [40]:
np.sqrt(mse)

np.float64(8.39373788463708)