In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
df = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")

In [3]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.30,1.00,0.5,0.00,0.0,1.0,14.30,2.5,0.00
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.10,1.0,N,43,237,1,7.90,1.00,0.5,4.00,0.0,1.0,16.90,2.5,0.00
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.90,1.00,0.5,15.00,0.0,1.0,34.90,2.5,0.00
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.90,1.0,N,138,7,1,12.10,7.25,0.5,0.00,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.40,1.00,0.5,3.28,0.0,1.0,19.68,2.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066761,2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,15.80,0.00,0.5,3.96,0.0,1.0,23.76,,
3066762,2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.80,,,112,75,0,22.43,0.00,0.5,2.64,0.0,1.0,29.07,,
3066763,2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,17.61,0.00,0.5,5.32,0.0,1.0,26.93,,
3066764,2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,18.15,0.00,0.5,4.43,0.0,1.0,26.58,,


In [4]:
len(df.columns)

19

In [5]:
df['duration_minutes'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

In [6]:
df["duration_minutes"].std()

42.594351241920904

In [7]:
filtered_df = df[(df['duration_minutes'] >= 1) & (df['duration_minutes'] <= 60)]

In [8]:
fraction = len(filtered_df)/len(df)

In [9]:
fraction 

0.9812202822125979

# on-hot encoding

In [10]:
df['PULocationID'] = df['PULocationID'].astype(str)
df['DOLocationID'] = df['DOLocationID'].astype(str)


In [11]:
dicts = df[['PULocationID', 'DOLocationID']].to_dict(orient='records')


In [12]:
dicts

[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'},
 {'PULocationID': '161', 'DOLocationID': '137'},
 {'PULocationID': '239', 'DOLocationID': '143'},
 {'PULocationID': '142', 'DOLocationID': '200'},
 {'PULocationID': '164', 'DOLocationID': '236'},
 {'PULocationID': '141', 'DOLocationID': '107'},
 {'PULocationID': '234', 'DOLocationID': '68'},
 {'PULocationID': '79', 'DOLocationID': '264'},
 {'PULocationID': '164', 'DOLocationID': '143'},
 {'PULocationID': '138', 'DOLocationID': '33'},
 {'PULocationID': '33', 'DOLocationID': '61'},
 {'PULocationID': '79', 'DOLocationID': '186'},
 {'PULocationID': '90', 'DOLocationID': '48'},
 {'PULocationID': '113', 'DOLocationID': '255'},
 {'PULocationID': '237', 'DOLocationID': '239'},
 {'PULocationID': '143', 'DOLocationID': '229'},
 {'PULocationID': '137', 'DOLocat

In [12]:
dv = DictVectorizer()
feature_matrix = dv.fit_transform(dicts)

In [15]:
num_columns = feature_matrix.shape[1]

In [16]:
num_columns

518

In [13]:
y_train = df["duration_minutes"].values

In [21]:
feature_matrix

<3066766x518 sparse matrix of type '<class 'numpy.float64'>'
	with 6133532 stored elements in Compressed Sparse Row format>

In [14]:
lr = LinearRegression()
lr.fit(feature_matrix, y_train)

LinearRegression()

In [15]:
y_pred = lr.predict(feature_matrix)

In [16]:
y_pred

array([12.66326601, 12.61787382, 12.68170698, ..., 13.18101882,
       14.26554938, 12.77554415])

In [17]:
rmse = mean_squared_error(y_train, y_pred, squared=False) 


In [18]:
np.sqrt(rmse)

6.480470068365092

In [19]:
y = df['duration_minutes'].values

# Train the linear regression model
model = LinearRegression()
model.fit(feature_matrix, y)

# Make predictions
predictions = model.predict(feature_matrix)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y, predictions))

rmse

41.99649230697585

In [20]:
df_Feb = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

In [21]:
df_Feb['duration_minutes'] = (df_Feb['tpep_dropoff_datetime'] - df_Feb['tpep_pickup_datetime']).dt.total_seconds() / 60

In [22]:
df_Feb['PULocationID'] = df_Feb['PULocationID'].astype(str)
df_Feb['DOLocationID'] = df_Feb['DOLocationID'].astype(str)
dicts_feb = df_Feb[['PULocationID', 'DOLocationID']].to_dict(orient='records')


In [23]:
feature_matrix_feb = dv.transform(dicts_feb)

In [25]:
y_val =  df_Feb["duration_minutes"].values

In [24]:
predictions_feb = model.predict(feature_matrix_feb)


In [27]:
mean_squared_error(y_val, predictions_feb, squared=False)

42.24824555186226