In [1]:
import polars as pl
from sklearn.feature_extraction import DictVectorizer
from snapml import LinearRegression
from sklearn.metrics import mean_squared_error


## Q1. Read the data for January. How many columns are there?  


In [2]:
df = pl.read_parquet("yellow_tripdata_2022-01.parquet")
df

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0
1,2022-01-01 00:40:15,2022-01-01 01:09:48,1.0,10.3,1.0,"""N""",138,161,1,33.0,3.0,0.5,13.0,6.55,0.3,56.35,2.5,0.0
2,2022-01-01 00:20:50,2022-01-01 00:34:58,1.0,5.07,1.0,"""N""",233,87,1,17.0,0.5,0.5,5.2,0.0,0.3,26.0,2.5,0.0
2,2022-01-01 00:13:04,2022-01-01 00:22:45,1.0,2.02,1.0,"""N""",238,152,2,9.0,0.5,0.5,0.0,0.0,0.3,12.8,2.5,0.0
2,2022-01-01 00:30:02,2022-01-01 00:44:49,1.0,2.71,1.0,"""N""",166,236,1,12.0,0.5,0.5,2.25,0.0,0.3,18.05,2.5,0.0
2,2022-01-01 00:48:52,2022-01-01 00:53:28,1.0,0.78,1.0,"""N""",236,141,2,5.0,0.5,0.5,0.0,0.0,0.3,8.8,2.5,0.0


In [3]:
len(df.columns)

19

## Q2. Computing duration  
Now let's compute the duration variable. It should contain the duration of a ride in minutes.  
What's the standard deviation of the trips duration in January?

In [4]:
df1 = df.with_columns(((df["tpep_dropoff_datetime"]-df["tpep_pickup_datetime"]).cast(pl.Float64)/60000000000).alias("duration"))

In [5]:
df1["duration"].std()

46.445305137765594

## Q3. Dropping outliers
Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records left after you dropped the outliers?

In [6]:
df2 = df1.filter(pl.col("duration").is_between(1,60))

In [7]:
(len(df2))/len(df1)

0.9827547930522406

## Q4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries
- Fit a dictionary vectorizer
- Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [8]:
df3 = df2.select(["PULocationID","DOLocationID"])
df3 = df3.with_columns(
    pl.col("PULocationID").cast(str),
    pl.col("DOLocationID").cast(str)
)


In [9]:
data_list = df3.to_dicts()
data_list

[{'PULocationID': '142', 'DOLocationID': '236'},
 {'PULocationID': '236', 'DOLocationID': '42'},
 {'PULocationID': '166', 'DOLocationID': '166'},
 {'PULocationID': '114', 'DOLocationID': '68'},
 {'PULocationID': '68', 'DOLocationID': '163'},
 {'PULocationID': '138', 'DOLocationID': '161'},
 {'PULocationID': '233', 'DOLocationID': '87'},
 {'PULocationID': '238', 'DOLocationID': '152'},
 {'PULocationID': '166', 'DOLocationID': '236'},
 {'PULocationID': '236', 'DOLocationID': '141'},
 {'PULocationID': '141', 'DOLocationID': '229'},
 {'PULocationID': '114', 'DOLocationID': '90'},
 {'PULocationID': '234', 'DOLocationID': '113'},
 {'PULocationID': '246', 'DOLocationID': '79'},
 {'PULocationID': '43', 'DOLocationID': '140'},
 {'PULocationID': '239', 'DOLocationID': '151'},
 {'PULocationID': '148', 'DOLocationID': '141'},
 {'PULocationID': '237', 'DOLocationID': '107'},
 {'PULocationID': '7', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '263'},
 {'PULocationID': '263', 'DOLoc

In [10]:
vectorizer = DictVectorizer(sparse=False)
train_x = vectorizer.fit_transform(data_list)

In [11]:
train_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
train_x.shape[1]

515

## Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.  

Train a plain linear regression model with default parameters  
Calculate the RMSE of the model on the training data  
What's the RMSE on train?  


In [13]:

train_y = df2.select("duration").to_numpy()
train_y

array([[17.81666667],
       [ 8.4       ],
       [ 8.96666667],
       ...,
       [11.        ],
       [12.05      ],
       [27.        ]])

In [14]:
reg = LinearRegression()
reg.fit(train_x,train_y)

In [15]:
y_pred = reg.predict(train_x)

In [16]:
mean_squared_error(train_y, y_pred, squared=False)

6.986336777700631

## Q6. Evaluating the model
Now let's apply this model to the validation dataset (February 2022).

What's the RMSE on validation?

In [17]:
df_feb = pl.read_parquet("yellow_tripdata_2022-02.parquet")
df_feb = df_feb.with_columns(((df_feb["tpep_dropoff_datetime"]-df_feb["tpep_pickup_datetime"]).cast(pl.Float64)/60000000000).alias("duration"))
df_feb = df_feb.with_columns(
    pl.col("PULocationID").cast(str),
    pl.col("DOLocationID").cast(str)
)
df_feb = df_feb.filter(pl.col("duration").is_between(1,60))
val_data_list = df_feb.select(["PULocationID","DOLocationID"]).to_dicts()


In [18]:
val_x = vectorizer.transform(val_data_list)
val_y = df_feb.select("duration").to_numpy()

In [19]:
val_y_predict = reg.predict(val_x)

In [20]:
mean_squared_error(val_y, val_y_predict, squared=False)

7.785147563799474