In [2]:
import pandas as pd

### Q1. Downloading the data

In [3]:
# January 2023 trips become the training frame, February 2023 the evaluation frame.
df_train, df_eval = map(
    pd.read_parquet,
    ("data/yellow_tripdata_2023-01.parquet", "data/yellow_tripdata_2023-02.parquet"),
)

In [4]:
# Number of columns in the raw training data (use the len() builtin, not the dunder).
len(df_train.columns)

19

### Q2. Computing duration

In [5]:
# Display the training frame to inspect its columns and a sample of rows.
df_train

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.30,1.00,0.5,0.00,0.0,1.0,14.30,2.5,0.00
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.10,1.0,N,43,237,1,7.90,1.00,0.5,4.00,0.0,1.0,16.90,2.5,0.00
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.90,1.00,0.5,15.00,0.0,1.0,34.90,2.5,0.00
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.90,1.0,N,138,7,1,12.10,7.25,0.5,0.00,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.40,1.00,0.5,3.28,0.0,1.0,19.68,2.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066761,2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,15.80,0.00,0.5,3.96,0.0,1.0,23.76,,
3066762,2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.80,,,112,75,0,22.43,0.00,0.5,2.64,0.0,1.0,29.07,,
3066763,2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,17.61,0.00,0.5,5.32,0.0,1.0,26.93,,
3066764,2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,18.15,0.00,0.5,4.43,0.0,1.0,26.58,,


In [6]:
# Elapsed time of every trip: drop-off timestamp minus pickup timestamp.
df_train["duration"] = df_train["tpep_dropoff_datetime"].sub(df_train["tpep_pickup_datetime"])

In [7]:
# Trip duration in minutes.
# Computed with the vectorized .dt accessor instead of a per-row Python lambda,
# and derived directly from the timestamp columns so the cell can be re-run
# safely (it no longer depends on "duration" still being a timedelta).
df_train["duration"] = (
    df_train["tpep_dropoff_datetime"] - df_train["tpep_pickup_datetime"]
).dt.total_seconds() / 60

In [8]:
# Spread of the trip durations (in minutes) before any outlier filtering.
df_train["duration"].std()

42.59435124195458

### Q3. Dropping outliers

In [9]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() detaches the filtered rows from the original frame so the result can
# be modified later without pandas raising SettingWithCopyWarning.
left = df_train[df_train["duration"].between(1, 60)].copy()
# Percentage of records that survive the filter.
len(left) / len(df_train) * 100

98.1220282212598

In [10]:
# From here on, train only on the trips that passed the duration filter.
df_train = left

### Q4. One-hot encoding



In [12]:
# Pickup and drop-off zone IDs are the categorical features to one-hot encode.
categorical = ["PULocationID", "DOLocationID"]
# Cast the IDs to strings. astype returns a new DataFrame, so no values are
# written into a filtered slice (avoids SettingWithCopyWarning) and the cell
# is safe to re-run.
df_train = df_train.astype({col: str for col in categorical})

In [13]:
from sklearn.feature_extraction import DictVectorizer

# Vectorizer that turns the per-trip feature dicts into a feature matrix.
dv = DictVectorizer()

In [14]:
# One dict per trip, holding only the categorical columns.
train_dicts = df_train.loc[:, categorical].to_dict(orient="records")
# Learn the feature vocabulary from the training data and encode it in one step.
X_train = dv.fit_transform(X=train_dicts)

In [15]:
# Number of columns (features) in the encoded training matrix.
X_train.shape[1]

515

### Q5. Training a model



In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Plain linear regression with default settings.
lr = LinearRegression()

# Fit on the encoded location features; the target is the trip duration.
lr.fit(X=X_train, y=df_train["duration"])

LinearRegression()

In [17]:
# Predictions on the same data the model was fitted on (used for the training error).
y_pred = lr.predict(X_train)

In [18]:
# Training RMSE. The `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root of the MSE explicitly; this form works
# on every scikit-learn version.
mean_squared_error(df_train.duration, y_pred) ** 0.5

7.649261927686161

### Q6. Evaluating the model



In [19]:
# Prepare the February data with the same steps that were applied to the training set.
# Duration in minutes, computed with vectorized datetime arithmetic rather than a per-row lambda.
df_eval["duration"] = (
    df_eval["tpep_dropoff_datetime"] - df_eval["tpep_pickup_datetime"]
).dt.total_seconds() / 60
# Keep trips of 1-60 minutes (inclusive) and cast the location IDs to strings.
# astype returns a new frame, so nothing is assigned into a filtered slice
# (avoids SettingWithCopyWarning).
df_eval = df_eval[df_eval["duration"].between(1, 60)].astype(
    {col: str for col in categorical}
)
eval_dicts = df_eval[categorical].to_dict(orient="records")
# transform, not fit_transform: reuse the vocabulary learned from the training data.
X_eval = dv.transform(eval_dicts)
# NOTE: despite its name, y_eval holds the model's predictions, not the true durations.
y_eval = lr.predict(X_eval)

In [20]:
# Evaluation RMSE. The `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root of the MSE explicitly; this form works
# on every scikit-learn version.
mean_squared_error(df_eval.duration, y_eval) ** 0.5

7.811817957524739