In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

# Q1. Downloading the data

In [3]:
df = pd.read_parquet('./data/yellow_tripdata_2022-01.parquet')

In [4]:
columns = df.shape[1]
columns

19

Answer: 19 columns

# Q2. Computing duration

In [5]:
df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime

In [6]:
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [7]:
df.duration.std()

46.44530513776499

Answer 46.45

# Q3. Dropping outliers

In [8]:
unfiltered_row_count = df.shape[0]

In [9]:
df = df[(df.duration >= 1) & (df.duration <= 60)]

In [10]:
outliers_removed_row_count = df.shape[0]

In [11]:
outliers_removed_row_count / unfiltered_row_count

0.9827547930522406

Answer: 98%

# Q4. One-hot encoding

In [12]:
categorical = ['PULocationID', 'DOLocationID']

In [13]:
df[categorical] = df[categorical].astype(str)

In [14]:
train_dicts = df[categorical].to_dict(orient='records')

In [15]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [16]:
X_train.shape[1]

515

Answer: 515

# Q5. Training a model

In [17]:
target = 'duration'
y_train = df[target].values

In [18]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [19]:
y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

6.986190837370544

Answer: 6.99

# Q6. Evaluating the model

In [20]:
df_validation = pd.read_parquet('./data/yellow_tripdata_2022-02.parquet')

In [21]:
df_validation['duration'] = df_validation.tpep_dropoff_datetime - df_validation.tpep_pickup_datetime
df_validation.duration = df_validation.duration.apply(lambda td: td.total_seconds() / 60)

In [22]:
df_validation = df[(df.duration >= 1) & (df.duration <= 60)]

In [23]:
df_validation[categorical] = df_validation[categorical].astype(str)

In [24]:
validation_dicts = df_validation[categorical].to_dict(orient='records')

In [25]:
X_validation = dv.transform(validation_dicts)

In [26]:
y_validation = df_validation[target].values
y_validation_pred = lr.predict(X_validation)
mean_squared_error(y_validation, y_validation_pred, squared=False)

6.986190837370544

Answer: 6.99