In [1]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Directory holding the FHV trip-record parquet files. It lives in one constant so the
# notebook can be pointed at a different location by editing a single line.
DATA_DIR = "/home/ibra/python-projects/mlops-zoomcamp/data"

# The 2021-01 file becomes the training frame and the 2021-02 file the validation frame.
df_train = pd.read_parquet(f"{DATA_DIR}/fhv_tripdata_2021-01.parquet")
df_val = pd.read_parquet(f"{DATA_DIR}/fhv_tripdata_2021-02.parquet")

In [3]:
# Q1: How many records are there in January?
# Row count of the January frame as loaded, i.e. before any duration filtering.
len(df_train)

1154112

In [4]:
# Trip duration in minutes for both splits.
# Subtracting the two datetime columns gives a timedelta column, so the vectorised
# `.dt.total_seconds()` accessor is used instead of calling a Python lambda once per row.
df_train["duration"] = (df_train.dropOff_datetime - df_train.pickup_datetime).dt.total_seconds() / 60
df_val["duration"] = (df_val.dropOff_datetime - df_val.pickup_datetime).dt.total_seconds() / 60

In [5]:
# Q2: What's the average trip duration in January?
# Mean of the per-trip duration (in minutes) over the January frame.
df_train["duration"].mean()

19.167224093791006

In [6]:
# How many records did you drop?
# Counts the January trips whose duration lies outside the inclusive 1-60 minute window.
# `between` is inclusive on both ends by default, so its negation marks exactly those rows.
(~df_train.duration.between(1, 60)).sum()

44286

In [7]:
# Keep only trips that lasted between 1 and 60 minutes (inclusive) in both splits.
# `.copy()` detaches each filtered frame from the frame it was selected from, so the column
# assignments made on these frames further down do not raise pandas' SettingWithCopyWarning.
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)].copy()
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()

In [8]:
# Q3: What's the fraction of missing values for the PUlocationID?
df_train.PUlocationID.isna().mean()

0.8352732770722617

In [9]:
# Replace missing pickup / drop-off location IDs with the string "-1" in both splits.
for frame in (df_train, df_val):
    for column in ("PUlocationID", "DOlocationID"):
        frame[column] = frame[column].fillna("-1")

In [10]:
# Columns used as categorical features.
categorical = ["PUlocationID", "DOlocationID"]

# Cast those columns to str in both splits so every value is a string.
for frame in (df_train, df_val):
    frame[categorical] = frame[categorical].astype(str)

In [11]:
# One {column: value} dict per row, built from the categorical columns of each split.
train_dicts = df_train[categorical].to_dict(orient="records")
val_dicts = df_val[categorical].to_dict(orient="records")

# The vectorizer is fitted on the training dicts only; the validation dicts are then
# transformed with that same fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [12]:
# Q4: What's the dimensionality of this matrix? (The number of columns)
# Echoing the matrix prints its repr; the "<rows>x<columns>" part of that repr holds the answer.
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [13]:
# Name of the column the model is trained to predict.
target = "duration"

# Target values of each split as NumPy arrays.
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [14]:
# Create a linear regression model and fit it on the training feature matrix and targets.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [15]:
# Predictions for the same rows the model was fitted on.
y_pred = lr.predict(X_train)

# Q5: What's the RMSE on train?
# The square root of the MSE is taken explicitly instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
mean_squared_error(y_train, y_pred) ** 0.5

10.52851938944385

In [16]:
# Predictions for the validation rows, which the model was not fitted on.
y_pred = lr.predict(X_val)

# Q6: What's the RMSE on validation?
# The square root of the MSE is taken explicitly instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
mean_squared_error(y_val, y_pred) ** 0.5

11.014286426107942