In [3]:
!pip install -r requirements.txt

Collecting pyarrow (from -r requirements.txt (line 3))
  Using cached pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Using cached pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (39.9 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-17.0.0


In [1]:
import pandas as pd

In [4]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [5]:
df.shape

(3066766, 19)

## Q1. Downloading the data

Read the data for January. How many columns are there?

In [6]:
len(df.columns)

19

## Q2. Computing duration
What's the standard deviation of the trips duration in January?

In [7]:
df["duration"] = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
df["duration"] = df["duration"].dt.total_seconds() / 60
df["duration"].head()

0     8.433333
1     6.316667
2    12.750000
3     9.616667
4    10.833333
Name: duration, dtype: float64

In [8]:
df["duration"].std()

np.float64(42.59435124195458)

## Q3. Dropping outliers
What fraction of the records left after you dropped the outliers?

In [9]:
(len(df[(df.duration<=60) & (df.duration>=1)]) / df.shape[0]) * 100

98.1220282212598

In [10]:
df = df[(df.duration<=60) & (df.duration>=1)].copy()

## Q4. One-hot encoding
What's the dimensionality of this matrix (number of columns)?

In [11]:
from sklearn.feature_extraction import DictVectorizer

In [12]:
categoricals = ['PULocationID', 'DOLocationID']

In [13]:
df[categoricals] = df[categoricals].astype(str)

In [14]:
df_enc = df[categoricals].to_dict(orient="records")

In [15]:
df_enc[:5]

[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'}]

In [16]:
df[categoricals].head()

Unnamed: 0,PULocationID,DOLocationID
0,161,141
1,43,237
2,48,238
3,138,7
4,107,79


In [17]:
vectorizer = DictVectorizer()

In [18]:
X_train = vectorizer.fit_transform(df_enc)

In [19]:
print(f"shape: {X_train.shape}")

shape: (3009173, 515)


## Q5. Training a model

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [21]:
target = 'duration'
y_train = df[target].values

In [22]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [23]:
y_pred = lr.predict(X_train)

rmse = root_mean_squared_error(y_train, y_pred)

In [24]:
print(f"Train rmse: {rmse}")

Train rmse: 7.649261932106969


## Q6. Evaluating the model

In [25]:
def remove_outliers(df):
    """
    Add duration and remove outliers
    """
    df["duration"] = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = df["duration"].dt.total_seconds() / 60

    return df[(df.duration<=60) & (df.duration>=1)].copy()    

In [26]:
df_raw_eval = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [27]:
df_raw_eval = remove_outliers(df_raw_eval)

In [28]:
val_dicts = df_raw_eval[categoricals].to_dict(orient='records')

In [29]:
X_val = vectorizer.transform(val_dicts) 
y_val = df_raw_eval.duration.values

In [30]:
y_pred = lr.predict(X_val)

In [31]:
print(f'test rmse: {root_mean_squared_error(y_val, y_pred)}')

test rmse: 13.3223404359502
