In [None]:
print("MLOps Zoomcamp Module 1")
!pip install pyarrow

In [None]:
import pandas as pd

# Columns used as model features: the pickup and dropoff location IDs.
# Later cells select these columns through this name, so the (misspelled)
# identifier is kept as-is.
catagorical = ["PULocationID", "DOLocationID"]

def load_data_file(file):
    '''
    Read one parquet file of trip records and prepare it for modelling.

    Shared by the training and validation frames so both are built the same
    way: a "duration" column (minutes, rounded to 2 decimals) is added and the
    pickup/dropoff location IDs are converted to strings.
    '''
    trips = pd.read_parquet(file)

    # Ride length in minutes, derived from the dropoff and pickup timestamps.
    elapsed = trips["tpep_dropoff_datetime"] - trips["tpep_pickup_datetime"]
    trips["duration"] = (elapsed.dt.total_seconds() / 60).round(2)

    # The location IDs have to be strings (or categories) for the
    # DictVectorizer's one-hot encoding to apply; kept as numbers they would
    # not be one-hot encoded.
    pickup_count = len(trips["PULocationID"].unique())
    dropoff_count = len(trips["DOLocationID"].unique())
    print(f"Pick up locations: {pickup_count}")
    print(f"Drop off up locations: {dropoff_count}")
    for column in ("PULocationID", "DOLocationID"):
        trips[column] = trips[column].astype("str")

    return trips

In [None]:
# Training frame: the January 2022 file, with `duration` added by the loader.
df_train = load_data_file(file="./data/yellow_tripdata_2022-01.parquet")
df_train.head(n=5)

In [None]:
# Validation frame: the February 2022 file, prepared exactly like the training one.
df_validate = load_data_file(file="./data/yellow_tripdata_2022-02.parquet")
df_validate.head(n=5)

## Q1. Downloading the data

We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page),
but instead of "**Green** Taxi Trip Records", we'll use "**Yellow** Taxi Trip Records".

Download the data for January and February 2022.

Read the data for January. How many columns are there?

In [None]:
# Report the size and column types of both monthly frames.
# A bare expression is only rendered when it is the last statement of a cell,
# so the January dtypes used to be silently discarded; printing them inside
# the loop makes both months visible.
# NOTE: both frames come from load_data_file, so the column count includes the
# derived `duration` column on top of the columns stored in the parquet file.
for month, frame in (("January", df_train), ("February", df_validate)):
    print(f"{month} 2022 has {frame.shape[1]} columns")
    print(f"Total number of rows: {frame.shape[0]}")
    print(frame.dtypes)

## Q2. Computing duration

Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 

What's the standard deviation of the trip durations in January?

* 41.45
* **46.45**
* 51.45
* 56.45

In [None]:
# Spread of the trip durations in minutes. When the notebook is run top to
# bottom this is computed before any outlier filtering.
duration_std = df_train["duration"].std()
print(f"The standard deviation of the trips duration in January 2022 is {round(duration_std, 2)}")
df_train.head()

## Q3. Dropping outliers

Next, we need to check the distribution of the `duration` variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records is left after you drop the outliers?

* 90%
* 92%
* 95%
* **98%**

In [None]:
import seaborn as sns

def remove_outliers(df, lower_limit=1, upper_limit=60):
    '''
    Keep only the rows whose "duration" lies within [lower_limit, upper_limit].

    With the defaults, anything shorter than 1 minute or longer than 1 hour is
    treated as an outlier (durations are in minutes). Both bounds are
    inclusive. Rows with a missing duration match neither comparison and are
    therefore kept.

    Parameters
    ----------
    df : DataFrame with a numeric "duration" column. It is not modified.
    lower_limit : smallest duration that is still kept (default 1).
    upper_limit : largest duration that is still kept (default 60).

    Returns
    -------
    (percent_left, filtered_df) where percent_left is the percentage (0-100)
    of the input rows that survive, or 0.0 for an empty input, and filtered_df
    is a new frame with a fresh 0..n-1 index.
    '''
    num_rows = df.shape[0]
    print(f"Total number of rows: {num_rows}")

    outliers_lower_limit = df["duration"] < lower_limit
    outliers_upper_limit = df["duration"] > upper_limit
    print(f"Dropping lower limit outliers: {int(outliers_lower_limit.sum())}")
    print(f"Dropping upper limit outliers: {int(outliers_upper_limit.sum())}")

    # Filter once with the combined mask. Applying the two masks one after the
    # other indexes an already-filtered frame with a mask built on the original
    # index, which makes pandas warn that the boolean key is being reindexed.
    is_outlier = outliers_lower_limit | outliers_upper_limit
    # reset_index returns a new frame; the previous bare `df.reindex()` call
    # discarded its result and left the gaps in the index.
    kept = df[~is_outlier].reset_index(drop=True)

    # Guard against an empty input, which would otherwise divide by zero.
    percent_left = len(kept) / num_rows * 100 if num_rows else 0.0
    return percent_left, kept


# Box plot of the durations as loaded, drawn before df_train is replaced below.
sns.boxplot(df_train["duration"])
# NOTE: df_train is rebound to the filtered frame here, so re-running this cell
# measures the already-filtered data instead of the raw month.
percent_left, df_train = remove_outliers(df_train)
rounded_percent = round(percent_left, 1)
print(f"Fraction of the records left after you dropped the outliers: {rounded_percent}%")

In [None]:
# Apply the same duration filter to the validation month. The unused
# percentage gets a real name because `_` is IPython's "last output" shortcut,
# and assigning to it stops IPython from updating that shortcut.
percent_left_validate, df_validate = remove_outliers(df_validate)


In [None]:
# Box plot of the training durations again; with the cells run in order,
# df_train now holds the frame returned by remove_outliers.
sns.boxplot(df_train["duration"])

## Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 

* Turn the dataframe into a list of dictionaries
* Fit a dictionary vectorizer 
* Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

* 2
* 155
* 345
* **515**
* 715

In [None]:
from sklearn.feature_extraction import DictVectorizer # <== use this to create a sparse one hot encoding of categorical variables
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Restrict the training frame to the location-ID features and turn every row
# into a {column: value} dict, the input format DictVectorizer works on.
df_train_convert = df_train.loc[:, catagorical]
print(df_train_convert.dtypes)
train_dicts = df_train_convert.to_dict(orient='records')
print(f"train_dicts looks like: {train_dicts[1]}")

# Fit the vectorizer on the training dicts: string-valued (categorical)
# features are one-hot encoded, other features are passed through unchanged.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
# To inspect a single encoded row, densify only a small slice,
# e.g. X_train[:1].toarray().
print(f"Shape of the sparse matrix (row, col): {X_train.shape}")


## Q5. Training a model

Now let's use the feature matrix from the previous step to train a model. 

* Train a plain linear regression model with default parameters 
* Calculate the RMSE of the model on the training data

What's the RMSE on train?

* **6.99**
* 11.99
* 16.99
* 21.99

In [None]:
# remember the duration we calcluated from start and end time? We can use it
# as a target of the prediction... 
# The duration computed from the pickup and dropoff timestamps is the
# regression target.
y_train = df_train["duration"].values

lr = LinearRegression()
lr.fit(X_train, y_train)  # fits on every row of the training matrix

print(f"shape: {X_train.shape}")
y_pred = lr.predict(X_train)

# RMSE between the true durations (y_train) and the predictions (y_pred).
# The square root is taken explicitly instead of passing `squared=False`,
# which newer scikit-learn releases deprecated and then removed.
rmse_train = mean_squared_error(y_train, y_pred) ** 0.5
print(f"RMSE: {round(rmse_train, 4)} mins")

In [None]:
# Distribution of the model's predictions, followed by the distribution of the
# actual training durations, for a visual comparison.
# NOTE: y_pred is reassigned by the validation cell further down, so this cell
# shows training predictions only when the notebook is run in order.
sns.displot(y_pred)
sns.displot(y_train)

## Q6. Evaluating the model

Now let's apply this model to the validation dataset (February 2022). 

What's the RMSE on validation?

* **7.79**
* 12.79
* 17.79
* 22.79

In [None]:
# Build the validation feature dicts from the same location-ID columns that
# were used for training.
df_validation = df_validate[catagorical]
print(df_validation.dtypes)
vali_dicts = df_validation.to_dict(orient='records')

# Reuse the vectorizer fitted on the training data (transform, not
# fit_transform) so the validation matrix has the same columns as X_train.
X_vali = dv.transform(vali_dicts)

print(f"shape: {X_vali.shape}")
# NOTE: this rebinds y_pred, which previously held the training predictions.
y_pred = lr.predict(X_vali)

y_val = df_validate["duration"].values
# The square root is taken explicitly instead of passing `squared=False`,
# which newer scikit-learn releases deprecated and then removed.
rmse_val = mean_squared_error(y_val, y_pred) ** 0.5
print(f"RMSE: {round(rmse_val, 4)} mins")

## Homework

The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module.






## Submit the results

* Submit your results here: https://forms.gle/uYTnWrcsubi2gdGV7
* You can submit your solution multiple times. In this case, only the last submission will be used
* If your answer doesn't match options exactly, select the closest one


## Deadline

The deadline for submitting is 23 May 2023 (Tuesday), 23:00 CEST (Berlin time). 

After that, the form will be closed.