In [2]:
pip install pandas numpy scikit-learn pyarrow fastparquet


Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting pyarrow
  Downloading pyarrow-19.0.1-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-win_amd64.whl.metadata (4.3 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.9.1-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting fsspec (from fastparquet)
  Downloading fsspec-2025.2.0-py3-none-any.whl.metadata (11 kB)
Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
    ---------------------------------------


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: C:\Users\kanur\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
# MLOps Zoomcamp 2024 - Homework Notebook
#
# Fits a linear regression that predicts trip duration (in minutes) from the
# one-hot encoded pickup/dropoff location IDs of the January file, then reports
# the RMSE on the training split and on the February file.

from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

# Columns that are one-hot encoded and used as model features.
categorical = ['PULocationID', 'DOLocationID']

# Trips whose duration falls outside this inclusive range (minutes) are
# treated as outliers and dropped before training/validation.
MIN_DURATION_MINUTES = 1
MAX_DURATION_MINUTES = 60


def load_trips(path):
    """Read a trip-record parquet file into a DataFrame.

    Args:
        path: Location of the parquet file (str or Path).

    Returns:
        The file's contents as a pandas DataFrame, unmodified.

    Raises:
        FileNotFoundError: If ``path`` is not an existing file. The message
            includes the resolved absolute path so a wrong working directory
            is easy to spot.
    """
    path = Path(path)
    if not path.is_file():
        raise FileNotFoundError(
            f"Trip data file not found: '{path.resolve()}'. "
            "Download the parquet file to that location or update the path."
        )
    return pd.read_parquet(path)


def add_trip_duration(trips):
    """Return a copy of ``trips`` with parsed datetimes and a duration column.

    The pickup/dropoff columns are converted with ``pd.to_datetime`` and a new
    ``trip_duration`` column (dropoff minus pickup, in minutes) is appended.
    The input frame is not modified.
    """
    pickup = pd.to_datetime(trips['tpep_pickup_datetime'])
    dropoff = pd.to_datetime(trips['tpep_dropoff_datetime'])
    return trips.assign(
        tpep_pickup_datetime=pickup,
        tpep_dropoff_datetime=dropoff,
        trip_duration=(dropoff - pickup).dt.total_seconds() / 60,
    )


def keep_typical_trips(trips):
    """Drop duration outliers and cast the categorical columns to ``str``.

    Keeps rows whose ``trip_duration`` lies within
    [MIN_DURATION_MINUTES, MAX_DURATION_MINUTES]. The result is a new frame:
    ``astype`` returns a fresh object, so nothing is assigned into a slice of
    ``trips`` (which is what raises pandas' SettingWithCopyWarning).
    """
    duration = trips['trip_duration']
    in_range = (duration >= MIN_DURATION_MINUTES) & (duration <= MAX_DURATION_MINUTES)
    # String-typed IDs make DictVectorizer emit one indicator column per
    # distinct value instead of a single numeric column.
    return trips.loc[in_range].astype({column: str for column in categorical})


# Load the dataset
file_path = "yellow_tripdata_2023-01.parquet"
df = load_trips(file_path)

# Display basic info (counted before the derived trip_duration column is added)
print(f"Number of columns: {df.shape[1]}")

# Parse the pickup/dropoff timestamps and compute trip duration in minutes
df = add_trip_duration(df)

# Standard deviation of trip duration
std_duration = df['trip_duration'].std()
print(f"Standard deviation of trip duration: {std_duration:.2f}")

# Remove outliers (keep only trips between 1 and 60 minutes)
df_filtered = keep_typical_trips(df)
print(f"Fraction of records left: {len(df_filtered) / len(df):.2%}")

# One-hot encoding. The default sparse output is used on purpose: a dense
# matrix with one row per trip and one column per distinct location ID can
# exhaust memory, and LinearRegression accepts sparse input directly.
dv = DictVectorizer()
X = dv.fit_transform(df_filtered[categorical].to_dict(orient='records'))
print(f"Feature matrix dimensionality: {X.shape[1]}")

# Train-test split. Note: the hold-out part (X_val, y_val) is not used below;
# the validation RMSE at the end of this cell is computed on the February file.
y = df_filtered['trip_duration']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Compute RMSE on training data
y_pred_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print(f"RMSE on train: {rmse_train:.2f}")

# Load February dataset and apply exactly the same preprocessing as January
file_path_feb = "yellow_tripdata_2023-02.parquet"
df_feb = add_trip_duration(load_trips(file_path_feb))
df_feb_filtered = keep_typical_trips(df_feb)

# transform (not fit_transform): reuse the feature columns learned from January
X_val_feb = dv.transform(df_feb_filtered[categorical].to_dict(orient='records'))
y_val_feb = df_feb_filtered['trip_duration']

# Compute RMSE on validation data
y_pred_val = model.predict(X_val_feb)
rmse_val = np.sqrt(mean_squared_error(y_val_feb, y_pred_val))
print(f"RMSE on validation: {rmse_val:.2f}")


FileNotFoundError: [Errno 2] No such file or directory: 'yellow_tripdata_2023-01.parquet'