In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [33]:
def read_data_set(file_path: str) -> pd.DataFrame:
    """Load and preprocess taxi trip data from CSV or Parquet file.

    Args:
        file_path: Path to input file (must be .csv or .parquet)

    Returns:
        Preprocessed DataFrame with:
        - Valid trip durations (1-60 minutes)
        - Categorical location IDs as strings
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_parquet
    df = reader(file_path)

    # Calculate and filter duration
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    df = df[df['duration'].between(1, 60)]

    # Convert to categorical
    df[['PULocationID', 'DOLocationID']] = df[['PULocationID', 'DOLocationID']].astype(str)

    return df

In [2]:
df = pd.read_parquet('yellow_tripdata_2023-01.parquet')

In [8]:
# Q1 : TOTAL NUMBER OF COLUMNS
len(df.columns)

19

In [12]:
# Q2: COMPUTING DURATION
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
df['duration'].std()

np.float64(42.59435124195458)

In [14]:
# Q3: OUTLIERS
df_no_outliers = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
df_no_outliers.shape[0] / df.shape[0]

0.9812202822125979

In [23]:
# Q4: ONE-HOT ENCODING
categorical = ['PULocationID', 'DOLocationID']

df_no_outliers[categorical] = df_no_outliers[categorical].astype(str)

train_dicts = df_no_outliers[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outliers[categorical] = df_no_outliers[categorical].astype(str)


In [21]:
# Q4: ONE-HOT ENCODING
X_train.shape

(3009173, 515)

In [22]:
## Q5: TRAINING MODEL
from sklearn.metrics import root_mean_squared_error

target = 'duration'
y_train = df_no_outliers[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

root_mean_squared_error(y_train, y_pred)

7.649261931416412

In [34]:
## Q6: RMSE VALIDATION
df_val = read_data_set('yellow_tripdata_2023-02.parquet')

In [35]:
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [36]:
y_val = df_val[target].values

In [None]:
y_pred = lr.predict(X_val)

root_mean_squared_error(y_val, y_pred)