In [1]:
import pandas as pd
import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
yellow_jan_2023 = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')

In [3]:
num_records = yellow_jan_2023.shape[0]
num_cols = yellow_jan_2023.shape[1]
print('Columns:', num_cols)
print('Records:', num_records)

Columns: 19
Records: 3066766


In [5]:
yellow_jan_2023['duration'] = yellow_jan_2023.tpep_dropoff_datetime - yellow_jan_2023.tpep_pickup_datetime
yellow_jan_2023.duration = yellow_jan_2023.duration.apply(lambda td: td.total_seconds() / 60)

In [7]:
yellow_jan_2023.duration.describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [8]:
yellow_jan_2023 = yellow_jan_2023[(yellow_jan_2023.duration >= 1) & (yellow_jan_2023.duration <= 60)]

In [10]:
new_num_records = yellow_jan_2023.shape[0]
print('Percentage Records Remaining:', new_num_records / num_records)

Percentage Records Remaining: 0.9812202822125979


In [11]:
categorical = ['PULocationID', 'DOLocationID']

In [12]:
yellow_jan_2023[categorical] = yellow_jan_2023[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yellow_jan_2023[categorical] = yellow_jan_2023[categorical].astype(str)


In [13]:
yellow_jan_2023[categorical].dtypes

PULocationID    object
DOLocationID    object
dtype: object

In [14]:
list_dicts = yellow_jan_2023[categorical].to_dict(orient='records')

dv = DictVectorizer()
feature_matrix = dv.fit_transform(list_dicts)

feature_matrix_arr = feature_matrix.toarray()

dimensionality = feature_matrix_arr.shape
print('Dimensionality:', dimensionality)

Dimensionality: (3009173, 515)


In [15]:
target = 'duration'
y_train = yellow_jan_2023[target].values

In [16]:
lr = LinearRegression()
lr.fit(feature_matrix, y_train)

y_pred = lr.predict(feature_matrix)

root_mean_squared_error(y_train, y_pred)

7.6492619633678824

In [2]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime

    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
df_train = read_dataframe('./data/yellow_tripdata_2023-01.parquet')
df_val = read_dataframe('./data/yellow_tripdata_2023-02.parquet')

In [6]:
categorical = ['PULocationID', 'DOLocationID']
df_train[categorical].dtypes

PULocationID    object
DOLocationID    object
dtype: object

In [7]:
dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [8]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)

root_mean_squared_error(y_val, y_pred)

7.81181893596011