In [1]:
!python -V

Python 3.11.7


In [2]:
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error

In [3]:
def read_dataframe(filename):
    taxi_trip_jan = pd.read_parquet(filename)

    
    taxi_trip_jan.lpep_dropoff_datetime = pd.to_datetime(taxi_trip_jan.lpep_dropoff_datetime)
    taxi_trip_jan.lpep_pickup_datetime = pd.to_datetime(taxi_trip_jan.lpep_pickup_datetime)
    
    taxi_trip_jan['duration'] = taxi_trip_jan.lpep_dropoff_datetime - taxi_trip_jan.lpep_pickup_datetime
    
    taxi_trip_jan.duration = taxi_trip_jan.duration.apply(lambda x : x.total_seconds()/60)
    
    
    taxi_trip_jan = taxi_trip_jan[(taxi_trip_jan.duration >=1) & (taxi_trip_jan.duration <= 60)]
    
        
    categorical = ['PULocationID', 'DOLocationID']
    
    taxi_trip_jan[categorical]=  taxi_trip_jan[categorical].astype(str)
    return taxi_trip_jan
    

In [4]:
df_train = read_dataframe('./data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/green_tripdata_2021-02.parquet')

In [5]:
len(df_train), len(df_val)

(73908, 61921)

In [6]:
len(df_train.columns)

21

In [7]:
len(df_val.columns)

21

In [9]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

train_dict = df_train[categorical+numerical].to_dict(orient="records")
val_dict = df_val[categorical+numerical].to_dict(orient="records")
# Combine the training and validation data
combined_data = pd.concat([df_train, df_val], ignore_index=True)

# Create a dictionary of features
combined_dict = combined_data[categorical + numerical].to_dict(orient="records")

# Fit the DictVectorizer on the combined data
dv = DictVectorizer()
dv.fit(combined_dict)

# Transform the training and validation data separately
X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

In [10]:
X_val

<61921x512 sparse matrix of type '<class 'numpy.float64'>'
	with 185763 stored elements in Compressed Sparse Row format>

In [11]:
target = 'duration'

y_train = df_train[target].values

y_val = df_val[target].values

In [12]:


lr = LinearRegression()

lr.fit(X_train,y_train)

y_pred = lr.predict(X_val)

loss = mean_squared_error(y_val, y_pred, squared=False)

loss

10.499110709512484

In [13]:

lr = Lasso()

lr.fit(X_train,y_train)

y_pred = lr.predict(X_val)

loss = mean_squared_error(y_val, y_pred, squared=False)

loss

12.212583224318818

In [15]:
import pickle

In [18]:
with open("models/lin_reg","wb") as f_out:
    pickle.dump((dv,lr),f_out)

In [20]:
from joblib import dump, load
dump((dv,lr), 'linear_regression.joblib') 

['linear_regression.joblib']