In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Q1 Downloading the data
#
# Read the January and February 2023 yellow-taxi trip files straight from their
# public parquet URLs, then report how many columns the January file has.

# Single template so the two monthly URLs cannot drift apart.
TRIP_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet"

yellow_jan_2023_url = TRIP_DATA_URL.format(month="2023-01")
yellow_feb_2023_url = TRIP_DATA_URL.format(month="2023-02")

df_jan = pd.read_parquet(yellow_jan_2023_url)
df_feb = pd.read_parquet(yellow_feb_2023_url)

# Column count of the January frame (second element of the shape tuple).
df_jan.shape[1]

# 19

19

In [3]:
# Print the index range, column names and dtypes of the January frame.
df_jan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [4]:
# Q2 Computing duration
#
# Trip duration in minutes = (dropoff - pickup) converted to seconds / 60.
# The `.dt` accessor converts the whole timedelta column at once instead of
# calling a Python lambda for each of the ~3M rows.

df_jan['duration'] = (
    df_jan.tpep_dropoff_datetime - df_jan.tpep_pickup_datetime
).dt.total_seconds() / 60

df_jan['duration'].describe()

# 42.59 (standard deviation of the duration, in minutes)

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [5]:
# (rows, columns) of the January frame at this point in the notebook.
df_jan.shape

(3066766, 20)

In [6]:
# Q3 Dropping outliers
#
# Keep only trips whose duration is between 1 and 60 minutes (both bounds
# inclusive) and print the fraction of records that survive the filter.

num_records = len(df_jan)

# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments on it do not trigger pandas' SettingWithCopyWarning.
df_jan = df_jan[df_jan.duration.between(1, 60)].copy()

num_records_no_outliers = len(df_jan)

print(num_records_no_outliers / num_records)

# 98%

0.9812202822125979


In [7]:
# (rows, columns) of the January frame after the duration filter.
df_jan.shape

(3009173, 20)

In [8]:
# Q4 One-hot encoding
#
# Pickup and dropoff location IDs are cast to strings, turned into one dict per
# row, and fed to a DictVectorizer that is fitted on the January data.

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# astype with a {column: dtype} mapping returns a new frame, which avoids
# assigning columns in place on a frame that was produced by filtering.
df_jan = df_jan.astype({col: str for col in categorical})

dv = DictVectorizer()

df_jan_dict = df_jan[categorical].to_dict(orient='records')
X_train = dv.fit_transform(df_jan_dict)

X_train.shape

# 515 (number of columns in the feature matrix)

(3009173, 515)

In [9]:
# Q5 Training a model
#
# Fit a plain linear regression on the one-hot features with trip duration as
# the target, then report the RMSE on the training data itself.

target = 'duration'
y_train = df_jan[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE is the square root of the MSE. Taking the root explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases no longer accept.
mean_squared_error(y_train, y_pred) ** 0.5

# 7.64

7.649261027792376

In [10]:
# Q6 Evaluating the model
#
# Prepare the February data the same way as January: duration in minutes,
# 1-60 minute filter, location IDs as strings, one dict per row.

df_feb['duration'] = (
    df_feb.tpep_dropoff_datetime - df_feb.tpep_pickup_datetime
).dt.total_seconds() / 60

# .copy() so the column assignment below acts on an independent frame rather
# than on a slice of the unfiltered data.
df_feb = df_feb[df_feb.duration.between(1, 60)].copy()

df_feb[categorical] = df_feb[categorical].astype(str)

# Build the dicts from the same columns the vectorizer was fitted on. Any extra
# key (such as trip_distance) is unknown to it and would be dropped silently.
df_feb_dict = df_feb[categorical].to_dict(orient='records')



In [11]:
# Encode February with the vectorizer fitted on January, predict with the
# trained model, and report the RMSE on this validation month.
X_val = dv.transform(df_feb_dict)

y_val = df_feb[target].values

y_pred = lr.predict(X_val)

# RMSE is the square root of the MSE. Taking the root explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases no longer accept.
mean_squared_error(y_val, y_pred) ** 0.5

# 7.81

7.811832836304415