In [34]:
import pandas as pd

In [18]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [36]:
df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')

In [4]:
df.shape

(1154112, 7)

In [5]:
df.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number'],
      dtype='object')

In [37]:
df['duration'] = df['dropOff_datetime'] - df['pickup_datetime'] 
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,0 days 00:17:00
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,0 days 00:17:00
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,0 days 01:50:00
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,0 days 00:08:17
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,0 days 00:15:13
...,...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266,0 days 00:08:45
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,,0 days 00:57:36
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285,0 days 00:16:12
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285,0 days 00:19:26


In [38]:
df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)
df.duration.mean()

19.1672240937939

In [39]:
df = df[(df.duration >= 1) & (df.duration <= 60)]

In [40]:
df['PUlocationID'] = df['PUlocationID'] .fillna(-1)
df['DOlocationID'] = df['DOlocationID'] .fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PUlocationID'] = df['PUlocationID'] .fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOlocationID'] = df['DOlocationID'] .fillna(-1)


In [41]:
df.PUlocationID.value_counts()

-1.0      927008
 221.0      8330
 206.0      6797
 129.0      5379
 115.0      4082
           ...  
 111.0         5
 27.0          4
 34.0          3
 2.0           2
 110.0         1
Name: PUlocationID, Length: 262, dtype: int64

In [43]:
927008/len(df)*100

83.52732770722618

In [44]:
dv = DictVectorizer()

In [45]:
df.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number',
       'duration'],
      dtype='object')

In [46]:
features = ['PUlocationID', 'DOlocationID']
df[features] = df[features].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[features] = df[features].astype(str)


In [47]:
train_dicts = df[features].to_dict(orient='records')

In [48]:
X_train = dv.fit_transform(train_dicts)
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [49]:
target = 'duration'
y_train = df[target].values
y_train

array([17.        , 17.        ,  8.28333333, ..., 16.2       ,
       19.43333333, 36.        ])

In [50]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [51]:
y_pred = lr.predict(X_train)

In [53]:
mean_squared_error(y_train, y_pred, squared=False)

10.52851910722287

In [55]:
df_val = pd.read_parquet('./data/fhv_tripdata_2021-02.parquet')
df_val['duration'] = df_val['dropOff_datetime'] - df_val['pickup_datetime']
df_val['duration'] = df_val['duration'].apply(lambda td: td.total_seconds() / 60)
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]
df_val[features] = df_val[features].astype(str)

In [56]:
df_train = df
train_dicts = df_train[features].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[features].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [57]:
y_train = df_train[target].values
y_val = df_val[target].values

In [58]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)

11.364433251334162