## Reading the datasets into dataframes

In [1]:
import math
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [2]:
jan = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")

In [3]:
feb = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

## Questions and answers

In [4]:
#Q1 - number of columns in a dataframe
len(jan.columns)

19

In [5]:
#Computing duration in the data frame by subtracting pickup time from drop off time and converting to minuetes
jan['duration'] = jan.tpep_dropoff_datetime - jan.tpep_pickup_datetime 
jan['duration'] = jan['duration'].apply(lambda td: td.total_seconds() / 60)

In [6]:
#Q2 - using the describe method to compute standard deviation in the dataframe
jan['duration'].describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [7]:
#drop outliers in the dataframe
df = jan.loc[(jan['duration'] >= 1) & (jan['duration'] <= 60)]

In [8]:
#Q3 -  calculate the fraction of records left after dropping outliers
diff = len(jan) - len(df)
x = diff / len(jan)
fraction_dropped = x * 100
fraction_left = 100 - fraction_dropped
print(int(fraction_left),"%")

98 %


In [12]:
#Q4 Converted into a list of dictionaries, fit a dictionary vector $ calculated the dimensionality of the feature matrix gotten
categorical = ['PULocationID', 'DOLocationID']

df[categorical] = df[categorical].astype('str')

train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer()

X_train = dv.fit_transform(train_dicts)

X_train.ndim
#checking the dimensionality of this matrix

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype('str')


2

In [20]:
#Q5 - training a model and evaluating on train
y_train = df['duration'].values

lr = LinearRegression()

lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

math.sqrt(mean_squared_error(y_train, y_pred, squared = True))

7.649262223664065

In [16]:
#Q6 - Prep

feb['duration'] = feb.tpep_dropoff_datetime - feb.tpep_pickup_datetime 
feb['duration'] = feb['duration'].apply(lambda td: td.total_seconds() / 60)

df2 = feb.loc[(feb['duration'] >= 1) & (feb['duration'] <= 60)]

categorical = ['PULocationID', 'DOLocationID']

df2[categorical] = df2[categorical].astype('str')

val_dicts = df2[categorical].to_dict(orient='records')

X_val = dv.transform(val_dicts)

y_val = df2['duration'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[categorical] = df2[categorical].astype('str')


In [19]:
# Q6 - Evaluating with test set

y_pred2 = lr.predict(X_val)

math.sqrt(mean_squared_error(y_val, y_pred2, squared = True))

7.811813876460096