In [1]:
import pandas as pd
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
pd.__version__

'1.4.2'

In [3]:
sklearn.__version__

'1.0.2'

In [4]:
Jan_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [5]:
Jan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [6]:
Feb_df = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

In [7]:
# Question 1
col_count = Jan_df.shape[1]
print(f'There are {col_count} columns in January dataset.')

There are 19 columns in January dataset.


In [8]:
# Question 2
# find the difference and convert it into minutes
Jan_df['duration'] = (Jan_df.tpep_dropoff_datetime - Jan_df.tpep_pickup_datetime).dt.total_seconds()/60
print('The statistics for the durations are:')
Jan_df.duration.describe()

The statistics for the durations are:


count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [9]:
# Question 3
original_count = Jan_df.shape[0]
drop_outliers_Jan_df = Jan_df[(Jan_df['duration'] >=1) & (Jan_df['duration'] <=60)]
new_count = drop_outliers_Jan_df.shape[0]
print(f'{new_count*100/original_count:.2f}% of the records left after dropping the outliers.')

98.12% of the records left after dropping the outliers.


In [10]:
# Question 4
select_feature_categorical = ['PULocationID', 'DOLocationID']

# 4.1: convert the categorical int dtype into str
drop_outliers_Jan_df[select_feature_categorical] = drop_outliers_Jan_df[select_feature_categorical].astype(str)
# 4.2: turn dataframe to a list of dictionary
train_dicts = drop_outliers_Jan_df[select_feature_categorical].to_dict(orient='records')
# 4.3: fit the dictionary vectorizer
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop_outliers_Jan_df[select_feature_categorical] = drop_outliers_Jan_df[select_feature_categorical].astype(str)


In [11]:
type(X_train)

scipy.sparse.csr.csr_matrix

In [12]:
# 4.4 get the dimension of the matrix
matrix_dim = X_train.shape
print(f'The dimension of the matrtx is: {matrix_dim}')

The dimension of the matrtx is: (3009173, 515)


In [13]:
drop_outliers_Jan_df[select_feature_categorical].nunique()

PULocationID    255
DOLocationID    260
dtype: int64

In [14]:
drop_outliers_Jan_df.shape

(3009173, 20)

In [15]:
# Question 5
target = 'duration'
y_train = drop_outliers_Jan_df[target].values

# fit the linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [16]:
y_train_pred = lr.predict(X_train)
train_rmse = mean_squared_error(y_train_pred, y_train, squared=False)
print(f'RMSE on trainining dataset: {train_rmse}.')

RMSE on trainining dataset: 7.649261027919939.


In [17]:
# Question 6: write the whole data processing steps into one function. 
def data_processing(df):
    # Step 1: create the target column and convert it into minutes
    df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds()/60
#     print(df.duration.describe())
#     print()

    # Step 2: drop outliers and calculate the percentage left after dropping
    original_count = df.shape[0]
    drop_outliers_df = df[(df['duration'] >=1) & (df['duration'] <=60)]
    new_count = drop_outliers_df.shape[0]
#     print(f'{new_count*100/original_count:.2f}% of the records left after dropping the outliers.')
#     print()

    # Step 3: select categorical features and convert the data type
    select_feature_categorical = ['PULocationID', 'DOLocationID']
    drop_outliers_df[select_feature_categorical] = drop_outliers_df[select_feature_categorical].astype(str)

    return drop_outliers_df

In [18]:
new_Feb_df = data_processing(Feb_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop_outliers_df[select_feature_categorical] = drop_outliers_df[select_feature_categorical].astype(str)


In [19]:
# apply the trained model on the evaluation dataset
eval_dicts = new_Feb_df[select_feature_categorical].to_dict(orient='records')
# fit the dictionary vectorizer
X_eval = dv.transform(eval_dicts) # for this step, we should use transform instead of fit_transform
y_eval = new_Feb_df[target].values

y_pred = lr.predict(X_eval)
eval_rmse = mean_squared_error(y_pred, y_eval, squared=False)
print(f'RMSE on evaluation dataset is: {eval_rmse}')

RMSE on evaluation dataset is: 7.811832638273232
