In [1]:
!pip install pyarrow



In [None]:
# data file form here: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

## libraries

In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [3]:
pd.__version__

'1.4.2'

In [8]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet')

In [9]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.70,1.00,0.5,0.00,0.00,1.0,22.70,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.80,1.0,N,140,236,1,10.00,3.50,0.5,3.75,0.00,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.70,1.0,N,236,79,1,23.30,3.50,0.5,3.00,0.00,1.0,31.30,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.40,1.0,N,79,211,1,10.00,3.50,0.5,2.00,0.00,1.0,17.00,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.80,1.0,N,211,148,1,7.90,3.50,0.5,3.20,0.00,1.0,16.10,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2964619,2,2024-01-31 23:45:59,2024-01-31 23:54:36,,3.18,,,107,263,0,15.77,0.00,0.5,2.00,0.00,1.0,21.77,,
2964620,1,2024-01-31 23:13:07,2024-01-31 23:27:52,,4.00,,,114,236,0,18.40,1.00,0.5,2.34,0.00,1.0,25.74,,
2964621,2,2024-01-31 23:19:00,2024-01-31 23:38:00,,3.33,,,211,25,0,19.97,0.00,0.5,0.00,0.00,1.0,23.97,,
2964622,2,2024-01-31 23:07:23,2024-01-31 23:25:14,,3.06,,,107,13,0,23.88,0.00,0.5,5.58,0.00,1.0,33.46,,


In [5]:
def download_data(url, filename):
    df = pd.read_parquet(url)
    df.to_parquet(filename)
    return df

def load_data(filename):
    return pd.read_parquet(filename)

def compute_duration(df):
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    return df

def filter_outliers(df, min_duration=1, max_duration=60):
    return df[(df['duration'] >= min_duration) & (df['duration'] <= max_duration)]

def one_hot_encode(df, columns):
    df.loc[:, columns] = df[columns].astype(str)  # Use .loc to avoid SettingWithCopyWarning
    data_dicts = df[columns].to_dict(orient='records')
    dv = DictVectorizer()
    X = dv.fit_transform(data_dicts)
    return X, dv

def train_model(X, y):
    model = LinearRegression()
    model.fit(X, y)
    return model

def calculate_rmse(model, X, y):
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return rmse

def main():
    # URLs for the datasets
    url_january = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet"
    url_february = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet"

    # Filenames for local storage
    filename_january = "yellow_tripdata_2021-01.parquet"
    filename_february = "yellow_tripdata_2021-02.parquet"

    # Download and load the data
    df_january = download_data(url_january, filename_january)
    df_february = download_data(url_february, filename_february)

    # Question 1
    num_columns_january = df_january.shape[1]
    print(f"Number of columns in January data: {num_columns_january}")

    # Question 2
    df_january = compute_duration(df_january)
    std_duration = df_january['duration'].std()
    print(f"Standard deviation of trip duration in January: {std_duration:.2f}")

    # Question 3
    df_january_filtered = filter_outliers(df_january)
    fraction_remaining = len(df_january_filtered) / len(df_january)
    print(f"Fraction of records remaining after filtering outliers: {fraction_remaining:.2%}")

    # Question 4
    columns_to_encode = ['PULocationID', 'DOLocationID']
    X_january, dv = one_hot_encode(df_january_filtered, columns_to_encode)
    dimensionality = X_january.shape[1]
    print(f"Dimensionality of the feature matrix: {dimensionality}")

    # Question 5
    y_january = df_january_filtered['duration'].values
    model = train_model(X_january, y_january)
    rmse_train = calculate_rmse(model, X_january, y_january)
    print(f"RMSE on training data: {rmse_train:.2f}")

    # Question 6
    df_february = compute_duration(df_february)
    df_february_filtered = filter_outliers(df_february)
    X_february = dv.transform(df_february_filtered[columns_to_encode].to_dict(orient='records'))
    y_february = df_february_filtered['duration'].values
    rmse_val = calculate_rmse(model, X_february, y_february)
    print(f"RMSE on validation data: {rmse_val:.2f}")

if __name__ == "__main__":
    main()


Number of columns in January data: 19
Standard deviation of trip duration in January: 131.20
Fraction of records remaining after filtering outliers: 98.06%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Dimensionality of the feature matrix: 518
RMSE on training data: 6.85
RMSE on validation data: 14.07
