In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#!pip install pyarrow


In [None]:
# Load the January 2023 yellow-taxi trip file straight from its public URL.
# Reading parquet needs a parquet engine to be installed (e.g. pyarrow).
df_january2023 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
# Preview the first rows to check that the download parsed as expected.
df_january2023.head()

In [None]:
# Q1: DataFrame.shape is (rows, columns), so index 1 is the column count.
print(f'Q1.Number of columns:{df_january2023.shape[1]}')

In [None]:
# Load the February 2023 yellow-taxi trip file from the same public host.
df_february2023 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")
# Preview the first rows to check that the download parsed as expected.
df_february2023.head()

In [None]:
def add_duration_in_minutes(df):
    """Attach each trip's length in minutes to ``df``.

    The new ``duration_in_minutes`` column is the gap between
    ``tpep_dropoff_datetime`` and ``tpep_pickup_datetime``, converted from
    seconds to minutes. The column is written onto ``df`` itself, and that
    same object is returned.
    """
    elapsed = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])
    df['duration_in_minutes'] = elapsed.dt.total_seconds().div(60)
    return df

In [None]:
# Add the derived 'duration_in_minutes' column to the February frame.
df_february2023 = add_duration_in_minutes(df_february2023)
# Preview the frame, which now includes the duration column.
df_february2023.head()

In [None]:
# Add the derived 'duration_in_minutes' column to the January frame.
df_january2023 = add_duration_in_minutes(df_january2023)
# Preview the frame, which now includes the duration column.
df_january2023.head()

In [None]:
# Q2: spread of January trip durations, computed on the unfiltered frame
# (outliers are only removed in a later cell).
# Series.std() uses ddof=1 by default, i.e. the sample standard deviation.
std_trip_duration = df_january2023['duration_in_minutes'].std()
print(f'Q2. Standard Deviation of trips duration January: {std_trip_duration}')

In [None]:
def remove_outliers(df, min_minutes=1, max_minutes=60):
    """Keep only trips whose duration lies within ``[min_minutes, max_minutes]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``duration_in_minutes`` column.
    min_minutes, max_minutes : float, optional
        Inclusive lower and upper duration bounds, in minutes. The defaults
        (1 and 60) reproduce the thresholds this function originally
        hard-coded.

    Returns
    -------
    tuple
        ``(df_filtered, fraction_remaining)``. ``df_filtered`` is an
        independent copy holding the rows inside the bounds, and
        ``fraction_remaining`` is ``len(df_filtered) / len(df)``. For an
        empty ``df`` the fraction is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.

    Side effects
    ------------
    Prints the row counts before and after filtering.
    """
    total_records_before = len(df)
    # Series.between is inclusive on both ends by default, matching the
    # original ``>= 1`` and ``<= 60`` comparisons; NaN durations are excluded.
    in_range = df['duration_in_minutes'].between(min_minutes, max_minutes)
    # .copy() detaches the result from ``df`` so that later column
    # assignments on it do not trigger SettingWithCopyWarning.
    df_filtered = df[in_range].copy()
    total_records_after = len(df_filtered)
    print(total_records_before, total_records_after)
    # Guard the empty-frame case, which would otherwise divide by zero.
    if total_records_before:
        fraction_remaining = total_records_after / total_records_before
    else:
        fraction_remaining = 0.0
    return df_filtered, fraction_remaining
    

In [None]:
# Drop January trips outside the accepted duration range and keep the
# fraction of rows that survived the filter.
df_january2023_filtered, fraction_january2023 = remove_outliers(df_january2023)
print('Filtered dataset for January 2023: ', df_january2023_filtered.head())
# The fraction is scaled by 100 so it is reported as a percentage.
print('Filtered percentage of data for January 2023: ', fraction_january2023*100)

In [None]:
# Drop February trips outside the accepted duration range and keep the
# fraction of rows that survived the filter.
df_february2023_filtered, fraction_february2023 = remove_outliers(df_february2023)
# The labels below name February: this cell reports on the February frame.
print('Filtered dataset for February 2023: ', df_february2023_filtered.head())
# The fraction is scaled by 100 so it is reported as a percentage.
print('Filtered percentage of data for February 2023: ', fraction_february2023*100)

In [None]:
def vectorise_location_onehot_encoding(df):
    """One-hot encode the pickup and drop-off location IDs of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``PULocationID`` and ``DOLocationID`` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Sparse-backed frame with one integer indicator column per observed
        pickup or drop-off location, named by the encoder. Its index is
        ``df.index``, so rows stay aligned with the input by label as well
        as by position.

    Side effects
    ------------
    Prints the head of the encoded frame and the generated feature names.

    Notes
    -----
    A new encoder is fitted on every call, so the column set depends on the
    categories present in ``df``.
    """
    # Cast on a two-column copy rather than writing back through ``df.loc``,
    # which would mutate the caller's frame.
    location_data = df[['PULocationID', 'DOLocationID']].astype(str)

    # Sparse output is OneHotEncoder's default. The ``sparse=True`` keyword
    # used previously was deprecated in scikit-learn 1.2 and removed in 1.4
    # (renamed ``sparse_output``); omitting it works before and after the
    # rename.
    encoder = OneHotEncoder(dtype=int)
    encoded_locations = encoder.fit_transform(location_data)
    feature_names = encoder.get_feature_names_out()
    sparse_df = pd.DataFrame.sparse.from_spmatrix(
        encoded_locations, index=df.index, columns=feature_names
    )
    print("Feature Matrix (Sparse):")
    print(sparse_df.head())
    print("\nFeature Names:")
    print(feature_names)
    return sparse_df

In [None]:
# Build the one-hot feature matrix for the filtered January trips.
df_january2023_encoded = vectorise_location_onehot_encoding(df_january2023_filtered)
# Preview the encoded features.
df_january2023_encoded.head()

In [None]:
def train_and_evaluate_model(feature_df, target_df):
    """Fit a plain linear regression and score it on its own training data.

    Parameters
    ----------
    feature_df
        Feature matrix passed to ``LinearRegression.fit`` and ``predict``.
    target_df
        Target values the model is fitted against.

    Returns
    -------
    tuple
        ``(model, rmse)``: the fitted estimator and the root mean squared
        error of its predictions on ``feature_df``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = LinearRegression().fit(feature_df, target_df)
    fitted_values = model.predict(feature_df)
    training_mse = mean_squared_error(target_df, fitted_values)
    return model, np.sqrt(training_mse)

In [None]:
# Target vector: trip duration from the same filtered January frame that the
# feature matrix was encoded from, so rows match by position.
target_january2023 = df_january2023_filtered["duration_in_minutes"]
# Fit on January; the RMSE returned is measured on these same training rows.
model, rmse = train_and_evaluate_model(df_january2023_encoded, target_january2023)
print(f'RMSE of the model on the training data: {rmse}')

In [None]:
# Build the one-hot feature matrix for the filtered February trips.
# NOTE(review): the helper fits a fresh encoder on each call, so these columns
# are derived from February's locations and need not match January's — confirm
# this is intended before comparing against the January model.
df_february2023_encoded = vectorise_location_onehot_encoding(df_february2023_filtered)
# Preview the encoded features.
df_february2023_encoded.head()

In [None]:
# Target vector: trip duration from the filtered February frame.
target_february_2023 = df_february2023_filtered["duration_in_minutes"]
# NOTE(review): this fits a second model on February and rebinds `model` and
# `rmse`, replacing the January results. The value printed is a training error
# on February, not a validation error of the January model — confirm intent.
model, rmse = train_and_evaluate_model(df_february2023_encoded, target_february_2023)
print(f'RMSE of the model on the training data: {rmse}')