In [1]:
import pandas as pd
import numpy as np
from py_files.data_manager import clean_data, get_X_y
from py_files.features import distance, generate_features

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
# Load the modelling inputs through the project's data_manager helper.
# Presumably X is the feature frame and y the target — confirm in py_files/data_manager.py.
X, y = get_X_y()
# Run the project's feature-engineering step and rebind X to its result.
# The frame displayed below includes a `distance_km` column (presumably added
# by this call — confirm in py_files/features.py).
X = generate_features(X)
# Bare last expression so the notebook renders the frame.
X

Unnamed: 0,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,distance_km
0,1,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.964630,40.765602,2.208255
1,0,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,2.944199
2,1,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,8.913189
3,1,2016-04-06 19:32:31,1,-74.010040,40.719971,-74.012268,40.706718,1.721302
4,1,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.782520,1.202960
...,...,...,...,...,...,...,...,...
1458639,1,2016-04-08 13:31:04,4,-73.982201,40.745522,-73.994911,40.740170,2.008469
1458640,0,2016-01-10 07:35:15,1,-74.000946,40.747379,-73.970184,40.796547,8.887738
1458641,1,2016-04-22 06:57:41,1,-73.959129,40.768799,-74.004433,40.707371,11.867992
1458642,0,2016-01-05 15:56:26,1,-73.982079,40.749062,-73.974632,40.757107,1.722575


In [12]:
# Read in our data.
# NOTE(review): this is an absolute, machine-specific path under the user's
# Desktop; consider a configurable DATA_DIR so the notebook runs elsewhere.
train_df = pd.read_csv('~/Desktop/ACME_Projects/NYC_Taxi_Trip_Duration/data/train.csv')

# Drop the columns this analysis does not use.
train_df = train_df.drop(columns=['id', 'store_and_fwd_flag'])

# Recode vendor_id as a 0/1 flag: 2 -> 0, every other value -> 1.
# A vectorised comparison replaces the per-row Python lambda; the resulting
# values and int64 dtype are the same.
# NOTE(review): the frame displayed from get_X_y()/generate_features() earlier
# in this notebook shows the opposite 0/1 encoding for the same rows —
# confirm which encoding is intended.
train_df['vendor_id'] = train_df['vendor_id'].ne(2).astype('int64')

In [13]:
def manhattan_distance_km(df, earth_radius_km=6371.0, scale_longitude=False):
    """
    Add a 'distance_km' column with an L1 ("taxicab") pickup-to-dropoff distance.

    The distance is the sum of the absolute latitude difference and the absolute
    longitude difference (both in radians), multiplied by the Earth radius.
    The longitude difference is wrapped to the shorter way around the globe, so
    a pair such as 179.9 and -179.9 degrees counts as 0.2 degrees apart rather
    than 359.8.

    By default the longitude difference is NOT scaled by cos(latitude), so the
    east-west leg is measured as if it lay on the equator and is overstated by
    a factor of 1 / cos(latitude) anywhere else. Pass ``scale_longitude=True``
    to scale it by the cosine of the mean of the pickup and dropoff latitudes.

    Parameters:
    df (pandas.DataFrame): Frame with the columns 'pickup_longitude',
                           'pickup_latitude', 'dropoff_longitude' and
                           'dropoff_latitude', in decimal degrees.
    earth_radius_km (float): Sphere radius used to convert radians to
                             kilometers. Defaults to 6371.0.
    scale_longitude (bool): When True, multiply the longitude difference by
                            cos(mean latitude). Defaults to False, which keeps
                            the unscaled sum.

    Returns:
    pandas.DataFrame: The same object that was passed in. The 'distance_km'
                      column is written onto ``df`` in place (created or
                      overwritten); no copy is made.

    Raises:
    KeyError: If any of the four coordinate columns is missing from ``df``.
    """
    # Pull the four coordinate columns and convert degrees -> radians.
    lon1, lat1, lon2, lat2 = (
        np.radians(df[column])
        for column in (
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
        )
    )

    # North-south leg, in radians.
    delta_lat = np.abs(lat1 - lat2)

    # East-west leg, in radians, taking the shorter way around the globe.
    # For separations below 180 degrees both steps leave the value unchanged.
    full_turn = 2.0 * np.pi
    delta_lon = np.abs(lon1 - lon2) % full_turn
    delta_lon = np.minimum(delta_lon, full_turn - delta_lon)

    if scale_longitude:
        # Meridians converge away from the equator; shrink the east-west leg
        # by the cosine of the mid-latitude of the two points.
        delta_lon = delta_lon * np.cos((lat1 + lat2) / 2.0)

    # Arc length on a sphere = radius * angle (radians).
    df['distance_km'] = earth_radius_km * (delta_lat + delta_lon)

    return df

# Reference landmarks in New York City, kept as notebook-level names.
# NOTE(review): nothing in this cell passes them to manhattan_distance_km.
lat1 = 40.758896   # Times Square latitude
lon1 = -73.985130  # Times Square longitude
lat2 = 40.785091   # Central Park latitude
lon2 = -73.968285  # Central Park longitude

# Write the 'distance_km' column onto the training frame, then display it.
train_df = manhattan_distance_km(train_df)
train_df


Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,distance_km
0,0,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.964630,40.765602,455,2.208255
1,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,663,2.944199
2,0,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,2124,8.913189
3,0,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.010040,40.719971,-74.012268,40.706718,429,1.721302
4,0,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.782520,435,1.202960
...,...,...,...,...,...,...,...,...,...,...
1458639,0,2016-04-08 13:31:04,2016-04-08 13:44:02,4,-73.982201,40.745522,-73.994911,40.740170,778,2.008469
1458640,1,2016-01-10 07:35:15,2016-01-10 07:46:10,1,-74.000946,40.747379,-73.970184,40.796547,655,8.887738
1458641,0,2016-04-22 06:57:41,2016-04-22 07:10:25,1,-73.959129,40.768799,-74.004433,40.707371,764,11.867992
1458642,1,2016-01-05 15:56:26,2016-01-05 16:02:39,1,-73.982079,40.749062,-73.974632,40.757107,373,1.722575
