# In [3]:
import pandas as pd
from pymongo import MongoClient

# Input file and the only columns this analysis uses.
csv_file_path = "yellow_tripdata_2016-01.csv"
columns_to_keep = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'dropoff_longitude', 'dropoff_latitude']

# Centre of the drop-off bounding box (Crate and Barrel) and its half-width,
# expressed in the same units as the coordinate columns.
crate_and_barrel_latitude = 40.7258
crate_and_barrel_longitude = -74.0047
threshold = 0.001

# Duration cut-offs, in seconds, for the "short" and "long" trip subsets.
# NOTE(review): 3660 s is 61 minutes; confirm that 3600 (one hour) was not intended.
short_trip_max_seconds = 600
long_trip_min_seconds = 3660


def load_trips(path, columns=None):
    """Read the trip CSV and return the kept columns plus ``trip_duration``.

    Args:
        path: Location of the CSV file to read.
        columns: Column names to load; defaults to ``columns_to_keep``. Must
            include ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``.

    Returns:
        A DataFrame holding ``columns`` in the requested order, with both
        timestamp columns converted to datetimes, followed by
        ``trip_duration`` (drop-off minus pick-up, in seconds).

    Raises:
        ValueError: raised by ``pandas.read_csv`` when a requested column is
            not present in the file.
    """
    columns = list(columns_to_keep if columns is None else columns)

    # usecols stops pandas from parsing the columns that would be thrown away
    # immediately afterwards, which matters for a multi-gigabyte input file.
    trips = pd.read_csv(path, usecols=columns)

    trips['tpep_pickup_datetime'] = pd.to_datetime(trips['tpep_pickup_datetime'])
    trips['tpep_dropoff_datetime'] = pd.to_datetime(trips['tpep_dropoff_datetime'])
    trips['trip_duration'] = (
        trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    ).dt.total_seconds()

    # usecols keeps the file's column order; restore the requested order so
    # the exported CSVs have a stable layout.
    return trips.reindex(columns=columns + ['trip_duration'])


def trips_near(trips, latitude, longitude, half_width):
    """Return the trips whose drop-off lies inside a box around a point.

    Args:
        trips: DataFrame with ``dropoff_latitude`` and ``dropoff_longitude``.
        latitude: Latitude of the box centre.
        longitude: Longitude of the box centre.
        half_width: Distance from the centre to each edge of the box, in the
            same units as the coordinates. Both edges are inclusive.

    Returns:
        An independent copy of the matching rows, so the caller can add
        columns to it without touching ``trips`` and without triggering
        pandas' SettingWithCopyWarning.
    """
    latitude_ok = trips['dropoff_latitude'].between(latitude - half_width, latitude + half_width)
    longitude_ok = trips['dropoff_longitude'].between(longitude - half_width, longitude + half_width)
    return trips[latitude_ok & longitude_ok].copy()


def count_trips_by_hour(end_hours):
    """Count trips per hour value.

    Args:
        end_hours: Series of hour values, one entry per trip.

    Returns:
        A DataFrame with columns ``hour`` and ``number_of_trips``, sorted by
        ascending hour. Hours that never occur are not listed.
    """
    counts = end_hours.value_counts().sort_index()
    # Naming the index and the values explicitly gives the same column names
    # regardless of how the installed pandas labels value_counts() output.
    return counts.rename_axis('hour').reset_index(name='number_of_trips')


if __name__ == "__main__":
    # Read and preprocess the data
    df = load_trips(csv_file_path)

    # Filter trips ending near Crate and Barrel
    filtered_df = trips_near(df, crate_and_barrel_latitude, crate_and_barrel_longitude, threshold)

    # Calculate short and long trips. These are taken before end_hour is
    # added, so their exports do not contain that column.
    short_trips = filtered_df[filtered_df['trip_duration'] < short_trip_max_seconds]
    long_trips = filtered_df[filtered_df['trip_duration'] > long_trip_min_seconds]

    # Calculate trips by hour of the day
    filtered_df['end_hour'] = filtered_df['tpep_dropoff_datetime'].dt.hour
    trips_by_hour = count_trips_by_hour(filtered_df['end_hour'])

    # # Connect to MongoDB
    # client = MongoClient('mongodb+srv://hdghgdhgdhhhd.mongodb.net/')
    # db = client['upliance_data']

    # # Insert Preprocessed data into MongoDB or Export Locally
    # db['filtered_trips'].insert_many(filtered_df.to_dict('records'))
    # db['short_trips'].insert_many(short_trips.to_dict('records'))
    # db['long_trips'].insert_many(long_trips.to_dict('records'))
    # db['trips_by_hour'].insert_many(trips_by_hour.to_dict('records'))

    # Export Preprocessed data Locally
    filtered_df.to_csv("filtered_df.csv", index=False)
    short_trips.to_csv("short_trips.csv", index=False)
    long_trips.to_csv("long_trips.csv", index=False)
    trips_by_hour.to_csv("trips_by_hour.csv", index=False)


