In [1]:
import csv
import math
import time
from collections import defaultdict


In [2]:
# Path of the input CSV; each task cell opens and re-reads this file from the start.
file_path = "taxi-data-sorted-small.csv"  

- Task 1: I will clean up the data before the main processing, because a line might not include all of the fields. If a data line is not correctly formatted, I will drop that line and not consider it.

- With the cleaned data, I will compute the top ten taxis that have had the largest number of distinct drivers.

In [4]:
# Task 1: collect the distinct drivers seen for every taxi, then report the
# ten taxis with the most distinct drivers together with the elapsed time.
start = time.time()

total_lines = 0
bad_lines = 0
taxi_drivers = defaultdict(set)  # taxi id -> set of driver ids seen with it

with open(file_path, "r", encoding="utf-8", errors="replace") as f:
    reader = csv.reader(f)
    for total_lines, row in enumerate(reader, start=1):
        # Drop rows with the wrong field count as well as the header row.
        if len(row) != 17 or row[0].strip().lower() == "medallion":
            bad_lines += 1
            continue

        taxi = row[0].strip()
        driver = row[1].strip()

        # Both identifiers are required to attribute a driver to a taxi.
        if not (taxi and driver):
            bad_lines += 1
            continue

        # The set ignores a driver that is already recorded for this taxi.
        taxi_drivers[taxi].add(driver)

# One (taxi, distinct-driver count) pair per taxi, largest counts first.
pairs = [(taxi, len(drivers)) for taxi, drivers in taxi_drivers.items()]
pairs.sort(key=lambda pair: pair[1], reverse=True)

top10 = pairs[:10]

end = time.time()

print("Top 10 taxis with most unique drivers:")
for entry in top10:
    print(entry[0], entry[1])

print("\nTotal lines read:", total_lines)
print("Bad/skipped lines:", bad_lines)
print("Processing time: ", end - start)


Top 10 taxis with most unique drivers:
3C08296D0EB7ABE24FB7328DE9B62813 20
65EFB7D02BAD12D5DE757CB5D350944E 20
CD7B02776E69483397952DC5E1F44DFE 19
799153A138F4E8334A1A95AE25040B83 19
55D311AD2752BC278BEF7386B25B28A9 19
3B6AE3CF05F34ADC91DC68D20F2EB913 19
F36564AB9C6EA3B6373EB0E1680A447A 19
7DEB25123AE57111F912C0EBF92F1F63 19
6B15D153B49701AD86A2E62468990B73 18
F2A08960199BCDB7EE19411A8E7A4C5D 18

Total lines read: 1999999
Bad/skipped lines: 0
Processing time:  11.43187689781189


Task 1 summary: I worked with the NYC taxi dataset to find the 10 taxis with the highest number of unique drivers. The data was read line by line in Python, and basic cleaning was done by skipping rows that were incorrectly formatted, were header rows, or had missing values. For each valid entry, I stored the drivers associated with each taxi in a set so that duplicate drivers were not counted multiple times. After going through the dataset, the taxis were sorted by how many different drivers they had, and the top 10 were selected. I also measured the total execution time to understand how long the analysis took.

- task 2

Money per minute is defined as

$$\text{Money per minute} = \frac{\text{Total amount earned}}{\text{Trip time in minutes}}$$

The goal is to find the top 10 drivers in terms of their average money earned per minute spent carrying a customer.


In [5]:
# Task 2: rank drivers by money earned per minute of trip time.
# For every driver the total amount and the total trip time are summed over
# all valid rows; the rate is (summed amount) / (summed trip time in minutes).
# perf_counter() is used because it is a monotonic clock meant for intervals.
start = time.perf_counter()

driver_money = defaultdict(float)  # driver id -> summed total amount
driver_time = defaultdict(float)   # driver id -> summed trip time in seconds

bad_lines = 0
total_lines = 0

with open(file_path, "r", encoding="utf-8", errors="replace") as f:
    reader = csv.reader(f)
    for row in reader:
        total_lines += 1

        # basic checks: wrong field count, or the header row
        if len(row) != 17:
            bad_lines += 1
            continue

        if row[0].strip().lower() == "medallion":
            bad_lines += 1
            continue

        driver_id = row[1].strip()

        # A trip without a driver id cannot be attributed to anyone; without
        # this check such rows would be pooled under the "" key.
        if driver_id == "":
            bad_lines += 1
            continue

        # Only a failed numeric parse marks the row as bad. A bare `except:`
        # would also swallow KeyboardInterrupt and hide unrelated errors.
        try:
            trip_time_secs = float(row[4])
            total_amount = float(row[16])
        except ValueError:
            bad_lines += 1
            continue

        # float() happily parses "nan" and "inf"; a single such value would
        # corrupt the driver's running totals, so treat it as a bad row.
        if not (math.isfinite(trip_time_secs) and math.isfinite(total_amount)):
            bad_lines += 1
            continue

        # Non-positive trip times cannot yield a meaningful per-minute rate.
        if trip_time_secs <= 0:
            bad_lines += 1
            continue

        # accumulate totals
        driver_money[driver_id] += total_amount
        driver_time[driver_id] += trip_time_secs

# compute money per minute
# Every driver present here has strictly positive summed trip time, so the
# division cannot hit zero.
# NOTE(review): drivers whose recorded trip time totals only a few seconds can
# dominate this ranking; consider a minimum-time threshold if that is unwanted.
driver_rates = []
for driver in driver_money:
    minutes = driver_time[driver] / 60
    rate = driver_money[driver] / minutes
    driver_rates.append((driver, rate))

# sort and get top 10
driver_rates = sorted(driver_rates, key=lambda x: x[1], reverse=True)
top10_drivers = driver_rates[:10]

end = time.perf_counter()

print("Top 10 drivers by money earned per minute:")
for driver, rate in top10_drivers:
    print(driver, round(rate, 2))

print("\nTotal lines read:", total_lines)
print("Bad/skipped lines:", bad_lines)
print("Processing time (seconds):", end - start)


Top 10 drivers by money earned per minute:
57D463B8F4C3382081F206E6869AA095 3780.0
69B6FBD28F84175AB1504F6BFF001A49 2400.0
0838C4C7DDFD9391AD66E316B5608B26 1815.0
30B2ACBAF002305533FF0D31D34A7C06 702.0
4C3B2A31227663A59E1AA7B45157160D 625.0
A0AC85170C37E1D076ADE05B0238C58E 540.0
08026D69508127F4DE855678ABCE7E0A 375.0
D3B2DEC5DB78D91D9AFADA53B3B521B5 330.0
6E1D7195E38AA7A36B375C1C60AD8632 317.31
7826BDE4CE3E44EE1BB7B926A38A4B2A 279.86

Total lines read: 1999999
Bad/skipped lines: 8920
Processing time (seconds): 14.024472713470459


- TASK 3 


$$\text{Profit ratio} = \frac{\text{Surcharge amount (US dollars)}}{\text{Travel distance (miles)}}$$

We are interested in finding the hour of the day that has the highest profit ratio.

In [6]:
# Task 3: find the pickup hour with the highest profit ratio, where the ratio
# for an hour is (summed surcharge) / (summed trip distance) over valid rows.
# perf_counter() is used because it is a monotonic clock meant for intervals.
start = time.perf_counter()

hour_surcharge = defaultdict(float)  # hour -> summed surcharge
hour_distance = defaultdict(float)   # hour -> summed trip distance

bad_lines = 0
total_lines = 0

with open(file_path, "r", encoding="utf-8", errors="replace") as f:
    reader = csv.reader(f)
    for row in reader:
        total_lines += 1

        # basic checks: wrong field count, or the header row
        if len(row) != 17:
            bad_lines += 1
            continue
        if row[0].strip().lower() == "medallion":
            bad_lines += 1
            continue

        pickup_time = row[2].strip()
        # Characters 11-12 of the pickup timestamp are read as the hour
        # (presumably "YYYY-MM-DD HH:MM:SS" -- confirm against the data).
        # Only a failed parse marks the row as bad; a bare `except:` would
        # also swallow KeyboardInterrupt and hide unrelated errors.
        try:
            hour = int(pickup_time[11:13])
            surcharge = float(row[12])
            distance = float(row[5])
        except ValueError:
            bad_lines += 1
            continue

        # A parsed value outside 0-23 means the timestamp was malformed.
        if not 0 <= hour <= 23:
            bad_lines += 1
            continue

        # float() happily parses "nan" and "inf"; a single such value would
        # corrupt that hour's running totals, so treat it as a bad row.
        if not (math.isfinite(surcharge) and math.isfinite(distance)):
            bad_lines += 1
            continue

        # Non-positive distances cannot yield a meaningful per-mile ratio.
        if distance <= 0:
            bad_lines += 1
            continue

        hour_surcharge[hour] += surcharge
        hour_distance[hour] += distance

# Every hour present here has strictly positive summed distance, so the
# division cannot hit zero.
hour_ratio = {}
for h in hour_surcharge:
    hour_ratio[h] = hour_surcharge[h] / hour_distance[h]

# finding the best hour; both stay None when no valid row was found, instead
# of max() raising ValueError on an empty dict
best_hour = max(hour_ratio, key=hour_ratio.get, default=None)
best_ratio = hour_ratio.get(best_hour)

end = time.perf_counter()

if best_hour is None:
    print("No valid rows found; the best hour cannot be determined.")
else:
    print("Best hour to work (0-23):", best_hour)
    print("Best profit ratio (surcharge/distance):", round(best_ratio, 6))

print("\nTotal lines read:", total_lines)
print("Bad/skipped lines:", bad_lines)
print("Processing time (seconds):", end - start)


Best hour to work (0-23): 19
Best profit ratio (surcharge/distance): 0.279797

Total lines read: 1999999
Bad/skipped lines: 9729
Processing time (seconds): 12.32653546333313


- The results show that hour 19 (the hour starting at 7:00 PM) has the highest profit ratio, about 0.28 dollars of surcharge per mile, indicating that evening hours are the most profitable for taxi drivers in terms of surcharge earned per mile. The total processing time was about 12.33 seconds.