<a href="https://colab.research.google.com/github/kartik7702/Practice-Projects/blob/main/Mapup_Task_2_(Q1_Q5).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np


In [None]:
# Absolute path of the input CSV.
# NOTE(review): '/content' is presumably the Colab working directory — adjust when running elsewhere.
file_path = '/content/dataset-3.csv'

# Load the CSV into a DataFrame; later cells refer to it as `df`
df = pd.read_csv(file_path)

In [None]:
# Display the loaded DataFrame
df

Unnamed: 0,id_start,id_end,distance
0,1001400,1001402,9.7
1,1001402,1001404,20.2
2,1001404,1001406,16.0
3,1001406,1001408,21.7
4,1001408,1001410,11.1
5,1001410,1001412,15.6
6,1001412,1001414,18.2
7,1001414,1001416,13.2
8,1001416,1001418,13.6
9,1001418,1001420,12.9


In [None]:
# Summarize the columns, their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id_start  44 non-null     int64  
 1   id_end    44 non-null     int64  
 2   distance  44 non-null     float64
dtypes: float64(1), int64(2)
memory usage: 1.2 KB


# Question 1: Distance Matrix Calculation

In [None]:
def calculate_distance_matrix(df):
    """
    Build a symmetric all-pairs distance matrix from an edge list.

    Every row of ``df`` is treated as an undirected edge between ``id_start``
    and ``id_end`` whose length is ``distance``. The distance between two IDs
    that are not directly connected is the length of the shortest path
    between them, computed with the Floyd-Warshall algorithm.

    Args:
        df (pandas.DataFrame): Edge list with columns 'id_start', 'id_end'
            and 'distance'.

    Returns:
        pandas.DataFrame: Square matrix whose index and columns are the sorted
            unique IDs. Diagonal entries are 0, and pairs with no connecting
            path are also reported as 0.
    """
    # Map every ID to a row/column position so the work can be done in NumPy
    node_ids = sorted(set(df['id_start']) | set(df['id_end']))
    position = {node_id: idx for idx, node_id in enumerate(node_ids)}
    size = len(node_ids)

    # "Unknown" is represented by infinity while relaxing; a zero here would
    # be indistinguishable from a genuine zero-length path
    dist = np.full((size, size), np.inf)
    np.fill_diagonal(dist, 0.0)

    # Seed with the direct edges (both directions). If the same pair is
    # listed more than once, keep the shortest edge.
    for start, end, length in zip(df['id_start'], df['id_end'], df['distance']):
        i, j = position[start], position[end]
        if length < dist[i, j]:
            dist[i, j] = dist[j, i] = length

    # Floyd-Warshall: the intermediate node k must be the OUTER loop so that
    # every pair is relaxed against paths through nodes 0..k
    for k in range(size):
        np.minimum(dist, dist[:, k, None] + dist[None, k, :], out=dist)

    # Report unreachable pairs as 0
    dist[np.isinf(dist)] = 0.0

    return pd.DataFrame(dist, index=node_ids, columns=node_ids)

In [None]:
# Call the function
result_df = calculate_distance_matrix(df)

# Print the result
print(result_df)

         1001400  1001402  1001404  1001406  1001408  1001410  1001412  \
1001400      0.0      9.7     29.9     45.9     67.6     78.7     94.3   
1001402      9.7      0.0     20.2     55.6     77.3     88.4    104.0   
1001404     29.9     20.2      0.0     16.0     97.5    108.6    124.2   
1001406     45.9     55.6     16.0      0.0     21.7    124.6    140.2   
1001408     67.6     77.3     97.5     21.7      0.0     11.1    161.9   
1001410     78.7     88.4    108.6    124.6     11.1      0.0     15.6   
1001412     94.3    104.0    124.2    140.2    161.9     15.6      0.0   
1001414    112.5    122.2    142.4    158.4    180.1    191.2     18.2   
1001416    125.7    135.4    155.6    171.6    193.3    204.4    220.0   
1001418    139.3    149.0    169.2    185.2    206.9    218.0    233.6   
1001420    152.2    161.9    182.1    198.1    219.8    230.9    246.5   
1001422    161.8    171.5    191.7    207.7    229.4    240.5    256.1   
1001424    173.2    182.9    203.1    

# Question 2: Unroll Distance Matrix

In [None]:
def unroll_distance_matrix(df) -> pd.DataFrame:
    """
    Unroll a square distance matrix into one row per ordered pair of IDs.

    Args:
        df (pandas.DataFrame): Distance matrix; the index holds the start IDs
            and the columns hold the end IDs.

    Returns:
        pandas.DataFrame: Columns 'id_start', 'id_end' and 'distance', with one
            row for every (index, column) pair whose labels differ. Pairs are
            emitted in index order, then column order. An empty matrix yields
            an empty frame that still has the three columns.
    """
    output_columns = ['id_start', 'id_end', 'distance']

    # Collect all off-diagonal cells as records and build the frame once
    records = [
        {'id_start': start, 'id_end': end, 'distance': df.at[start, end]}
        for start in df.index
        for end in df.columns
        if start != end
    ]

    # Passing `columns` keeps the schema stable even when there are no records
    return pd.DataFrame(records, columns=output_columns)


In [None]:
# Build the distance matrix, then unroll it into (id_start, id_end, distance) rows.
# NOTE(review): this rebinds `df` to the unrolled table, so the cells below that use `df`
# see the unrolled data, and re-running this cell feeds the already-unrolled frame back
# into calculate_distance_matrix.
result_matrix = calculate_distance_matrix(df)
df = unroll_distance_matrix(result_matrix)
print(df)

      id_start   id_end  distance
0      1001400  1001402       9.7
1      1001400  1001404      29.9
2      1001400  1001406      45.9
3      1001400  1001408      67.6
4      1001400  1001410      78.7
...        ...      ...       ...
1801   1004356  1001470     159.8
1802   1004356  1001472     175.8
1803   1004356  1001488       4.0
1804   1004356  1004354       2.0
1805   1004356  1004355       5.4

[1806 rows x 3 columns]


# Question 3: Finding IDs within Percentage Threshold

In [None]:
def find_ids_within_ten_percentage_threshold(df, reference_id) -> pd.DataFrame:
    """
    Find all IDs whose average distance lies within 10% of the reference ID's average.

    The average distance of an ID is the mean of 'distance' over all rows
    where 'id_start' equals that ID. Both ends of the +/-10% band are
    inclusive, and the reference ID itself is part of the result whenever it
    is present in ``df``.

    Args:
        df (pandas.DataFrame): Unrolled distances with columns 'id_start' and
            'distance'.
        reference_id (int): The 'id_start' value whose average distance
            defines the band.

    Returns:
        pandas.DataFrame: Columns 'id_start' and 'average_distance', sorted by
            'id_start', one row per qualifying ID. Empty (with the same
            columns) when ``reference_id`` does not occur in 'id_start'.
    """
    output_columns = ['id_start', 'average_distance']

    # Mean distance per start ID; groupby returns the IDs in sorted order
    average_by_id = df.groupby('id_start')['distance'].mean()

    # Without a reference average there is no band to compare against
    if reference_id not in average_by_id.index:
        return pd.DataFrame(columns=output_columns)

    reference_average = average_by_id.loc[reference_id]

    # Half-width of the band: 10% of the reference average
    tolerance = 0.1 * abs(reference_average)

    # Compare each ID's AVERAGE (not individual rows) against the band
    in_band = average_by_id.between(reference_average - tolerance,
                                    reference_average + tolerance)

    return (
        average_by_id[in_band]
        .sort_index()
        .rename('average_distance')
        .reset_index()
    )



In [None]:
# Call the function
reference_id = 1
ten_percentage_threshold = find_ids_within_ten_percentage_threshold(df, reference_id)
print(ten_percentage_threshold)

      id_start   id_end  distance
0      1001400  1001402       9.7
1      1001400  1001404      29.9
2      1001400  1001406      45.9
3      1001400  1001408      67.6
4      1001400  1001410      78.7
...        ...      ...       ...
1801   1004356  1001470     159.8
1802   1004356  1001472     175.8
1803   1004356  1001488       4.0
1804   1004356  1004354       2.0
1805   1004356  1004355       5.4

[1806 rows x 3 columns]


# Question 4: Calculate Toll Rate

In [None]:
def calculate_toll_rate(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add one toll column per vehicle type, computed as distance times a fixed coefficient.

    The columns 'moto', 'car', 'rv', 'bus' and 'truck' are written directly
    onto the frame that is passed in, and that same frame is returned.

    Args:
        df (pandas.DataFrame): Frame with a numeric 'distance' column.

    Returns:
        pandas.DataFrame: The input frame, now carrying the five toll columns.
    """
    # Per-vehicle multiplier applied to the distance
    coefficient_by_vehicle = dict(moto=0.8, car=1.2, rv=1.5, bus=2.2, truck=3.6)

    distances = df['distance']
    for vehicle in coefficient_by_vehicle:
        # Column is added (or overwritten) on the caller's frame
        df[vehicle] = distances * coefficient_by_vehicle[vehicle]

    return df


In [None]:
# Call the function
df_with_toll_rates = calculate_toll_rate(df)
print(df_with_toll_rates)

      id_start   id_end  distance    moto     car      rv     bus   truck
0      1001400  1001402       9.7    7.76   11.64   14.55   21.34   34.92
1      1001400  1001404      29.9   23.92   35.88   44.85   65.78  107.64
2      1001400  1001406      45.9   36.72   55.08   68.85  100.98  165.24
3      1001400  1001408      67.6   54.08   81.12  101.40  148.72  243.36
4      1001400  1001410      78.7   62.96   94.44  118.05  173.14  283.32
...        ...      ...       ...     ...     ...     ...     ...     ...
1801   1004356  1001470     159.8  127.84  191.76  239.70  351.56  575.28
1802   1004356  1001472     175.8  140.64  210.96  263.70  386.76  632.88
1803   1004356  1001488       4.0    3.20    4.80    6.00    8.80   14.40
1804   1004356  1004354       2.0    1.60    2.40    3.00    4.40    7.20
1805   1004356  1004355       5.4    4.32    6.48    8.10   11.88   19.44

[1806 rows x 8 columns]


# Question 5: Calculate Time-Based Toll Rates

In [None]:
from datetime import time, timedelta

In [None]:
# Inspect the columns available before applying time-based rates.
# NOTE(review): the output below lists start_time/end_time/start_day/end_day, but no cell
# visible in this notebook view creates those columns — confirm where they come from.
ten_percentage_threshold.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1806 entries, 0 to 1805
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id_start    1806 non-null   int64  
 1   id_end      1806 non-null   int64  
 2   distance    1806 non-null   float64
 3   moto        1806 non-null   float64
 4   car         1806 non-null   float64
 5   rv          1806 non-null   float64
 6   bus         1806 non-null   float64
 7   truck       1806 non-null   float64
 8   start_time  1806 non-null   object 
 9   end_time    1806 non-null   object 
 10  start_day   1806 non-null   object 
 11  end_day     1806 non-null   object 
dtypes: float64(6), int64(2), object(4)
memory usage: 169.4+ KB


In [None]:
import pandas as pd
from datetime import time, datetime, timedelta

def calculate_time_based_toll_rates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Scale the 'distance' column by a time-of-day factor and return a reordered copy.

    A row receives a window's factor only when its 'start_time' is at or after
    the window start AND its 'end_time' is at or before the window end. Rows
    that span more than one window keep their distance unchanged. The input
    frame is left untouched, so calling this repeatedly on the same frame
    gives the same result.

    NOTE(review): only 'distance' is scaled; the vehicle columns ('moto',
    'car', 'rv', 'bus', 'truck') are passed through as-is — confirm this is
    the intended target of the factors.
    NOTE(review): window bounds are inclusive on both sides, so a row whose
    start and end both sit exactly on a shared boundary (e.g. 10:00) matches
    two windows and gets both factors — confirm the intended boundary rule.

    Args:
        df (pandas.DataFrame): Frame with columns 'id_start', 'id_end',
            'distance', 'start_day', 'start_time', 'end_day', 'end_time',
            'moto', 'car', 'rv', 'bus' and 'truck'. 'start_time' and
            'end_time' must be comparable with ``datetime.time`` values.

    Returns:
        pandas.DataFrame: New frame with the columns in the order listed
            above and 'distance' scaled and rounded to one decimal place.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    output_columns = ['id_start', 'id_end', 'distance', 'start_day', 'start_time',
                      'end_day', 'end_time', 'moto', 'car', 'rv', 'bus', 'truck']

    # Fail early with a readable message instead of part-way through the work
    missing = [column for column in output_columns if column not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # (window start, window end, factor applied inside the window)
    time_ranges = [(time(0, 0), time(10, 0), 0.8),
                   (time(10, 0), time(18, 0), 1.2),
                   (time(18, 0), time(23, 59, 59), 0.8)]

    # Work on a separate Series so the caller's 'distance' column is never modified
    scaled = df['distance'].astype(float)

    for window_start, window_end, factor in time_ranges:
        in_window = (df['start_time'] >= window_start) & (df['end_time'] <= window_end)
        # Keep the current value outside the window, apply the factor inside it
        scaled = scaled.where(~in_window, scaled * factor)

    # Selecting a column list already yields a new frame; copy() makes that explicit
    result = df[output_columns].copy()
    result['distance'] = scaled.round(1)

    return result


In [None]:
# Apply the time-of-day factors to `ten_percentage_threshold` and display the result
df_with_time_based_rates = calculate_time_based_toll_rates(ten_percentage_threshold)
df_with_time_based_rates

Unnamed: 0,id_start,id_end,distance,start_day,start_time,end_day,end_time,moto,car,rv,bus,truck
0,1001400,1001402,6.2,Monday,00:09:00,Sunday,00:18:42,7.76,11.64,14.55,21.34,34.92
1,1001400,1001404,19.1,Monday,00:29:00,Sunday,00:58:54,23.92,35.88,44.85,65.78,107.64
2,1001400,1001406,29.4,Monday,00:45:00,Sunday,01:30:54,36.72,55.08,68.85,100.98,165.24
3,1001400,1001408,43.3,Monday,01:07:00,Sunday,02:14:36,54.08,81.12,101.40,148.72,243.36
4,1001400,1001410,50.4,Monday,01:18:00,Sunday,02:36:42,62.96,94.44,118.05,173.14,283.32
...,...,...,...,...,...,...,...,...,...,...,...,...
1801,1004356,1001470,102.2,Monday,02:39:00,Sunday,05:18:48,127.84,191.76,239.70,351.56,575.28
1802,1004356,1001472,112.5,Monday,02:55:00,Sunday,05:50:48,140.64,210.96,263.70,386.76,632.88
1803,1004356,1001488,2.6,Monday,00:04:00,Sunday,00:08:00,3.20,4.80,6.00,8.80,14.40
1804,1004356,1004354,1.3,Monday,00:02:00,Sunday,00:04:00,1.60,2.40,3.00,4.40,7.20
