In [256]:
import pandas as pd
import numpy as np
from scipy.spatial import distance_matrix
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
import networkx as nx

In [283]:
df3 = pd.read_csv('dataset-3.csv', index_col=None)

In [284]:
df3

Unnamed: 0,id_start,id_end,distance
0,1001400,1001402,9.7
1,1001402,1001404,20.2
2,1001404,1001406,16.0
3,1001406,1001408,21.7
4,1001408,1001410,11.1
5,1001410,1001412,15.6
6,1001412,1001414,18.2
7,1001414,1001416,13.2
8,1001416,1001418,13.6
9,1001418,1001420,12.9


In [280]:
df3 = df3.sort_values('id_start')
df3['cs'] = np.cumsum(df3['distance'])

In [146]:
df3

Unnamed: 0,id_start,id_end,distance,cs
0,1001400,1001402,9.7,9.7
1,1001402,1001404,20.2,29.9
2,1001404,1001406,16.0,45.9
3,1001406,1001408,21.7,67.6
4,1001408,1001410,11.1,78.7
5,1001410,1001412,15.6,94.3
6,1001412,1001414,18.2,112.5
7,1001414,1001416,13.2,125.7
8,1001416,1001418,13.6,139.3
9,1001418,1001420,12.9,152.2


Question 1: Distance Matrix Calculation
Create a function named calculate_distance_matrix that takes the dataset-3.csv as input and
generates a DataFrame representing distances between IDs.

The resulting DataFrame should have cumulative distances along known routes, with diagonal values set to 0. If distances between toll locations A to B and B to C are known, then the distance from A to C should be the sum of these distances. Ensure the matrix is symmetric,
accounting for bidirectional distances between toll locations (i.e. A to B is equal to B to A).

In [302]:
def calculate_distance_matrix(data):
    # importng the dataset into the function
    df = pd.read_csv(data)
    
    # creating a grapth using the edges from the df
    G = nx.from_pandas_edgelist(df, source="id_start", target="id_end", edge_attr="distance")

    # creating a new dataset which iterates through all the paths
    out = pd.DataFrame(
        [
            {"start": c1, "end": c2, "cumulative_distance": d}
            
            # iterating through the distances
            for c1, rest in nx.all_pairs_dijkstra_path_length(G, weight="distance")
            for c2, d in rest.items()
        ]
    )

    # printing and pivoting the table
    return out.pivot(index="start", columns="end", values="cumulative_distance")
  
# taking the dataset path into as a variable
path = 'dataset-3.csv'

#calling the function
output = calculate_distance_matrix(path)
print(output)

end      1001400  1001402  1001404  1001406  1001408  1001410  1001412  \
start                                                                    
1001400      0.0      9.7     29.9     45.9     67.6     78.7     94.3   
1001402      9.7      0.0     20.2     36.2     57.9     69.0     84.6   
1001404     29.9     20.2      0.0     16.0     37.7     48.8     64.4   
1001406     45.9     36.2     16.0      0.0     21.7     32.8     48.4   
1001408     67.6     57.9     37.7     21.7      0.0     11.1     26.7   
1001410     78.7     69.0     48.8     32.8     11.1      0.0     15.6   
1001412     94.3     84.6     64.4     48.4     26.7     15.6      0.0   
1001414    112.5    102.8     82.6     66.6     44.9     33.8     18.2   
1001416    125.7    116.0     95.8     79.8     58.1     47.0     31.4   
1001418    139.3    129.6    109.4     93.4     71.7     60.6     45.0   
1001420    152.2    142.5    122.3    106.3     84.6     73.5     57.9   
1001422    161.8    152.1    131.9    

Question 2: Unroll Distance Matrix
Create a function unroll_distance_matrix that takes the DataFrame created in Question 1. 
The resulting DataFrame should have three columns: columns id_start, id_end, and distance.

All the combinations except for same id_start to id_end must be present in the rows with 
their distance values from the input DataFrame.

In [300]:
def unroll_distance_matrix():
    df1 = calculate_distance_matrix(path)
    print(df1)

In [307]:
def reverse_distance_matrix(pivoted_df):
    # Melt the pivoted DataFrame to transform it back to the original format
    melted_df = pd.melt(pivoted_df.reset_index(), id_vars=['start'], var_name='end', value_name='cumulative_distance')

    # Create a DataFrame to store the edges
    edges_df = melted_df[['start', 'end', 'cumulative_distance']]

    return edges_df

# Example usage:
# Assuming 'output' is the pivoted DataFrame obtained from the calculate_distance_matrix function
reversed_df = reverse_distance_matrix(calculate_distance_matrix(path))
print(reversed_df)


        start      end  cumulative_distance
0     1001400  1001400                  0.0
1     1001402  1001400                  9.7
2     1001404  1001400                 29.9
3     1001406  1001400                 45.9
4     1001408  1001400                 67.6
...       ...      ...                  ...
1844  1001472  1004356                175.8
1845  1001488  1004356                  4.0
1846  1004354  1004356                  2.0
1847  1004355  1004356                  4.0
1848  1004356  1004356                  0.0

[1849 rows x 3 columns]


In [304]:
import pandas as pd

def reverse_distance_matrix(pivoted_df):
    # Reset index to include 'start' as a regular column
    melted_df = pivoted_df.reset_index()

    # Melt the pivoted DataFrame to transform it back to the original format
    melted_df = pd.melt(melted_df, id_vars=['start'], var_name='end', value_name='cumulative_distance')

    # Filter rows where 'start' is not equal to 'end' to remove unnecessary entries
    edges_df = melted_df[melted_df['start'] != melted_df['end']]

    return edges_df[['start', 'end', 'cumulative_distance']]

# Assuming 'output' is the pivoted DataFrame obtained from the calculate_distance_matrix function
reversed_df = reverse_distance_matrix(output)
print(reversed_df)


        start      end  cumulative_distance
1     1001402  1001400                  9.7
2     1001404  1001400                 29.9
3     1001406  1001400                 45.9
4     1001408  1001400                 67.6
5     1001410  1001400                 78.7
...       ...      ...                  ...
1843  1001470  1004356                159.8
1844  1001472  1004356                175.8
1845  1001488  1004356                  4.0
1846  1004354  1004356                  2.0
1847  1004355  1004356                  4.0

[1806 rows x 3 columns]


In [310]:
import pandas as pd

def reverse_distance_matrix(pivoted_df):
    # Reset index to include 'start' as a regular column
    melted_df = pivoted_df.reset_index()

    # Melt the pivoted DataFrame to transform it back to the original format
    melted_df = pd.melt(melted_df, id_vars=['start'], var_name='end', value_name='distance')

    # Filter rows where 'distance' is not equal to NaN (non-diagonal entries)
    edges_df = melted_df.dropna(subset=['distance'])
    edges_df.drop_duplicates(subset=['end'], keep=False)

    return edges_df[['start', 'end', 'distance']]

# Example usage:
# Assuming 'output' is the pivoted DataFrame obtained from the calculate_distance_matrix function
reversed_df = reverse_distance_matrix(output)
print(reversed_df)


        start      end  distance
0     1001400  1001400       0.0
1     1001402  1001400       9.7
2     1001404  1001400      29.9
3     1001406  1001400      45.9
4     1001408  1001400      67.6
...       ...      ...       ...
1844  1001472  1004356     175.8
1845  1001488  1004356       4.0
1846  1004354  1004356       2.0
1847  1004355  1004356       4.0
1848  1004356  1004356       0.0

[1849 rows x 3 columns]
