In [62]:
import plotly.graph_objects as go
import pandas as pd


In [63]:
# Read the combined crash records (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="Combined_crash.csv")

In [64]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Crash_Number,City_Town_Name,Crash_Date,Crash_Time,Crash_Severity,Maximum_Injury_Severity_Reported,Number_of_Vehicles,Total_Nonfatal_Injuries,Total_Fatal_Injuries,Manner_of_Collision,...,Ambient_Light,Weather_Condition,At_Roadway_Intersection,Distance_From_Nearest_Roadway_Intersection,Distance_From_Nearest_Milemarker,Distance_From_Nearest_Exit,Distance_From_Nearest_Landmark,Non_Motorist_Type,X_Cooordinate,Y_Cooordinate
0,4486779,CAMBRIDGE,01-Jan-2018,3:45 AM,Property damage only (none injured),No injury,2,0,0,Angle,...,Dark - lighted roadway,Clear,FRESH POND PARKWAY / BRATTLE STREET /,FRESH POND PARKWAY / BRATTLE STREET /,,,,,229587.796941,902911.68741
1,4482195,CAMBRIDGE,01-Jan-2018,5:11 AM,Non-fatal injury,Non-fatal injury - Non-incapacitating,2,1,0,Rear-end,...,Dark - lighted roadway,Other,,MASSACHUSETTS AVENUE / AMHERST STREET,,,,,233487.179661,901017.897931
2,4492670,CAMBRIDGE,01-Jan-2018,7:58 AM,Property damage only (none injured),No injury,1,0,0,Single vehicle crash,...,Daylight,Clear,RAMP-RTS 3 SB/2 EB (MEM DR) TO RT 2 Rte 3A E,RAMP-RTS 3 SB/2 EB (MEM DR) TO RT 2 Rte 3A E,,,RAMP TO REID ROTARY,,231904.957612,900595.416581
3,4477598,CAMBRIDGE,01-Jan-2018,11:16 AM,Property damage only (none injured),No injury,2,0,0,Angle,...,Daylight,Clear,CHESTNUT STREET / PLEASANT STREET,CHESTNUT STREET / PLEASANT STREET,,,,,231833.937359,901062.687472
4,4489949,CAMBRIDGE,01-Jan-2018,9:42 PM,Property damage only (none injured),No injury,1,0,0,Single vehicle crash,...,Dark - unknown roadway lighting,Clear,ORCHARD STREET / TENNEY STREET,ORCHARD STREET / TENNEY STREET,,,,,231150.825422,904721.96786


In [65]:
def _code_mapping(df, src, targ):
    """ Helper method used by make_sankey.
    (Not to be called outside of this library)
    """
    # Get the distinct labels
    labels = list(set(df[src]).union(set(df[targ])))

    # Generate integer codes
    codes = list(range(len(labels)))

    # Create a label-to-code mapping
    lc_map = dict(zip(labels, codes))

    # Substitute labels for codes in the dataframe
    df = df.replace({src: lc_map, targ: lc_map})

    return df, labels


In [66]:
def make_sankey(df, *cols, vals=None, dropdown_col=None, **kwargs):
    """
    Generates and displays a Sankey diagram, optionally with an embedded dropdown filter.

    Every consecutive pair of columns in ``cols`` contributes source -> target
    links; the width of a link is the sum of ``vals`` over the rows sharing that
    (source, target) pair. When ``dropdown_col`` is given, one set of links is
    precomputed per distinct non-null value of that column and a dropdown menu
    switches between them.

    :param df: Input pandas DataFrame
    :param cols: Two or more column names representing categorical steps, in flow order
    :param vals: Name of the column representing flow values (required)
    :param dropdown_col: Optional column name to add as a dropdown filter (e.g., 'Manner_of_Collision')
    :param kwargs: Reserved for future flexibility (currently ignored)
    :return: None; the figure is rendered with ``fig.show()``
    :raises ValueError: If ``vals`` is not given, fewer than two columns are
        given, or ``dropdown_col`` contains no non-null values
    """
    if vals is None:
        raise ValueError("You must specify the 'vals' argument for flow values.")
    if len(cols) < 2:
        # Without this guard pd.concat fails later with an unhelpful message
        raise ValueError("You must specify at least two columns to link in the Sankey diagram.")

    node_color = "#FF6347"

    def generate_trace(sub_df):
        """Aggregate sub_df into coded Source/Target/Values rows plus node labels."""
        # Convert each step in the chain to Source-Target pairs
        pairs = [
            sub_df[[src, tgt, vals]].rename(columns={src: "Source", tgt: "Target", vals: "Values"})
            for src, tgt in zip(cols, cols[1:])
        ]

        # Combine all source target value pairs and aggregate their values
        sankey_data = pd.concat(pairs).groupby(["Source", "Target"], as_index=False).sum()

        # Maps the categorical labels to numerical codes for Sankey format
        return _code_mapping(sankey_data, "Source", "Target")

    def sankey_args(sankey_data, labels, link_color):
        """Build the node/link keyword arguments for go.Sankey."""
        return dict(
            node=dict(
                label=labels,
                color=[node_color] * len(labels)
            ),
            link=dict(
                # Plain lists keep the figure independent of pandas objects
                source=sankey_data["Source"].tolist(),
                target=sankey_data["Target"].tolist(),
                value=sankey_data["Values"].tolist(),
                color=link_color
            )
        )

    # If no dropdown — just show one full Sankey diagram (links share the node color)
    if dropdown_col is None:
        sankey_data, labels = generate_trace(df)
        fig = go.Figure(go.Sankey(**sankey_args(sankey_data, labels, "#FF6347")))
        fig.update_layout(title_text="Sankey Diagram", font_size=12)
        fig.show()
        return

    # Dropdown mode — gets the filtered versions for each unique value
    unique_values = sorted(df[dropdown_col].dropna().unique())
    if not unique_values:
        raise ValueError(f"Column '{dropdown_col}' has no non-null values to filter on.")

    # One (sankey_data, labels) pair per dropdown entry, in the same order
    views = [generate_trace(df[df[dropdown_col] == val]) for val in unique_values]

    # The first dropdown entry is shown initially; links are neutral gray in this mode
    initial = 0
    fig = go.Figure(go.Sankey(**sankey_args(*views[initial], "lightgray")))

    # Dropdown buttons for switching categories
    buttons = []
    for val, (sankey_data, labels) in zip(unique_values, views):
        buttons.append(dict(
            # Button labels must be text even when the column holds numbers
            label=str(val),
            method="update",
            args=[
                # Trace update: each value is wrapped in a list (one entry per trace)
                {
                    "link.source": [sankey_data["Source"].tolist()],
                    "link.target": [sankey_data["Target"].tolist()],
                    "link.value": [sankey_data["Values"].tolist()],
                    "link.color": ["lightgray"],
                    "node.label": [labels],
                    "node.color": [[node_color] * len(labels)]
                },
                # Layout update: keep the title in sync with the selection
                {"title": f"Sankey Diagram — {dropdown_col}: {val}"}
            ]
        ))

    # Layout styling and positioning for dropdown
    fig.update_layout(
        title_text=f"Sankey Diagram — {dropdown_col}: {unique_values[initial]}",
        font_size=12,
        updatemenus=[dict(
            buttons=buttons,
            direction="down",
            showactive=True,
            x=1.1,
            y=1.15,
            xanchor='left',
            yanchor='top'
        )]
    )

    fig.show()


In [67]:
# Population of each city, keyed by its upper-case name.
# NOTE(review): the figures look rounded — confirm their source and year.
city_pops = dict(
    BOSTON=675_000,
    CAMBRIDGE=118_000,
    SOMERVILLE=81_000,
)


In [68]:
# Give every crash row a weight of (1 / population of its city) * 10000.
data['Scaled_Weight'] = (
    data['City_Town_Name']
    .map(city_pops)   # population per row; cities missing from city_pops become NaN
    .rdiv(1)          # 1 / population
    .mul(10000)       # scale to "per 10,000 residents"
)

In [69]:
# Draw the Sankey diagram: city -> manner of collision -> crash severity,
# with link widths taken from Scaled_Weight and a dropdown over collision manner.
make_sankey(
    data, 'City_Town_Name', 'Manner_of_Collision', 'Crash_Severity',
    dropdown_col='Manner_of_Collision',
    vals='Scaled_Weight',
)

