In [6]:
# Import the important packages for running this analysis
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the dataset.
# The path defaults to the original author's local copy. Set the SURVEY_CSV_PATH
# environment variable to point at the CSV on another machine without editing
# this cell.
file_path = os.environ.get(
    "SURVEY_CSV_PATH",
    r"C:\Users\12012\Downloads\UpdatedVersionCSV.csv",
)
df = pd.read_csv(file_path)



In [7]:
# Composite score columns that feed the cosine-similarity comparison.
# Removing entries narrows the similarity measure to the remaining variables.
# Each entry is used as a column label on df, so the spelling must stay exactly
# as written here (presumably matching the CSV header -- verify before renaming).
composite_columns = [
    "Systems Thinking Score",
    "Trust Score",
    "Conspiracy",
    "Complexity",
    "Openness",
    "Conscienciousness",
    "Extroversion",
    "Agreeableness",
    "Neuroticism",
]

In [8]:

# Standardize the composite columns with StandardScaler so they are on a
# comparable scale before similarities are measured.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[composite_columns])
df_scaled = pd.DataFrame(scaled_values, columns=composite_columns)

# Pairwise cosine similarity between the rows (respondents) of the scaled table.
similarity_matrix = cosine_similarity(df_scaled)

# Start from an empty undirected graph; one row of df corresponds to one respondent.
num_respondents = len(df)
G = nx.Graph()

In [9]:
# Threshold-based filtering: two respondents are connected only when their
# similarity is strictly greater than edge_threshold. 0.8 is the default;
# a higher threshold reduces the number of edges.
edge_threshold = 0.8

# Positions of every unordered pair (i < j) in row-major order, so each pair is
# considered exactly once and no duplicate edges are created.
pair_rows, pair_cols = np.triu_indices(num_respondents, k=1)
above_threshold = similarity_matrix[pair_rows, pair_cols] > edge_threshold

for i, j in zip(pair_rows[above_threshold].tolist(), pair_cols[above_threshold].tolist()):
    G.add_edge(i, j, weight=similarity_matrix[i, j])

In [10]:
# Function to color nodes (respondents) based on columns: Gender or Election
def get_node_colors(df, variable, G):
    """Return one color name per node of ``G``, in ``G.nodes()`` order.

    Args:
        df: DataFrame holding the respondent answers. Node ids are treated as
            row positions in this frame.
        variable: Which column drives the coloring, "Gender" or "Election".
            Surrounding whitespace and letter case are ignored, so "gender"
            and " Election " are accepted as well.
        G: Graph-like object whose ``nodes()`` yields integer row positions.

    Returns:
        list[str]: Color names aligned with ``G.nodes()``.
            Gender   -> "Male": blue, "Female": pink, anything else: yellow.
            Election -> "Republican": red, "Democrat": blue, anything else: gray.
            Any other ``variable`` colors every node gray.
    """
    # variable key -> (column to read, value-to-color mapping, fallback color)
    schemes = {
        "gender": (
            "Gender",
            {"Male": "blue", "Female": "pink"},
            "yellow",
        ),
        "election": (
            "Elections: During the 2022 election I voted:",
            {"Republican": "red", "Democrat": "blue"},
            "gray",
        ),
    }

    nodes = list(G.nodes())
    scheme = schemes.get(str(variable).strip().lower())
    if scheme is None:
        # Unrecognized variable: color everything gray without touching df.
        return ["gray"] * len(nodes)

    column, color_by_value, fallback = scheme
    # Fetch the column once; node ids are positions, so read them with iloc.
    answers = df[column]
    return [color_by_value.get(answers.iloc[node], fallback) for node in nodes]

In [11]:
# Ask which variable should drive the node colors: Gender or Election.
# The typed answer is trimmed and capitalized, so "gender" or " Election " are
# accepted as well as the exact spelling.
print("Select a variable for node coloring: Gender or Election")
selected_variable = input("Enter your choice: ").strip().capitalize()
if selected_variable not in ("Gender", "Election"):
    # Anything else ends up as an all-gray network; say so instead of leaving
    # the reader to guess why the colors are missing.
    print(f"Unrecognized choice {selected_variable!r}; all nodes will be colored gray.")
node_colors = get_node_colors(df, selected_variable, G)

Select a variable for node coloring: Gender or Election
Enter your choice: Gender


In [12]:
# Create interactive network visualization using Plotly.
# The layout is seeded so the node positions are the same on every run, which
# keeps the figure reproducible after a kernel restart.
LAYOUT_SEED = 42
pos = nx.spring_layout(G, seed=LAYOUT_SEED)

# All edges go into a single line trace. The None placed after each pair of
# endpoints separates one edge segment from the next.
edge_x = []
edge_y = []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node coordinates and hover labels are read in G.nodes() order; node_colors is
# expected to follow that same order.
node_ids = list(G.nodes())
node_x = [pos[node][0] for node in node_ids]
node_y = [pos[node][1] for node in node_ids]

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    marker=dict(size=10, color=node_colors),
    text=[f'Respondent {node}' for node in node_ids],
    hoverinfo='text')

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title=f"Network of Similar Respondents (Colored by {selected_variable})",
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0,l=0,r=0,t=40)))
fig.show()


In [13]:
# Function to identify similar others to a specific respondent:

def get_similar_respondents(index, top_n=5, matrix=None):
    """Return the ``top_n`` respondents most similar to the one at ``index``.

    Args:
        index: Row position of the respondent in the similarity matrix.
            Negative positions count from the end.
        top_n: How many matches to return (default 5). Values below 1 give
            an empty list.
        matrix: Square similarity matrix to search. Defaults to the
            module-level ``similarity_matrix``.

    Returns:
        list[tuple[int, float]]: ``(row position, similarity)`` pairs sorted
        from most to least similar. Equal scores keep ascending row order.
    """
    if matrix is None:
        matrix = similarity_matrix

    scores = matrix[index]
    own_position = index if index >= 0 else index + len(scores)

    # Drop the respondent by position instead of assuming they sort first:
    # another respondent with identical answers ties at the top score, and the
    # tie-break could otherwise discard that match and keep the self-match.
    candidates = [
        (position, score)
        for position, score in enumerate(scores)
        if position != own_position
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)  # high to low
    return candidates[:max(top_n, 0)]

# Example usage
# respondent_id is passed straight to get_similar_respondents, i.e. it is
# currently the row position in the data rather than the survey ID (switching
# to survey IDs is noted as work in progress).
respondent_id = 25
top_similar = get_similar_respondents(respondent_id, top_n=5)  # keep top_n in step with the function default if that changes

# Header line followed by one line per match, printed in ranked order.
report_lines = [f"Most similar others to respondent {respondent_id}:"]
report_lines += [
    f"Respondent {respondent}: Similarity Score = {score:.4f}"
    for respondent, score in top_similar
]
print(*report_lines, sep="\n")


Most similar others to respondent 25:
Respondent 348: Similarity Score = 0.9175
Respondent 254: Similarity Score = 0.8545
Respondent 9: Similarity Score = 0.8228
Respondent 540: Similarity Score = 0.8189
Respondent 219: Similarity Score = 0.7957
