# SVO Sankey Plots

In [1]:
%matplotlib inline

In [2]:
# IMPORTS
import pandas as pd
import networkx as nx
import numpy as np
import plotly.graph_objects as go

# LOAD DATAFRAMES
svos_m = pd.read_csv("../output/svos_m_lem.csv", index_col=0)
svos_w = pd.read_csv("../output/svos_w_lem.csv", index_col=0)

print(svos_m.shape[0], svos_w.shape[0])

80460 26610


A note about the functions below:

- **SVOverbs** reduces a larger SVO dataframe to one subject and the *n* top verbs associated with that subject.

- **sankify** renders the subject and verbs into a form that the Sankey function in Plotly can recognize. For more, see Plotly's [Sankey documenation](https://plotly.com/python-api-reference/generated/plotly.graph_objects.Sankey.html#plotly.graph_objects.Sankey).

In [24]:
def SVOverbs(dataframe, subject, start, end):
    # Create the subject-focused dataframe
    df = dataframe[dataframe["subject"] == subject].groupby(
        ["verb"]).size().reset_index(name='obs').sort_values(
        ['obs'], ascending=False).iloc[start:end]
    # Re-insert a column for the subject
    df["subject"] = subject
    # Re-arrange columns so that they are in a more obvious order
    df = df[["subject", "verb", "obs"]]
    return df

def sankify (df):
    # Merge the two columns of subjects and verbs
    nodes = np.unique(df[["subject", "verb"]], axis=None)
    # Assign each node a unique ID
    nodes = pd.Series(index=nodes, data=range(len(nodes)))
    # Use Ploty's Sankey trace (which is really hard to understand)
    fig = go.Figure(
        go.Sankey(
            node={"label": nodes.index},
            link={
                "source": nodes.loc[df["subject"]],
                "target": nodes.loc[df["verb"]],
                "value": df["obs"],
            },
        )
    )
    fig.update_layout(autosize=False, width=500, height=850)
#     fig.write_image("/Users/jl/Desktop/fig1.svg")
    fig.show()

In [9]:
w_she = SVOverbs(svos_w, "she", 10, 40)
sankify(w_she)

In [10]:
# Let's now do that for "he"
w_he = SVOverbs(svos_w, "he", 10, 40)

# Merge "she" and "he"
w_she_he = pd.concat([w_she, w_he], ignore_index=True)
w_she_he.shape

# And visualize
sankify(w_she_he)

In [25]:
# Now let's look at the differences in S-V pairs for male speakers
# Let's now do that for "he"

m_she = SVOverbs(svos_m, "she", 10, 50)
m_he = SVOverbs(svos_m, "he", 10, 50)

# Merge "she" and "he"
m_she_he = pd.concat([m_she, m_he], ignore_index=True)

# And visualize
sankify(m_she_he)

In [22]:
# The "I" will show the unbalanced nature of the corpus

m_i = SVOverbs(svos_m, "i", 10, 50)
w_i = SVOverbs(svos_w, "i", 10, 50)

m_i.replace(to_replace="i", value="m_i", inplace=True)
w_i.replace(to_replace="i", value="w_i", inplace=True)

# Merge "she" and "he"
i_mw = pd.concat([m_i, w_i], ignore_index=True)

# And visualize
sankify(i_mw)

The code below won't really work because of the imbalance between the two subcorpora. Despite the higher rate of 24% with which sentences with an "I" occur in the women's subcorpus, there are only 6185 SVOs against the 15440 SVOs for the men's subcorpus with a rate of 20%.

In [7]:
# Now let's look at the differences in S-V pairs for male speakers
# Let's now do that for "he"

m = SVOverbs(svos_m, "i", 10, 40)
m_i = m.subject.replace({"i" : "m_i"})
w = SVOverbs(svos_w, "i", 10, 40)
w_i = w.subject.replace({"i" : "w_i"})

# Merge "she" and "he"
mw_i = pd.concat([m_i, w_i], ignore_index=True)

# And visualize
sankify(mw_i)

KeyError: "None of [Index(['subject', 'verb'], dtype='object')] are in the [index]"

In [None]:
# Add "i" to the "she_he" dataset
w_i = SVOverbs(svos_w, "i", 30)
w_she_he_i = pd.concat([w_she_he, w_i], ignore_index=True)

# Save for later
# w_she_he_i.to_csv("../output/w_she_he_i.csv")

In [None]:
sankify(w_she_he_i)

## Character Spaces as Verb-Feature Spaces

The goal in this section is to:

1. Collect all the verbs associated with the specified subjects
2. Weight the verbs (by normalization)
3. Compare the verbs manually
4. Visualize a comparison using PCA or t-SNE

First we explore the total number of verbs involved:

In [None]:
# Collect all the verbs from the women's subcorpus
verbs_w = svos_w.groupby(["verb"]).size().reset_index(name='obs').sort_values(
        ['obs'], ascending=False)

# Select only the verbs that occur more than once
verbs_gt_w = verbs_w[verbs_w.obs > 2]

# What's our counts?
print(f"♀︎: {verbs_w.shape[0]} unique verbs; {verbs_gt_w.shape[0]} occur more than once")

In [None]:
# Repeat for the men's subcorpus
verbs_m = svos_m.groupby(["verb"]).size().reset_index(name='obs').sort_values(
        ['obs'], ascending=False)
verbs_gt_m = verbs_m[verbs_m.obs > 2]

print(f"♂︎: {verbs_m.shape[0]} unique verbs; {verbs_gt_m.shape[0]} occur more than once")

Now we need to grab the verbs associated with the subjects:

In [None]:
# Create a list of the subjects for which we want SVOs
subjects = ['she', 'he', 'i']

# Filter the dataframe
subjects_w = svos_w[svos_w['subject'].isin(subjects)]

# We don't want the objects for this
subjects_w = subjects_w.drop('object', axis=1)

# Count the unique combinations of two columns
subj_w_ct = subjects_w[['subject', 'verb']].value_counts().reset_index(name='count')

# Check our work
subj_w_ct.head()

In [None]:
subj_w_ct.value_counts(subset=['subject', 'verb']).sort_index(ascending=False)

In [None]:
# Repeat for the mens' subcorpus
subjects_m = svos_m[svos_m['subject'].isin(subjects)]
subjects_m = subjects_m.drop('object', axis=1)
subj_m_ct = subjects_m[['subject', 'verb']].value_counts().reset_index(name='count')
subj_m_ct.shape

In [None]:
subj_m_ct.head(10)

In [None]:
# See the total number of verbs above
# This could have been done with verbs_w.shape[0]
subj_m_ct['weight'] = subj_m_ct['count']/5307
subj_w_ct['weight'] = subj_w_ct['count']/3161

In [None]:
subj_w_ct.head()

Now we have 2 dataframes, each with three subjects -- *she*, *he*, and *i*. Each subject has hundreds of verbs associated with it, and each verb has a weight normalized to its subcorpus so that it *should* be comparable to verbs in the other subcorpus. The goal is to see how close or far the six subjects are. 

In [None]:
# Add our columns to attribute subject and verbs to a particular gender
subj_w_ct['speaker'] = "female"
subj_m_ct['speaker'] = "male"

### Sankey Code That Did Not Work

In [None]:
from collections import Counter

# # define the tuple of subjects and verbs
# data = (("subject1", "verb1"),
#         ("subject1", "verb2"),
#         ("subject2", "verb2"),
#         ("subject2", "verb3"),
#         ("subject3", "verb1"))

# # create a list of subjects and a list of verbs
# subjects = [item[0] for item in data]
# verbs = [item[1] for item in data]

# create a dictionary to store the counts of verbs for each subject
verb_counts = dict(Counter(zip(subjects, verbs)))

# create lists to store the source, target, and value for the Sankey plot
source = []
target = []
value = []

# extract the source, target, and value for the Sankey plot from the verb counts dictionary
for subject, verb_count in verb_counts.items():
    source.append(subject)
    for verb, count in verb_count.items():
        target.append(verb)
        value.append(count)

# create the Sankey plot
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = list(set(subjects)) + list(set(verbs)),
      color = "blue"
    ),
    link = dict(
      source = source,
      target = target,
      value = value
    ))])

fig.show()

In [None]:
def sanked (df):
    # Define the nodes and links of the Sankey diagram
#     nodes = ['Node 1', 'Node 2', 'Node 3', 'Node 4']
    nodes = np.unique(df[["subject", "verb"]], axis=None).tolist()
#     links = {'source': [0, 0, 1, 1, 2, 2], 'target': [2, 3, 2, 3, 2, 3], 'value': [8, 4, 4, 2, 2, 2]}
    links = {
                "source": df.subject.tolist(),
                "target": df.verb.tolist(),
                "value": df["obs"],
            }
    # Create the Sankey diagram
    fig = go.Figure(data=[go.Sankey(node=dict(label=nodes), link=links)])
    # Add value labels to the diagram
    fig.data[0].text = [f"{v}" for v in df['obs']]
    # Show the diagram
    fig.show()