In [9]:
# !which -a pip3
!/home/gridsan/ktiwary/.conda/envs/dpi310/bin/pip3 install pandas altair
!/home/gridsan/ktiwary/.conda/envs/dpi310/bin/pip3 install semanticscholar

[0mCollecting semanticscholar
  Downloading semanticscholar-0.8.2-py3-none-any.whl (24 kB)
Collecting tenacity
  Downloading tenacity-8.3.0-py3-none-any.whl (25 kB)
Collecting httpx
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting anyio
  Downloading anyio-4.4.0-py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna
  Downloading idna-3.7-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpcore==1.*
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sniffio
  Downloading sniff

In [10]:
import sys
# Put the parent directory at the front of the module search path.
# Stale entries are dropped first -- leftover "../.." paths as well as any "../"
# inserted by an earlier run of this cell -- so re-running the cell does not
# accumulate duplicate entries.
sys.path = [p for p in sys.path if not p.endswith("../..") and p != "../"]
sys.path.insert(0, "../")  # This adds `src` to the path
import os
import numpy as np
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows() # Allow using more than 5000 rows, for now
from collections import defaultdict
from helpers import io, filters
from typing import Any


%load_ext autoreload
%autoreload 2

## Helper Functions

Utility functions to process and transform data summaries:

---
```python
def invert_dict_of_lists(
  d: dict[str, list[str]]
) -> dict[str, str]
```
- Inverts a dictionary of lists for easier mapping of constants.
---
```python
def remap_licenses_with_paraphrases(
  summaries: list[dict[str, Any]],
  paraphrases: dict[str, str]
) -> list[dict[str, Any]]
``` 
- Standardizes inconsistent license names in data summaries using predefined paraphrases.
---
```python
def map_license_criteria_multimodal(
  data_summary: list[dict[str, Any]],
  all_constants: dict[str, dict[str, list[str]]]
) -> list[dict[str, Any]]
```
- Maps license criteria for multimodal datasets, resolving them according to predefined constants.
---
```python
def get_country(x: str) -> list[int]
```
- Takes a country name as input and returns a list of ISO3166 codes (mostly, of length 1). It handles a special case that appears in some text annotations ("African Continent" -> list of ISO codes) and logs a warning for any countries not found in the mapping.
---
```python
def gini(array: np.ndarray) -> float:
```
- Takes an array of values and computes the Gini coefficient.
---
```python
def factor_year(
  df: pd.DataFrame,
  column: str = "Year Released",
  min_year: int = 2013
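) -> tuple[pd.DataFrame, list[str]]:
```
- Converts the year column into an ordered categorical variable (with years before a given value grouped together). Returns the converted `DataFrame` together with the list of year labels in order.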
---
```python
def order_by_grouped_permisiveness(
        df: pd.DataFrame,
        group_column: str,
        licensetype_column: str = "License Type",
        permissive_licensetypes: list[str] = ["Commercial"]
) -> list:
```
- Computes permissiveness (the proportion of license types in a given set, by default only those marked `Commercial`) for each value of a given grouping factor and returns the values of that factor as a list, ordered from least to most permissive.
---
```python
def reduce_categories_to_topk(
    df: pd.DataFrame,
    column: str,
    k: int = 6
) -> pd.DataFrame:
```
- Reduces the categories in a column to the `k` most frequent ones, with the rest grouped under `Other`. Returns a copy of the `DataFrame` in which that column has at most `k + 1` categories.
---

In [11]:
def invert_dict_of_lists(d: dict[str, list[str]]) -> dict[str, str]:
    """Flip a category -> items mapping into an item -> category lookup.

    Useful for mapping constants, paraphrases, etc., which normally come as:
        { "Category": ["item1", "item2", … ] }
    and are turned into:
        { "item1": "Category", "item2": "Category", … }

    If an item is listed under several categories, the category that comes
    last in `d` wins.
    """
    return {
        member: category
        for category, members in d.items()
        for member in members
    }

In [12]:
def remap_licenses_with_paraphrases(
        summaries: list[dict[str, Any]],
        paraphrases: dict[str, str]
    ) -> list[dict[str, Any]]:
    """Map inconsistent license names to shared paraphrases using the constants.
    E.g. "CC-BY-SA 4.0", "CC BY SA 4.0" -> "CC BY-SA 4.0"

    Args:
        summaries: Dataset summaries; each must have a "Licenses" list whose
            entries are dicts with a "License" name.
        paraphrases: Lookup from a license name variant to the name it should
            be replaced with. Names missing from the lookup are left unchanged.

    Returns:
        The same `summaries` list. Entries are updated in place, so the
        caller's objects are modified as well.
    """
    for summary in summaries:
        for license_info in summary["Licenses"]:
            # Named `license_name` because `license` shadows a builtin.
            license_name = license_info["License"]
            license_info["License"] = paraphrases.get(license_name, license_name)
    return summaries


In [13]:
def classify_and_resolve_licenses(
    license_infos: list[tuple[str, str]],
    all_constants: dict[str, dict[str, list[str]]]
) -> list[str]:
    """Classify every license of one dataset and merge them into one result.

    Function taken from `text_ft_plots.ipynb`.

    Each `(license_name, license_url)` pair is passed to
    `filters.classify_license`; the list of per-license results is then handed
    to `filters.resolve_multiple_licenses`, whose return value is returned as is.
    """
    per_license = [
        filters.classify_license(name, url, all_constants)
        for name, url in license_infos
    ]
    # By default, multiple licenses yield to the most restrictive one
    return filters.resolve_multiple_licenses(per_license)


def add_license_classes_to_summaries(
    data_summary: list[dict[str, Any]],
    resolved_classes: dict[str, list[str]],
    aggregator: str
):
    """Attach the resolved license criteria to every row of `data_summary`.

    Function taken from `text_ft_plots.ipynb`.

    For each row, the entry of `resolved_classes` stored under the row's
    "Unique Dataset Identifier" is read, and its first three elements are
    written to the "License Use", "License Attribution" and
    "License Share Alike" keys, each suffixed with " (<aggregator>)".
    Rows are modified in place and the same list is returned.
    """
    use_key = f"License Use ({aggregator})"
    attribution_key = f"License Attribution ({aggregator})"
    share_alike_key = f"License Share Alike ({aggregator})"

    for row in data_summary:
        criteria = resolved_classes[row["Unique Dataset Identifier"]]
        row[use_key] = criteria[0]
        row[attribution_key] = criteria[1]
        row[share_alike_key] = criteria[2]
    return data_summary


def map_license_criteria_multimodal(
    data_summary: list[dict[str, Any]],
    all_constants: dict[str, dict[str, list[str]]]
) -> list[dict[str, Any]]:
    """Variant of `map_license_criteria` that works with multimodal datasets.
    Simplified to only include `Licenses` (not HF, etc.).

    Function adapted from `text_ft_plots.ipynb`.

    Two sets of keys are added to every row (in place) through
    `add_license_classes_to_summaries`:
      - "... (DataProvenance)": resolved over all licenses of the dataset.
      - "... (DataProvenance IgnoreOpenAI)": the same, ignoring licenses
        named "OpenAI".

    A dataset whose license list is empty (or, for the second set, holds only
    "OpenAI" entries) is resolved as if it had a single "Unspecified" license.

    Args:
        data_summary: Rows with a "Unique Dataset Identifier" and a "Licenses"
            list of dicts carrying "License" and, optionally, "License URL".
        all_constants: Constants forwarded to `classify_and_resolve_licenses`.

    Returns:
        `data_summary`, with the license keys added to each row.
    """
    unspecified = ("Unspecified", None)

    # Unpack licenses for each dataset. {uid --> [(license_name, license_url), ...]}
    our_uid_to_license_infos = defaultdict(list)

    # Same as ours, but excludes OpenAI Terms:
    our_uid_to_license_infos_no_openai = defaultdict(list)

    for row in data_summary:
        uid = row["Unique Dataset Identifier"]
        for license_info in row["Licenses"]:
            license_name = license_info["License"]
            license_url = license_info.get("License URL", None) # FOR NOW
            our_uid_to_license_infos[uid].append((license_name, license_url))
            if license_name != "OpenAI":
                our_uid_to_license_infos_no_openai[uid].append((license_name, license_url))

        # A row without any license would otherwise never get an entry here, and
        # the per-row lookup in `add_license_classes_to_summaries` would fail
        # with a KeyError.
        if not our_uid_to_license_infos[uid]:
            our_uid_to_license_infos[uid].append(unspecified)

        # If OpenAI was the only license, we add Unspecified so there isn't nothing there.
        if not our_uid_to_license_infos_no_openai[uid]:
            our_uid_to_license_infos_no_openai[uid].append(unspecified)

    # classify and resolve licenses for each dataset and each aggregator
    ours_resolved, ours_openai_resolved = {}, {}
    for uid in our_uid_to_license_infos.keys():
        ours_resolved[uid] = classify_and_resolve_licenses(our_uid_to_license_infos[uid], all_constants)
        ours_openai_resolved[uid] = classify_and_resolve_licenses(our_uid_to_license_infos_no_openai[uid], all_constants)

    data_summary = add_license_classes_to_summaries(data_summary, ours_resolved, "DataProvenance")
    data_summary = add_license_classes_to_summaries(data_summary, ours_openai_resolved, "DataProvenance IgnoreOpenAI")

    return data_summary

In [14]:
def gini(array: np.ndarray) -> float:
    """Calculate the Gini coefficient of an array of values.

    Implementation taken from: https://github.com/oliviaguest/gini

    Args:
        array: Values to measure. Any shape is accepted (the input is
            flattened), and array-likes such as lists work as well. The
            caller's data is never modified.

    Returns:
        The Gini coefficient as a float.

    Raises:
        ValueError: If `array` contains no elements.
    """
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d.
    # `flatten` returns a copy, so the steps below cannot touch the input.
    values = np.asarray(array, dtype=float).flatten()
    if values.size == 0:
        raise ValueError("gini() requires at least one value")
    lowest = np.amin(values)
    if lowest < 0:
        # Values cannot be negative: shift so that the minimum becomes 0.
        values = values - lowest
    # Values cannot be 0:
    values = values + 0.0000001
    # Values must be sorted:
    values = np.sort(values)
    # Number of array elements:
    n = values.shape[0]
    # Index per array element:
    index = np.arange(1, n + 1)
    # Gini coefficient:
    return float(np.sum((2 * index - n - 1) * values) / (n * np.sum(values)))

In [15]:
def factor_year(
    df: pd.DataFrame,
    column: str = "Year Released",
    min_year: int = 2013
) -> tuple[pd.DataFrame, list[str]]:
    """Transform the year column into an ordered categorical column.

    Years before `min_year` are grouped into one category, "<`min_year`"
    (e.g. "<2013"); every other year becomes its own label ("2013", "2014", …).
    Missing years stay missing.

    Args:
        df: Input frame; it is copied, not modified.
        column: Name of the column holding the release year.
        min_year: First year that gets a category of its own.

    Returns:
        A tuple `(df, order)`: the copy of `df` with `column` converted, and
        the ordered list of category labels, running from "<`min_year`" up to
        the latest year found in the data.
    """
    df = df.copy()

    min_yeartext = "<%d" % min_year

    # Work on a numeric view: a year column containing missing values is stored
    # as float, which would otherwise break `range()` below and produce labels
    # such as "2019.0" that match none of the categories.
    years = pd.to_numeric(df[column])
    max_year = years.max()
    # With no valid year at all, only the "<min_year" category remains.
    last_year = min_year - 1 if pd.isna(max_year) else int(max_year)

    order = [min_yeartext, *map(str, range(min_year, last_year + 1))]

    def _label(year):
        """Return the category label for one year, or None if it is missing."""
        if pd.isna(year):
            return None
        return min_yeartext if year < min_year else str(int(year))

    df[column] = pd.Categorical(
        years.map(_label),
        categories=order,
        ordered=True
    )

    return df, order

In [16]:
def order_by_grouped_permisiveness(
        df: pd.DataFrame,
        group_column: str,
        licensetype_column: str = "License Type",
        permissive_licensetypes: list[str] = ["Commercial"]
) -> list:
    """Given a DataFrame, group it by `group_column` and calculate the permissiveness of each group.

    Permissiveness is calculated as the proportion of rows whose license type is
    in `permissive_licensetypes`.

    Args:
        df: Input frame; it is not modified.
        group_column: Column whose values define the groups.
        licensetype_column: Column holding the license type of each row.
        permissive_licensetypes: License types counted as permissive. The
            default list is only read, never mutated.

    Returns:
        The values of `group_column` as a list, sorted from the least to the
        most permissive group.
    """
    # Flag the permissive rows once and average the flags per group. This gives
    # the per-group proportion without `groupby.apply`, which pandas deprecates
    # when the applied function also receives the grouping columns.
    is_permissive = df[licensetype_column].isin(permissive_licensetypes)
    permisiveness = is_permissive.groupby(df[group_column]).mean().reset_index(name="Permissiveness")

    permisiveness_order = permisiveness.sort_values(by="Permissiveness")[group_column].tolist()

    return permisiveness_order

In [17]:
def reduce_categories_to_topk(
    df: pd.DataFrame,
    column: str,
    k: int = 6
) -> pd.DataFrame:
    """Keep only the `k` most frequent values of `column`.

    Every other value (missing values included) is replaced by "Other".
    The input frame is left untouched; a modified copy is returned.
    """
    result = df.copy()
    # `value_counts` sorts by frequency, so its first `k` index entries are the top-k.
    keep = set(result[column].value_counts().head(k).index.tolist())

    def _collapse(value):
        if value in keep:
            return value
        return "Other"

    result[column] = result[column].map(_collapse)
    return result

## Read Constants and Summaries

Load constants and data summaries from JSON files. Constants provide mappings and criteria for licenses, creator groups, and various other categories. Data summaries contain modality-specific information about datasets.

- `all_constants`: Dictionary containing all predefined constants.
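- `speech_summaries`: Data summaries for the datasets under analysis (currently read from a `data_summaries/video/` directory, despite the `speech` name).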

In [39]:
# Load the shared constants and the per-dataset summaries.
all_constants = io.read_all_constants("../../constants/")
# speech_summaries = io.read_data_summary_json("../../data_summaries-speech/")
# NOTE(review): the summaries are read from an absolute, user-specific path under
# `data_summaries/video/`, while the variables in this notebook are named
# `*speech*` -- confirm the intended input and consider a relative path.
speech_summaries = io.read_data_summary_json("/home/gridsan/ktiwary/src/dpi-ktiwary-fork/data_summaries/video/test-0520")

# license_paraphrases = invert_dict_of_lists(all_constants["LICENSE_PARAPHRASES"])
# NOTE(review): the lookup is built from "LICENSE_CLASSES" rather than the
# commented-out "LICENSE_PARAPHRASES"; presumably license names are then replaced
# by their class key instead of a canonical spelling -- confirm this is intended.
license_paraphrases = invert_dict_of_lists(all_constants["LICENSE_CLASSES"])
# Rewrite the license names in place, then add the resolved license-criteria
# keys ("License Use (...)", etc.) to every summary.
speech_summaries = map_license_criteria_multimodal(
    remap_licenses_with_paraphrases(
        speech_summaries,
        license_paraphrases
    ),
    all_constants
)

# One row per dataset; "Year Released" becomes an ordered categorical and
# YEARS_ORDER holds its labels in order (used to sort the year axes below).
df_speech = pd.DataFrame(speech_summaries)
df_speech, YEARS_ORDER = factor_year(df_speech)

In [40]:
# Overall Gini coefficient (hours by dataset)
gini(df_speech["Video Hours"].values)

0.9426121290947573

In [47]:
# Plotting constants
FONT_SIZE = 16
LEGEND_POSITION = "bottom"
PLOT_TOFILE = True # Whether and where to output plots
# PLOT_DIR = "~/Dropbox (MIT)/dpi-plotsspeech/"
PLOT_DIR = "/home/gridsan/ktiwary/src/dpi-ktiwary-fork/dpi-plots"
PLOT_PPI = 300
MAX_LABELLIMIT = 1000 # Large number to avoid label summarization in plots

if PLOT_TOFILE:
    PLOT_DIR = os.path.expanduser(PLOT_DIR)
    os.makedirs(PLOT_DIR, exist_ok=True)

## License Use by Language Family and Source Category

In [42]:
# Plotting constants
LICENSE_ORDER = ["Non-Commercial/Academic", "Unspecified", "Commercial"]
LICENSE_PALETTE = ["#e04c71", "#e0cd92", "#82b5cf"]
LICENSE_PLOTW = 600
LICENSE_PLOTH = 200

In [43]:
# Map to main DPI license types
df_speech["License Type"] = df_speech["License Use (DataProvenance)"].map({
    "academic-only": "Non-Commercial/Academic",
    "non-commercial": "Non-Commercial/Academic",
    "unspecified": "Unspecified",
    "commercial": "Commercial"
})

df_speech["License Type"] = pd.Categorical(
    df_speech["License Type"],
    categories=LICENSE_ORDER,
    ordered=True
)
df_speech = df_speech.sort_values(by="License Type")

In [44]:
# Remap language families for condensed plots
df_speechlanguages = df_speech.explode("Video Hours")
df_speechlanguages["Video Hours"] = df_speechlanguages["Video Hours"].astype(float)

df_speechlanguages.keys()

Index(['Unique Dataset Identifier', 'Collection', 'Collection URL',
       'Dataset Name', 'Paper Title', 'Paper URL', 'GitHub URL',
       'Hugging Face URL', 'Papers with Code URL', 'ArXiv URL',
       'Semantic Scholar Corpus ID', 'Year Released', 'Text Sources',
       'Video Task', 'Licenses', 'Creators', 'Countries',
       'License Verified By', 'Video Hours', 'Taken Down', 'Video Sources',
       'License Use (DataProvenance)', 'License Attribution (DataProvenance)',
       'License Share Alike (DataProvenance)',
       'License Use (DataProvenance IgnoreOpenAI)',
       'License Attribution (DataProvenance IgnoreOpenAI)',
       'License Share Alike (DataProvenance IgnoreOpenAI)', 'License Type'],
      dtype='object')

In [68]:
base = alt.Chart(df_speechlanguages).mark_bar().encode(
    x=alt.X(
        "Video Task:N",
        title="Video Task",
        sort=None,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "License Type:N",
        scale=alt.Scale(
            domain=LICENSE_ORDER,
            range=LICENSE_PALETTE
        ),
        title="License Type"
    )
).properties(
    width=600,
    height=100
)

text = alt.Chart(df_speechlanguages).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Video Task:N",
        title="Video Task to License Type",
        sort=None
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    labelLimit=MAX_LABELLIMIT
)

# if PLOT_TOFILE:
#     chart.save(
#         os.path.join(PLOT_DIR, "video_task-licenses.png"),
#         ppi=PLOT_PPI
#     )

chart

In [None]:
# Stacked, normalized bars of license types per "Video Hours" value.
# Both layers must encode the same x field: layered charts share the x scale,
# so mixing "Video Hours" (bars) with "Video Task" (labels) would put the two
# layers at unrelated positions on one axis.
base = alt.Chart(df_speechlanguages).mark_bar().encode(
    x=alt.X(
        "Video Hours:N",
        title="Video Hours",
        sort=None,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "License Type:N",
        scale=alt.Scale(
            domain=LICENSE_ORDER,
            range=LICENSE_PALETTE
        ),
        title="License Type"
    )
).properties(
    width=600,
    height=100
)

# Dataset count per x value, drawn as text on the same x positions as the bars.
text = alt.Chart(df_speechlanguages).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Video Hours:N",
        title="Video Hours",
        sort=None
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    labelLimit=MAX_LABELLIMIT
)

# if PLOT_TOFILE:
#     chart.save(
#         os.path.join(PLOT_DIR, "video_task-licenses.png"),
#         ppi=PLOT_PPI
#     )

chart

In [64]:
!/home/gridsan/ktiwary/.conda/envs/dpi310/bin/pip install altair_viewer

[0m

In [None]:
INCLUDE_TOP_N_CATEGORIES = 10 # Number of top categories to include, rest will be grouped as "Other"

# NOTE(review): "Source Category" is not among the columns printed by the
# `df_speechlanguages.keys()` cell earlier in this notebook (the closest name
# there is "Video Sources"), so this cell presumably fails with a KeyError on
# the currently loaded data -- confirm which column is meant.
df_sources = df_speech.explode("Source Category")
df_sources = reduce_categories_to_topk(df_sources, "Source Category", INCLUDE_TOP_N_CATEGORIES)

# Order the categories from least to most permissive.
sourcecategory_order = order_by_grouped_permisiveness(df_sources, "Source Category")

# Make the column an ordered categorical so that sorting follows that order.
df_sources["Source Category"] = pd.Categorical(
    df_sources["Source Category"],
    categories=sourcecategory_order,
    ordered=True
)

df_sources = df_sources.sort_values(by="Source Category")

In [None]:
base = alt.Chart(
    df_sources
).mark_bar().encode(
    x=alt.X(
        "Source Category:N",
        title="Source Category",
        axis=alt.Axis(labelAngle=-30),
        sort=sourcecategory_order
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "License Type:N",
        scale=alt.Scale(
            domain=LICENSE_ORDER,
            range=LICENSE_PALETTE
        ),
        title="License Type"
    )
).properties(
    width=800,
    height=100
)

text = alt.Chart(df_sources).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Source Category:N",
        title="Source Category",
        sort=sourcecategory_order
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
        labelFontSize=FONT_SIZE,
        titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    labelLimit=MAX_LABELLIMIT
).configure_header(
    titleFontSize=FONT_SIZE,
    labelFontSize=FONT_SIZE
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_sourcecategories-licenses.png"),
        ppi=PLOT_PPI
    )

chart

## Sources by Language Family

In [None]:
INCLUDE_TOP_N_CATEGORIES = 6 # Number of top categories to include, rest will be grouped as "Other"

# # Further unlist the categories of sources
df_speechlanguagessources = reduce_categories_to_topk(
    df_speechlanguages.explode("Source Category"),
    "Source Category",
    INCLUDE_TOP_N_CATEGORIES
)

In [None]:
# Stacked, normalized bars: share of each source category per language family.
# NOTE(review): `languagefamily_order` is not assigned anywhere in the visible
# part of this notebook, and "Language Families" is not among the columns
# printed by the `df_speechlanguages.keys()` cell -- on a fresh kernel this cell
# presumably fails; confirm where both are supposed to come from.
base = alt.Chart(
    df_speechlanguagessources
).mark_bar().encode(
    x=alt.X(
        "Language Families:N",
        title="Language Family",
        sort=languagefamily_order,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Source Category:N",
        title="Source Category"
    )
).properties(
    width=600,
    height=100
)

text = alt.Chart(df_speechlanguagessources).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Language Families:N",
        title="Language Family",
        sort=languagefamily_order
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_languagefamilies-sources.png"),
        ppi=PLOT_PPI
    )

chart

## Hours by Language Family

In [None]:
# Total hours per language family, plus the family order (largest first) for the chart.
df_speechlanguageshours = df_speechlanguages.copy()

# We filter out the large-scale dataset YODAS, which has a large number of hours and a large number of languages
# Since we don't subdivide by language, it would skew results
df_speechlanguageshours = df_speechlanguageshours[df_speechlanguageshours["Unique Dataset Identifier"] != "yodas"]

# NOTE(review): neither "Language Families" nor "Hours" appears among the columns
# printed by the `df_speechlanguages.keys()` cell (the hours column there is
# "Video Hours") -- confirm the column names for the currently loaded data.
df_speechlanguageshours = df_speechlanguageshours.groupby("Language Families")["Hours"].sum().reset_index(name="Total Hours")
df_speechlanguageshours = df_speechlanguageshours.sort_values(by="Total Hours")
# Sorted ascending above, so the reversed list runs from most to fewest hours.
languagefamily_hourorder = df_speechlanguageshours["Language Families"][::-1].tolist()

In [None]:
alt.Chart(df_speechlanguageshours).mark_bar().encode(
    x=alt.X(
        "Language Families:N",
        title="Language Family",
        sort=languagefamily_hourorder,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "Total Hours:Q",
        # Hours of audio from datasets with each language family represented
        # Within such datasets, we may not have specific hours for language families
        title="Represented Hours"
    )
).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).properties(
    width=600,
    height=200
)

## Gini Coefficient Across Languages by (Cumulative) Total Hours

In [None]:
# Preprocess for year labels and order
df_speechlanguagesn = df_speechlanguages.copy()

# Subdivide hours evenly across the languages given in each dataset
df_speechlanguagesn["Hours"] = df_speechlanguagesn["Hours"] / df_speechlanguagesn["Languages"].apply(len)
df_speechlanguageshours = df_speechlanguagesn.explode("Languages")

df_speechlanguageshours = df_speechlanguageshours.sort_values(by="Year Released")

In [None]:
# Gini coefficient for hours across languages
speechlanguages_totalhours = df_speechlanguageshours.explode("Languages").groupby("Languages")["Hours"].sum().reset_index(name="Total Hours")

gini(speechlanguages_totalhours["Total Hours"].values)

In [None]:
# Get the cumulative hours by language over time
df_speechlanguagescumulativehours = df_speechlanguageshours.groupby(
    ["Year Released", "Languages"]
)["Hours"].sum().groupby(
    "Languages"
).cumsum().reset_index(name="Cumulative Hours")

# Calculate Gini coefficient for cumulative hours by language
df_speechlanguagescumulativehoursgini = df_speechlanguagescumulativehours.groupby(
    "Year Released"
).apply(
    lambda x: gini(x["Cumulative Hours"].values)
).reset_index(name="Gini Coefficient")

In [None]:
chart = alt.Chart(
    df_speechlanguagescumulativehoursgini
).mark_line().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "Gini Coefficient:Q",
        title="Gini (Cumulative)"
    )
).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).properties(
    width=600,
    height=200
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_languages-giniyears.png"),
        ppi=PLOT_PPI
    )

chart

## Source Category by Year

In [None]:
INCLUDE_TOP_N_CATEGORIES = 6
df_speechsourceyears = df_speech.explode("Source Category")
df_speechsourceyears = reduce_categories_to_topk(df_speechsourceyears, "Source Category", INCLUDE_TOP_N_CATEGORIES)

df_speechsourceyears = df_speechsourceyears.sort_values(by="Year Released")

In [None]:
base = alt.Chart(
    df_speechsourceyears
).mark_bar().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Source Category:N",
        title="Source Category"
    )
).properties(
    width=600,
    height=100
)

text = alt.Chart(df_speechsourceyears).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)


if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_sourcecategories-years.png"),
        ppi=PLOT_PPI
    )

chart

## Total Hours by Source Category (Cumulative)

In [None]:
INCLUDE_TOP_N_CATEGORIES = 6

df_speechsourceyears = df_speech.explode("Source Category")
df_speechsourceyears = reduce_categories_to_topk(df_speechsourceyears, "Source Category", INCLUDE_TOP_N_CATEGORIES)

df_speechsourceyearscumulativehours = df_speechsourceyears.groupby(
    ["Year Released", "Source Category"]
)["Hours"].sum().groupby(
    "Source Category"
).cumsum().reset_index(name="Cumulative Hours")

df_speechsourceyearscumulativehours = df_speechsourceyearscumulativehours.sort_values(by="Year Released")

In [None]:
chart = alt.Chart(
    df_speechsourceyearscumulativehours
).mark_line().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "Cumulative Hours:Q",
        title="Cumulative Hours",
        scale=alt.Scale(type="symlog")
    ),
    color=alt.Color(
        "Source Category:N",
        title="Source Category"
    )
).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    symbolStrokeWidth=4,
    labelLimit=MAX_LABELLIMIT
).properties(
    width=600,
    height=200
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_sourcecategories-cumulativehours.png"),
        ppi=PLOT_PPI
    )

chart

## Source Category (YouTube or Other) by License Type

In [None]:
# By count
df_counts_by_license_source = df_speech.explode("Source").groupby(["License Type", "Source"]).size().reset_index(name="Count")
df_counts_by_license_source = df_counts_by_license_source.sort_values(by="Count")
df_counts_by_license_source["YouTube"] = df_counts_by_license_source["Source"].map(
    lambda x: "YouTube" if "youtube" in x.lower() else "Other"
)

# By hours
df_hours_by_license_source = df_speech.explode("Source").groupby(["License Type", "Source"])["Hours"].sum().reset_index(name="Total Hours")
df_hours_by_license_source = df_hours_by_license_source.sort_values(by="Total Hours")
df_hours_by_license_source["YouTube"] = df_hours_by_license_source["Source"].map(
    lambda x: "YouTube" if "youtube" in x.lower() else "Other"
)

In [None]:
chart_bycount = alt.Chart(df_counts_by_license_source).mark_bar().encode(
    x=alt.X(
        "Count:Q",
        title="Count"
    ),
    y=alt.Y(
        "License Type:N",
        title="",
        sort=LICENSE_ORDER,
        axis=alt.Axis(labelLimit=MAX_LABELLIMIT)
    ),
    color=alt.Color(
        "YouTube:N",
        title="Source"
    )
).properties(
    width=800,
    height=100
)

chart_byhour = alt.Chart(df_hours_by_license_source).mark_bar().encode(
    x=alt.X(
        "Total Hours:Q",
        title="Total Hours"
    ),
    y=alt.Y(
        "License Type:N",
        title="License Type",
        sort=LICENSE_ORDER,
        axis=alt.Axis(labelLimit=MAX_LABELLIMIT)
    ),
    color=alt.Color(
        "YouTube:N",
        title="Source"
    )
).properties(
    width=800,
    height=100
)

chart = alt.vconcat(chart_bycount, chart_byhour).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    labelLimit=MAX_LABELLIMIT
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_license-source.png"),
        ppi=PLOT_PPI
    )

chart

## Creator Categories by Year

Note: we use the original annotations here instead of the DPI constants, for a different view.

In [None]:
# One row per (dataset, creator category), sorted by release year.
# NOTE(review): "Creator Categories" is not among the columns printed by the
# `df_speechlanguages.keys()` cell (which lists "Creators") -- confirm the column name.
df_speechcategoriesyears = df_speech.explode("Creator Categories")
df_speechcategoriesyears = df_speechcategoriesyears.sort_values(by="Year Released")

In [None]:
base = alt.Chart(
    df_speechcategoriesyears
).mark_bar().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Creator Categories:N",
        title="Creator Category"
    )
).properties(
    width=600,
    height=100
)

# Row count per year, drawn as text over the bars. It must be built from the
# same frame as `base` (`df_speechcategoriesyears`); counting rows of the
# source-category frame would label the bars with totals of a different explode.
text = alt.Chart(df_speechcategoriesyears).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_categories-years.png"),
        ppi=PLOT_PPI
    )

chart

## Table of License Type

In [None]:
# Number and percentage of datasets per license type.
licensetype_counts = df_speech["License Type"].value_counts()
df_licensetypes = pd.concat([
    licensetype_counts,
    # Scale to percent before rounding: rounding the fraction first and then
    # multiplying by 100 leaves float artifacts such as 33.330000000000005.
    (licensetype_counts / licensetype_counts.sum() * 100).round(2)
], axis=1)

df_licensetypes.columns = ["Count", "Pct."]

df_licensetypes

## Tables of YouTube Dataset LicenseTypes

In [None]:
# By count
df_speech["YouTube"] = df_speech["Source"].map(
    lambda x: "YouTube" if any("youtube" in xi.lower() for xi in x) else "Other"
)

df_youtube = df_speech.groupby(["License Type", "YouTube"]).size().reset_index(name="Count")
df_youtube = df_youtube.sort_values(by="Count")
df_youtube["Pct."] = df_youtube.groupby("License Type")["Count"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_youtube

In [None]:
# By hours
df_youtubehours = df_speech.groupby(["License Type", "YouTube"])["Hours"].sum().reset_index(name="Total Hours")
df_youtubehours = df_youtubehours.sort_values(by="Total Hours")
df_youtubehours["Pct."] = df_youtubehours.groupby("License Type")["Total Hours"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_youtubehours